def split_by_delimiters(delimiter_list, target_string, split_num=0):
    u"""Split *target_string* on any of the strings in *delimiter_list*.

    The delimiters are joined into one regex alternation and the split is
    done with ``re.split``.

    :param delimiter_list: list of literal delimiter strings
    :param target_string: string to split
    :param split_num: maximum number of splits; 0 means "no limit"
    :return: the list of fields when at least one split occurred, otherwise
             the original *target_string* unchanged (callers depend on this
             string fallback, so it is kept).
    """
    # Nothing to split on -> behave like "no split occurred".
    if not delimiter_list:
        return target_string

    # re.escape() each delimiter so regex metacharacters are matched
    # literally, and '|'.join() avoids the trailing '|' (empty alternative)
    # that the original concatenation left in the pattern — the old
    # .rstrip('|') call discarded its result.
    delimiter_str = '|'.join(re.escape(pattern) for pattern in delimiter_list)

    # re.split's maxsplit=0 already means "unlimited", so one call suffices.
    splited_str_list = re.split(delimiter_str, target_string, split_num)

    if len(splited_str_list) >= 2:
        return splited_str_list
    return target_string
これを使用して、論文情報の文字列から著者情報と題目情報を抽出する関数を作った。
# UTF-8 byte sequences for the full-width colon/comma, plus their ASCII forms.
COLON_DLIMITERS = ["\xef\xbc\x9a", ":"]
COMMA_DELIMITERS = ["\xef\xbc\x8c", ","]


def get_scholar_json_model(scholr_info_str):
    u"""Return json model for scholar information

    Parses one scholar-information string of the form
    "authors : title , ..." and returns a dict
    {"authors": [{"name": ...}, ...], "title": ...}, or -1 when the
    string cannot be parsed.
    """
    # Work on the UTF-8 byte representation so the multi-byte full-width
    # delimiters above can match.
    scholr_info_str = scholr_info_str.encode('utf-8')

    # Too short to hold both an author list and a title.
    if len(scholr_info_str) < 15:
        return -1

    # Split "authors : everything else" on the first colon only.
    author_others_info = split_by_delimiters(COLON_DLIMITERS, scholr_info_str, 1)
    if len(author_others_info) == 1:
        return -1

    # The title is everything up to the first comma of the remainder.
    title_others_info = split_by_delimiters(COMMA_DELIMITERS, author_others_info[1])

    # Authors are comma-separated in the first field.
    authors = split_by_delimiters(COMMA_DELIMITERS, author_others_info[0])
    authors_model = [{"name": author.strip()} for author in authors]

    title = title_others_info[0]
    return {"authors": authors_model, "title": title.strip()}
# -*- coding: utf-8 -*-
import re
from collections import namedtuple

import requests
from bs4 import BeautifulSoup

AuthInfo = namedtuple('AuthInfo', 'user_name, pass_word')


def get_html_from(url, auth_info):
    u"""Get html in url

    Fetch *url* with HTTP basic authentication and return the page body
    as UTF-8 encoded bytes.
    """
    user_name = auth_info.user_name
    pass_word = auth_info.pass_word
    pages_data = requests.get(url, auth=(user_name, pass_word))
    # Let requests guess the page's real encoding before re-encoding.
    pages_data.encoding = pages_data.apparent_encoding
    return pages_data.text.encode('utf-8')


# UTF-8 byte sequences for the full-width colon/comma, plus ASCII forms.
COLON_DLIMITERS = ["\xef\xbc\x9a", ":"]
COMMA_DELIMITERS = ["\xef\xbc\x8c", ","]


def get_scholar_json_model(scholar_info_str):
    u"""Return json model for scholar information

    Parses one "authors : title , ..." string into
    {"authors": [{"name": ...}, ...], "title": ...}, or returns -1 when
    the string cannot be parsed.
    """
    # Work on UTF-8 bytes so the multi-byte full-width delimiters match.
    scholar_info_str = scholar_info_str.encode('utf-8')

    # Too short to hold both an author list and a title.
    if len(scholar_info_str) < 15:
        return -1

    # Split the string into authors and the other information
    # (first colon only).
    author_others_info = split_by_delimiters(COLON_DLIMITERS, scholar_info_str, 1)
    if len(author_others_info) == 1:
        return -1

    # Split the remainder into the scholar's title and other information.
    title_others_info = split_by_delimiters(COMMA_DELIMITERS, author_others_info[1])

    # Create the JSON model.
    authors = split_by_delimiters(COMMA_DELIMITERS, author_others_info[0])
    authors_model = [{"name": author.strip()} for author in authors]

    title = title_others_info[0]
    scholar_model = {"authors": authors_model, "title": title.strip()}
    return scholar_model


def split_by_delimiters(delimiter_list, target_string, split_num=0):
    u"""Return a list of the words, using words in delimiter_list as the
    delimiter strings.

    :param delimiter_list: list of literal delimiter strings
    :param target_string: string to split
    :param split_num: maximum number of splits; 0 means "no limit"
    :return: the list of fields when at least one split occurred,
             otherwise the original *target_string* unchanged.
    """
    if not delimiter_list:
        return target_string

    # re.escape() keeps regex metacharacters literal; '|'.join() avoids the
    # trailing '|' (empty alternative) the original concatenation produced
    # — the old .rstrip('|') call discarded its result.
    delimiter_str = '|'.join(re.escape(pattern) for pattern in delimiter_list)

    # re.split's maxsplit=0 already means "unlimited".
    splited_str_list = re.split(delimiter_str, target_string, split_num)

    if len(splited_str_list) >= 2:
        return splited_str_list
    return target_string


def get_scholars_json_model(html):
    u"""Return JSON model for scholars information

    Scrapes the first <table> in *html* and returns a list of scholar
    models (one dict per parseable cell).
    """
    soup = BeautifulSoup(html, "lxml")
    table = soup.find_all('table')[0]
    results = []
    for tr_tag in table.find_all('tr'):
        scholar_strings = []
        for data in tr_tag.find_all('td'):
            # Only cells carrying an explicit "align" attribute hold the
            # scholar text in this page layout.
            if data.has_attr("align"):
                scholar_strings.append(data.get_text())
        for scholar_string in scholar_strings:
            info = get_scholar_json_model(scholar_string)
            if info != -1:
                results.append(info)
    return results


if __name__ == "__main__":
    URL = 'http://...'  # TODO: set the real page URL
    AUTH_INFO = AuthInfo(user_name='your user name', pass_word='your password')
    HTML = get_html_from(URL, AUTH_INFO)
    SCHOLARS = get_scholars_json_model(HTML)
    for scholar in SCHOLARS:
        print(scholar["title"])
        for author in scholar["authors"]:
            print(author["name"])