Getting started with Python web crawlers: scraping Qiushibaike web data using regular expressions.
The detailed code is as follows:
#!/usr/bin/env python
# Author: Simple-Sir
# Time: 2019/8/1 14:50
# Crawl joke (text) page data from qiushibaike.com using regular expressions.
import requests
import re

# Site root, prepended to the relative /article/... links found on listing pages.
urlHead = 'https://www.qiushibaike.com'


def getHtml(url):
    """Fetch *url* and return the response body as text.

    A browser User-Agent is sent because the site rejects the default
    python-requests agent string.
    """
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/75.0.3770.100 Safari/537.36')
    }
    respons = requests.get(url, headers=headers)
    html = respons.text
    return html


def getInfos(url):
    """Scrape one listing page and return a list of dicts.

    Each dict has keys 'author', 'sex', 'level', 'content'.  The joke text
    itself is fetched from each item's detail page, because the listing page
    truncates long posts.
    """
    html = getHtml(url)
    # Author names sit alone on the line between <h2> tags.
    authors = re.findall(r'<h2>\n(.*?)\n</h2>', html, re.DOTALL)
    # Gender and level share one div: class="articleGender womenIcon">25</div>
    author_sex_lvl = re.findall(
        r'<div class="articleGender (.*?)Icon">(\d*?)</div>', html, re.DOTALL)
    author_sex = []   # gender
    author_lvl = []   # level
    for i in author_sex_lvl:
        author_sex.append(i[0])
        author_lvl.append(i[1])
    # Detail-page hrefs; [1:] skips the first match, which is not a post
    # (presumably a pinned/ad entry — confirm against the live page).
    contentHerf = re.findall(
        r'<a href="(/article.*?)".*?class="contentHerf"', html, re.DOTALL)[1:]
    cont = []  # joke texts
    for contentUrl in contentHerf:
        contentHerf_all = urlHead + contentUrl
        contentHtml = getHtml(contentHerf_all)  # detail-page html
        contents = re.findall(r'<div class="content">(.*?)</div>',
                              contentHtml, re.DOTALL)
        content_br = re.sub(r'<br/>', '', contents[0])  # strip <br/> tags
        content = re.sub(r'\xa0', '', content_br)       # strip &nbsp; chars
        cont.append(content)
    infos = []
    for i in zip(authors, author_sex, author_lvl, cont):
        author, sex, lvl, text = i
        info = {
            'author': author,
            'sex': sex,
            'level': lvl,
            'content': text,
        }
        infos.append(info)
    return infos


def main():
    """Ask how many pages to crawl, then print every scraped item."""
    page = int(input('How many pages of data do you want to get?\n'))
    for i in range(1, page + 1):
        url = 'https://www.qiushibaike.com/text/page/{}'.format(i)
        print('Crawling data of page {}:'.format(i))
        for t in getInfos(url):
            print(t)
        print('Page {} crawled successfully.'.format(i))
    print('All data has been crawled!')


if __name__ == '__main__':
    main()
The results are as follows: