"""Scrape book listings from Douban book tags and write them to a CSV file.

For each tag listed on https://book.douban.com/tag/ this script fetches the
first two result pages (20 books each, ``start=0`` and ``start=20``) and
extracts title / author / publication date / price / rating / summary,
accumulating everything into one CSV file.

NOTE(review): no proxy/IP pool is used yet -- Douban rate-limits
aggressively, so the random sleep below is the only throttling.
TODO: add an IP proxy pool (not implemented).
"""
import os  # imported but unused; presumably reserved for planned file handling
import re
import csv
import random
import time

import requests
from bs4 import BeautifulSoup

books = []  # accumulated rows, one dict per scraped book

# --- Fetch the tag index page and collect category names and links ----------
base_url = 'https://book.douban.com/tag/'
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36')
}
response = requests.get(base_url, headers=headers)
response.encoding = 'utf-8'  # force decode; Douban pages are UTF-8

# Only the first table row of each category group is sampled, 2 pages per
# tag -- roughly 480 books in total (per the original author's note).
soup = BeautifulSoup(response.text, 'html.parser')
tag_anchors = soup.select(
    '#content > div > div.article > div:nth-child(2) > div > table > tbody '
    '> tr:nth-child(1) > td > a')
categories = []  # tag display names
links = []       # absolute tag URLs
for anchor in tag_anchors:
    categories.append(anchor.string)
    links.append('https://book.douban.com' + anchor['href'])

# --- Scrape the first two listing pages of every tag ------------------------
for href in links:
    for page in range(0, 40, 20):
        params = {
            'start': page,
            'type': 'T',  # 'T' = sort by popularity (default listing order)
        }
        headers2 = {
            'Referer': 'https://www.baidu.com/',
            'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/69.0.3497.100 Safari/537.36'),
        }
        listing = requests.get(href, params=params, headers=headers2)
        listing.encoding = 'utf-8'
        soup2 = BeautifulSoup(listing.text, 'html.parser')
        names = soup2.select('#subject_list > ul > li > div.info > h2 > a')
        details = soup2.select('#subject_list > ul > li > div.info > div.pub')
        scores = soup2.select('#subject_list > ul > li > div.info '
                              '> div.star.clearfix > span.rating_nums')
        briefs = soup2.select('#subject_list > ul > li > div.info > p')

        for name, detail, score, brief in zip(names, details, scores, briefs):
            try:
                book = {}
                # Collapse all internal whitespace in the title.
                title = ''.join(name.get_text().strip().split())
                # div.pub text looks like "author / publisher / date / price".
                parts = detail.get_text().split('/')
                author = parts[0].strip()
                pub_time = parts[-2].strip()
                price_raw = parts[-1].strip()
                # Keep only the leading numeric portion of the price string;
                # raises IndexError (caught below) when no number is present.
                price = re.findall(r'(\d+\.?\d{0,3}).*', price_raw)[0]
                book['title'] = title
                book['author'] = author
                book['pub_time'] = pub_time
                book['price'] = price
                book['rating'] = score.get_text()
                book['brief'] = brief.get_text()
                books.append(book)
                time.sleep(random.random() * 3)  # random pause, dodge rate limiting
                print(title)
            except IndexError as e:
                # Some entries lack pub/price fields; skip them but keep going.
                print('IndexError:', e)
            finally:
                print('finally')

# --- Dump everything to CSV --------------------------------------------------
# newline='' prevents blank rows on Windows; explicit utf-8 keeps Chinese text
# intact. Fieldnames now match the per-book dict keys exactly (the original
# header list and writerow keys disagreed, which DictWriter rejects).
# NOTE(review): filename typo "flieName" kept from the original output path.
with open('flieName.csv', 'w', newline='', encoding='utf-8',
          errors='ignore') as csv_file:
    fieldnames = ['title', 'author', 'pub_time', 'price', 'rating', 'brief']
    writer = csv.DictWriter(csv_file, fieldnames)
    writer.writeheader()
    for book in books:
        writer.writerow(book)