Python learning notes 1: scraping Douban book information and saving it to a CSV file

# Scrape book listings from Douban tag pages and save them to a CSV file.

import csv
import os  # intended for creating per-category folders; not implemented yet
import random
import re
import time

import requests
from bs4 import BeautifulSoup

# CSV header. Every scraped book is stored as a dict with exactly these keys,
# so csv.DictWriter can write the dicts without any re-mapping.
FIELDNAMES = ['Title', 'author', 'time to market', 'price', 'Books score', 'Description']

# Keeps only the numeric part of a price string. A decimal point is required,
# so a price written without one yields no match.
PRICE_PATTERN = re.compile(r'(\d+\.\d{0,3}).*')

# Seconds to wait for a response before requests raises a timeout error;
# without a timeout a stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 10


def text_of(tag):
    """Return the text of *tag*, or an empty string when *tag* is None."""
    return tag.get_text() if tag is not None else ''


def parse_book(info):
    """Build one CSV row from a book's ``div.info`` element.

    Every field is looked up inside *info* itself, so an entry that lacks a
    rating or a description cannot shift the fields of the entries after it.

    Args:
        info: the ``div.info`` tag of one entry in ``#subject_list``.

    Returns:
        A dict keyed by ``FIELDNAMES``, or ``None`` when the entry has no
        title link or no publication line.

    Raises:
        IndexError: if the publication line has fewer than two
            '/'-separated fields, or its last field contains no decimal
            number.
    """
    title_tag = info.select_one(':scope > h2 > a')
    pub_tag = info.select_one(':scope > div.pub')
    if title_tag is None or pub_tag is None:
        return None

    # The publication line is '/'-separated: the author comes first, the
    # publication date second to last and the price last.
    detail = pub_tag.get_text().split('/')
    author = detail[0].strip()
    pubtime = detail[-2].strip()
    price = PRICE_PATTERN.findall(detail[-1].strip())[0]  # keep the number only

    return {
        'Title': ''.join(title_tag.get_text().split()),  # remove all whitespace
        'author': author,
        'time to market': pubtime,
        'price': price,
        'Books score': text_of(
            info.select_one(':scope > div.star.clearfix > span.rating_nums')),
        'Description': text_of(info.select_one(':scope > p')),
    }


def save_books(rows, path=r'flieName.csv'):
    """Write *rows* (dicts keyed by ``FIELDNAMES``) to *path* as CSV.

    The file is opened with ``newline=''`` as the csv module requires;
    otherwise an extra blank line follows every row on Windows.
    ``errors='ignore'`` drops characters that the platform's default
    encoding cannot represent instead of raising.
    """
    with open(path, 'w', newline='', errors='ignore') as csv_file:
        writer = csv.DictWriter(csv_file, FIELDNAMES)
        writer.writeheader()
        writer.writerows(rows)


books = []
ourl = 'https://book.douban.com/tag/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
# Headers for the listing pages; identical for every request, so built once.
headers2 = {
    'Referer': 'https://www.baidu.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

response = requests.get(ourl, headers=headers, timeout=REQUEST_TIMEOUT)
response.encoding = 'utf-8'

# Locate the category links. To keep test runs small only the first table row
# of tags is selected, and only two pages are fetched per tag below.
# NOTE(review): the selector needs a literal <tbody> in the served HTML;
# html.parser does not add one on its own - confirm against the live page.
soup = BeautifulSoup(response.text, 'html.parser')
dw_ = soup.select('#content > div > div.article > div:nth-child(2) > div > table > tbody > tr:nth-child(1) > td > a')
leimu = [dw.string for dw in dw_]  # category names
link = ['https://book.douban.com' + dw['href'] for dw in dw_]  # category links

try:
    for href_ in link:  # first pages of each category
        for page in range(0, 40, 20):  # 20 books per page
            data = {
                'start': page,
                'type': 'T'
            }
            html2 = requests.get(href_, params=data, headers=headers2,
                                 timeout=REQUEST_TIMEOUT)
            html2.encoding = 'utf-8'
            soup2 = BeautifulSoup(html2.text, 'html.parser')
            # The tag name could also be read from the page's '#content > h1'
            # heading, but the names are already collected in `leimu`.
            for info in soup2.select('#subject_list > ul > li > div.info'):
                try:
                    dict_book = parse_book(info)
                except IndexError as e:
                    # Unexpected publication line: report it, skip the book.
                    print('IndexError:', e)
                    continue
                if dict_book is None:
                    continue
                books.append(dict_book)
                time.sleep(random.random() * 3)  # random sleep
                print(dict_book['Title'])
finally:
    # Written once, and in `finally` so that the books collected so far are
    # still saved when the crawl is interrupted or a request fails.
    save_books(books)

To do: add an IP proxy pool (not implemented yet).

Guess you like

Origin www.cnblogs.com/yueyuecong/p/11491085.html