Crawling the Maoyan (Cat's Eye) Top 100 movie list with Python and saving it to an Excel spreadsheet

First, import the required libraries (including the third-party ones) up front:

 

 

Looking at the page source of the Maoyan Top 100 list, the markup for each movie follows a very regular pattern, for example:

 

 

 

Or also:

 

 

From this pattern we can derive a non-greedy regular expression:

"""<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>"""

Next, observe how the page address (URL) changes from one page to the next:

This is the URL of the first page:  https://maoyan.com/board/4?offset=0

This is the URL of the second page:  https://maoyan.com/board/4?offset=10

This is the URL of the third page:  https://maoyan.com/board/4?offset=20

The offset is therefore (current page - 1) * 10; that is, for page N the offset is (N-1) * 10.

The complete crawler follows:

 1 from urllib import request
 2 import random
 3 import time
 4 import csv
 5 import re
 6 import xlwt
 7 
 8 class catEyesMovie:
 9 
10     def __init__(self):
11         self.url = 'https://maoyan.com/board/4?offset={}'
12         self.ua_list = [
13             'Win7: the Mozilla / 5.0 (the Windows NT 6.1; the WOW64) AppleWebKit / 535.1 (KHTML, like the Gecko) the Chrome / 14.0.835.163 Safari / 535.1 ' 
14          ]
 15          self.line =. 1 ;
 16              # ' - Agent-the User ':' win7: Mozilla / 5.0 (Windows NT 6.1; WOW64) AppleWebKit / 535.1 (KHTML, like Gecko) Chrome / 14.0.835.163 Safari / 535.1 ' 
17  
18  
19      # acquire web content 
20      DEF get_page (Self, url):
 21          # using a random user -agent 
22 is          headers = { ' the User-- Agent ' : The random.choice (self.ua_list)}
 23 is          # Create request object 
24         = request.Request REQ (URL = URL, headers = headers)
 25          # sends a request 
26 is          RES = request.urlopen (REQ)
 27          # obtain the appropriate target 
28          HTML res.read = (). decode ( ' UTF-. 8 ' , ' the ignore ' )
 29          return HTML
 30  
31 is      # cleaning data 
32      DEF clean_page (Self, HTML, xwlt):
 33 is          pattern the re.compile = ( "" " <div class =" Movie-Item-info "> title = *.?" (. *?) ". * ? class =" star "> (. *?) </ p>. *? releasetime"> (. *?) </ p>""", re.S)
34         r_list = pattern.findall(html)
35         self.write_page(r_list,xwlt)
36 
37 
38     #保存内容
39     def write_page(self,r_list,xwlt):
40         one_film_dict = {}
41         for rt in r_list:
42             xwlt.write(self.line,0,rt[0].strip())
43             xwlt.write(self.line,1,rt[1].strip())
44             xwlt.write(self.line,2,rt[2].strip())
45             self.line+=1
46 
47 
48 
49     def main(self,xwlt):
50          # With the string 
51 is          RES = []
 52 is          for I in Range (1,11 ):
 53 is              # splicing the url {} https://maoyan.com/board/4?offset= 
54 is              # obtain Pages 
55              offset = (. 1-I) * 10
 56 is              URL = self.url.format (offset)
 57 is              HTML = self.get_page (URL)
 58              self.clean_page (HTML, xwlt)
 59  
60  
61 is  
62 is  
63 is  
if __name__ == '__main__':
    # Script entry point: crawl the board and save the result as an .xls file.
    start = time.time()
    spider = catEyesMovie()
    # Create the xlwt workbook.
    book = xlwt.Workbook(encoding='utf-8')
    # Create the sheet named 'Sheet1'; cell_overwrite_ok=True allows a cell
    # to be written more than once without xlwt raising an error.
    sheet1 = book.add_sheet(u'Sheet1', cell_overwrite_ok=True)
    # Header row (row 0); the spider starts writing data at row 1.
    sheet1.write(0, 0, 'movie title')
    sheet1.write(0, 1, 'starring')
    sheet1.write(0, 2, 'Release date')
    # Crawl all pages into the sheet.
    spider.main(sheet1)
    book.save("D:\\write.xls")
    end = time.time()
    # Elapsed time is end - start, so the reported duration is positive.
    print('execution time is: %.2f' % (end - start))

Structure of each crawled record:

1 { ' Movie Name ' : ' Fast Five ' , ' starring ' : ' Fast Five ' , ' Release time ' : ' Fast Five ' }
 2 { ' Movie Name ' : ' tame Dragon ' , ' starring ' : ' tame dragon ' , ' release time ' : ' tame dragon ' }
 3 { 'Movie Name ': ' Braveheart ' , ' starring ' : ' Braveheart ' , ' Release time ' : ' Braveheart ' }
 4 { ' Movie Name ' : ' Scent ' , ' starring ' : ' smells knowledge woman ' , ' release time ' : ' Scent ' }
 5 { ' movie name ' : 'Despicable Me ' ,' Starring ' : ' Despicable Me ' , ' Release time ' : ' Despicable Me ' }

The finished Excel sheet looks like this:

 

 

 

 

        For learning purposes only!

 

Guess you like

Origin www.cnblogs.com/gongdada/p/11718473.html