First, we import the third-party libraries we need.
Looking at the page source of Maoyan (Cat's Eye) Top-100 movie board, you can see it is very regular, for example:
Or also:
Based on this regular structure, we can write a non-greedy regular expression:
"""<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>"""
Next, observe how the page URL changes from page to page:
This is the URL of the first page: https://maoyan.com/board/4?offset=0
This is the second page of the web site: https://maoyan.com/board/4?offset=10
This is the URL of the third page: https://maoyan.com/board/4?offset=20
So the offset varies with the current page number n as (n-1) * 10.
The complete crawler code follows:
"""Crawl the Maoyan (Cat's Eye) Top-100 movie board and save it to an Excel file."""
from urllib import request
import random
import time
import csv
import re

import xlwt


class catEyesMovie:
    """Scraper for https://maoyan.com/board/4 (Maoyan Top-100 movies)."""

    def __init__(self):
        # {} is filled with the page offset: 0, 10, 20, ... (see page-URL pattern).
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Pool of User-Agent strings; one is chosen at random per request.
        self.ua_list = [
            ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 '
             '(KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'),
        ]
        # Next worksheet row to write to (row 0 is reserved for the header).
        # BUGFIX: original garbled line `self.line =. 1` -> `self.line = 1`.
        self.line = 1

    def get_page(self, url):
        """Fetch *url* with a random User-Agent and return the decoded HTML.

        Invalid UTF-8 byte sequences are dropped ('ignore') rather than
        raising UnicodeDecodeError.
        """
        headers = {'User-Agent': random.choice(self.ua_list)}
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8', 'ignore')
        return html

    def clean_page(self, html, xwlt):
        """Extract (title, star, releasetime) tuples from *html* and write them.

        re.S makes '.' match newlines so the non-greedy groups can span
        multiple source lines.
        """
        pattern = re.compile(
            r'<div class="movie-item-info">.*?title="(.*?)"'
            r'.*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',
            re.S)
        r_list = pattern.findall(html)
        self.write_page(r_list, xwlt)

    def write_page(self, r_list, xwlt):
        """Append each (title, star, releasetime) tuple as one worksheet row.

        BUGFIX: removed unused local `one_film_dict = {}` from the original.
        """
        for rt in r_list:
            xwlt.write(self.line, 0, rt[0].strip())
            xwlt.write(self.line, 1, rt[1].strip())
            xwlt.write(self.line, 2, rt[2].strip())
            self.line += 1

    def main(self, xwlt):
        """Crawl all 10 pages of the board into worksheet *xwlt*."""
        for i in range(1, 11):
            # Page i maps to offset (i-1)*10.
            # BUGFIX: original computed (1-i)*10, which yields negative offsets
            # and contradicts the (n-1)*10 formula stated in the article.
            offset = (i - 1) * 10
            url = self.url.format(offset)
            html = self.get_page(url)
            self.clean_page(html, xwlt)


if __name__ == '__main__':
    start = time.time()
    spider = catEyesMovie()
    # Create the xlwt workbook and the sheet that will receive the rows.
    book = xlwt.Workbook(encoding='utf-8')
    # cell_overwrite_ok=True allows a cell to be written more than once.
    sheet1 = book.add_sheet(u'sheet1', cell_overwrite_ok=True)
    # Header row.
    sheet1.write(0, 0, 'movie title')
    sheet1.write(0, 1, 'starring')
    sheet1.write(0, 2, 'release date')
    spider.main(sheet1)
    # BUGFIX: original had mismatched quotes: book.save(" D: \\ write.xls ')
    book.save('D:\\write.xls')
    end = time.time()
    # BUGFIX: original printed (start - end), a negative duration.
    print('execution time: %.2f' % (end - start))
Each crawled record has the following structure:
1 {'Movie name': 'Fast Five', 'Starring': 'Fast Five', 'Release time': 'Fast Five'}
2 {'Movie name': 'Tame Dragon', 'Starring': 'Tame Dragon', 'Release time': 'Tame Dragon'}
3 {'Movie name': 'Braveheart', 'Starring': 'Braveheart', 'Release time': 'Braveheart'}
4 {'Movie name': 'Scent of a Woman', 'Starring': 'Scent of a Woman', 'Release time': 'Scent of a Woman'}
5 {'Movie name': 'Despicable Me', 'Starring': 'Despicable Me', 'Release time': 'Despicable Me'}
The completed Excel spreadsheet looks like this:
For learning purposes only!