# Address
Target site: Movie Paradise (dytt8.net) — "2019 new films" listing pages
# target
movie name, download link
# Analysis
********* Level-1 page (listing) *********** 1. film name 2. film detail-page link ********* Level-2 page (detail) *********** 1. download link
Implementation steps
1. Check whether the response actually contains the data we need to scrape
2. Find the URL pattern
Page 1: https://www.dytt8.net/html/gndy/dyzz/list_23_1.html  Page 2: https://www.dytt8.net/html/gndy/dyzz/list_23_2.html  Page n: https://www.dytt8.net/html/gndy/dyzz/list_23_n.html
3, write regular expressions
Level-1 page regex: &lt;table width="100%".*?&lt;td width="5%".*?&lt;a href="(.*?)".*?ulink"&gt;(.*?)&lt;/a&gt;.*?&lt;/table&gt;  Level-2 page regex: &lt;td style="WORD-WRAP.*?&gt;.*?&gt;(.*?)&lt;/a&gt;
4, code implementation
from urllib import request import re from useragents import ua_list import time import random class FilmSkySpider (Object): DEF __init__ (Self): # a page url address self.url = ' https://www.dytt8.net/html/gndy ' \ ' /dyzz/list_23_{}.html ' # Get html performance function DEF get_html (Self, url): headers = { 'User-Agent':random.choice(ua_list) } req = request.Request(url=url,headers=headers) RES = request.urlopen (REQ) # through the website page source code, to view the site = charset 'GB2312' # If you encounter a decoding error, can not recognize some of the characters, ignore ignore HTML = res.read (). decode ( ' GB2312 ' , ' the ignore ' ) return html # Regular parsing function DEF re_func (Self, re_bds, HTML): pattern = re.compile(re_bds,re.S) r_list = pattern.findall(html) return r_list # Get Data function - html page is a response content DEF parse_page (Self, HTML): re_bds = r'<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>' # one_page_list: [('/html/xxx','幸福猎人'),()] one_page_list = self.re_func(re_bds,html) item = {} for film in one_page_list: item['name'] = film[1].strip() link = ' https://www.dytt8.net ' + film [0] item['download'] = self.parse_two_page(link) # Uniform: float, after crawling a movie information SLEEP the time.sleep (random.uniform (1, 3 )) print(item) # Analytical Data page two DEF parse_two_page (Self, Link): html = self.get_html(link) re_bds = r'<td style="WORD-WRAP.*?>.*?>(.*?)</a>' # two_page_list: ['ftp://xxxx.mkv'] two_page_list = self.re_func(re_bds,html) download = two_page_list[0].strip() return download def main(self): for page in range(1,11): url = self.url.format(page) html = self.get_html(url) self.parse_page(html) # Uniform: Float the time.sleep (random.uniform (1, 3 )) if __name__ == '__main__': spider = FilmSkySpider() spider.main()