This version builds on the previous one: it adds persistent storage of the results into MySQL and implements an incremental, resume-from-breakpoint crawl feature.
import random
import re
import time
from hashlib import md5

import pymysql
import requests
from fake_useragent import UserAgent
from lxml import etree


class DianyingtiantangSpider(object):
    """Incremental spider for dytt8.net movie listings.

    Crawls list pages, follows each detail-page link, and stores
    (movie name, download link) rows in MySQL. An md5 fingerprint of
    every crawled detail URL is persisted so that a restarted run
    skips pages that were already processed (resumable crawling).
    """

    def __init__(self):
        # List-page URL template; {} is the page number.
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        # NOTE(review): fill in real MySQL credentials before running.
        self.db = pymysql.connect(host='127.0.0.1',
                                  port=3306,
                                  user='root',
                                  password='database password',
                                  database='filmskydb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def get_headers(self):
        """Build a request-header dict with a random User-Agent.

        :return: dict suitable for requests.get(headers=...)
        """
        ua = UserAgent()
        headers = {
            "User-Agent": ua.random,
        }
        return headers

    def parse_page(self, url):
        """Parse one list page and crawl every new detail page it links to.

        :param url: absolute URL of a list page
        :return: None
        """
        text = requests.get(url=url, headers=self.get_headers())
        # Site serves GBK-encoded pages; override requests' guess.
        text.encoding = 'gbk'
        # Regex that captures the relative href of each detail-page link.
        re_bds = r'<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">.*?</table>'
        pattern = re.compile(re_bds, re.S)
        link_list = pattern.findall(text.text)
        for link in link_list:
            two_url = 'https://www.dytt8.net' + link
            # Fingerprint the detail URL so repeat runs can skip it.
            s = md5()
            s.update(two_url.encode())
            two_url_md5 = s.hexdigest()
            # Only crawl when the fingerprint is not yet in the database.
            if self.judge_repetition(two_url_md5):
                self.parse_two_page(two_url)
                # Persist the fingerprint only after a successful crawl.
                ins = 'insert into request_finger values (%s)'
                self.cursor.execute(ins, [two_url_md5])
                self.db.commit()
                # Random delay between detail-page fetches.
                time.sleep(random.uniform(1, 3))

    def judge_repetition(self, two_url_md5):
        """Check whether a URL fingerprint is already stored.

        :param two_url_md5: md5 hexdigest of a detail-page URL
        :return: True when the fingerprint is NOT in the database
                 (i.e. the page still needs crawling), otherwise None
        """
        sel = 'select finger from request_finger where finger=%s'
        result = self.cursor.execute(sel, [two_url_md5])
        if not result:
            return True

    def parse_two_page(self, two_url):
        """Extract movie name and download link from a detail page and save them.

        :param two_url: absolute URL of the detail page
        :return: None
        """
        text = requests.get(url=two_url, headers=self.get_headers())
        text.encoding = 'gbk'
        html = etree.HTML(text.text)
        movie = html.xpath('//*[@id="header"]/div/div[3]/div[3]/div[1]/div[2]/div[1]/h1/font/text()')
        download = html.xpath('//tbody/tr/td/a/@href')
        ins = 'insert into filmtab values(%s,%s)'
        # movie and download are each expected to hold one item,
        # matching filmtab's two columns.
        film_list = movie + download
        self.cursor.execute(ins, film_list)
        self.db.commit()
        print(film_list)

    def run(self):
        """Main loop: crawl list pages 1-200 with a random delay between pages."""
        for page in range(1, 201):
            one_url = self.url.format(page)
            self.parse_page(one_url)
            time.sleep(random.uniform(1, 3))


if __name__ == '__main__':
    spider = DianyingtiantangSpider()
    spider.run()
Create the database and tables in advance, as follows:
-- Database and tables used by the spider.
create database filmskydb charset utf8;
use filmskydb;

-- Fingerprints (md5 hexdigests) of detail-page URLs already crawled.
create table request_finger(
    finger char(32)
)charset=utf8;

-- One row per movie: display name and its download link.
create table filmtab(
    name varchar(200),
    download varchar(500)
)charset=utf8;
Summary: 1. The principle of incremental crawling is actually very simple: store every crawled URL in a database, and on the next run compare each candidate URL against the stored ones, skipping those already crawled. This achieves resumable ("breakpoint") crawling. It matters a great deal: if the machine crashes partway through crawling a large amount of data, you would otherwise have to start over from the beginning, which is a significant hit to productivity.
2. The "fingerprint" here is simply a unique string produced by md5-hashing the crawled URL, used later for comparison against new URLs.