Movie Heaven (dytt8.net) crawler: scrapes the latest movie names and download links (incremental crawl, MySQL-stored version)

This program builds on the previous version by persisting results to MySQL and adding an incremental (resumable) crawl feature based on URL fingerprints.

import requests
import re
from fake_useragent import UserAgent
import random
import time
import pymysql
from hashlib import md5
from lxml import etree


class DianyingtiantangSpider(object):
    """Incremental spider for dytt8.net: scrapes movie titles and download
    links from the "latest movies" list pages and stores them in MySQL.

    Previously-seen detail-page URLs are recorded as MD5 fingerprints in
    the ``request_finger`` table, so an interrupted crawl can resume
    without re-fetching pages that were already processed.
    """

    def __init__(self):
        # List-page URL template; {} is the page number (1..200).
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        # NOTE(review): fill in real MySQL credentials before running.
        self.db = pymysql.connect(host='127.0.0.1', port=3306,
                                  user='root', password='database password',
                                  database='filmskydb', charset='utf8')
        self.cursor = self.db.cursor()

    def get_headers(self):
        """Build request headers with a random User-Agent.

        :return: dict suitable for requests' ``headers=`` argument.
        """
        ua = UserAgent()
        headers = {
            'User-Agent': ua.random,
        }
        return headers

    def parse_page(self, url):
        """Parse one list (level-1) page: extract each detail-page link
        and process the links that have not been crawled before.

        :param url: absolute URL of a list page.
        :return: None
        """
        text = requests.get(url=url, headers=self.get_headers())
        # The site serves GBK-encoded pages; set the encoding explicitly.
        text.encoding = 'gbk'
        # Regex that captures the relative href of each movie detail page.
        re_bds = r'<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">.*?</table>'
        pattern = re.compile(re_bds, re.S)
        link_list = pattern.findall(text.text)
        for link in link_list:
            two_url = 'https://www.dytt8.net' + link
            # Generate an MD5 fingerprint of the detail-page URL.
            s = md5()
            s.update(two_url.encode())
            two_url_md5 = s.hexdigest()
            # Only crawl the page if its fingerprint is not in the database.
            if self.judge_repetition(two_url_md5):
                self.parse_two_page(two_url)
                # Record the fingerprint so this URL is skipped next time.
                ins = 'insert into request_finger values(%s)'
                self.cursor.execute(ins, [two_url_md5])
                # Remember to commit so the insert is persisted.
                self.db.commit()
                # Random delay between requests to avoid hammering the site.
                time.sleep(random.uniform(1, 3))

    def judge_repetition(self, two_url_md5):
        """Check whether a URL fingerprint is already stored.

        :param two_url_md5: hex MD5 digest of a detail-page URL.
        :return: True if the fingerprint is NOT in the database (i.e. the
            URL still needs to be crawled); otherwise None (falsy).
        """
        sel = 'select finger from request_finger where finger=%s'
        result = self.cursor.execute(sel, [two_url_md5])
        if not result:
            return True

    def parse_two_page(self, two_url):
        """Extract the movie title and download link from a detail
        (level-2) page and insert them into the ``filmtab`` table.

        :param two_url: absolute URL of a movie detail page.
        :return: None
        """
        text = requests.get(url=two_url, headers=self.get_headers())
        text.encoding = 'gbk'

        html = etree.HTML(text.text)
        movie = html.xpath('//*[@id="header"]/div/div[3]/div[3]/div[1]/div[2]/div[1]/h1/font/text()')
        download = html.xpath('//tbody/tr/td/a/@href')
        # Guard against layout changes: skip pages missing either field.
        if not movie or not download:
            return
        ins = 'insert into filmtab values(%s,%s)'
        # Take the first match of each; the download XPath can return
        # several links, which would break the two-placeholder INSERT.
        film_list = [movie[0], download[0]]
        self.cursor.execute(ins, film_list)
        self.db.commit()
        print(film_list)

    def run(self):
        """Entry point: crawl list pages 1..200 with a random delay
        between pages.

        :return: None
        """
        for page in range(1, 201):
            one_url = self.url.format(page)
            self.parse_page(one_url)
            time.sleep(random.uniform(1, 3))


if __name__ == '__main__':
    # Run the incremental crawl end to end.
    crawler = DianyingtiantangSpider()
    crawler.run()

Create the database and tables in advance if needed, as follows:

-- Database for the Movie Heaven spider.
create database filmskydb charset utf8;
use filmskydb;
-- Fingerprint table: one MD5 hex digest (32 chars) per crawled detail-page URL.
create table request_finger(
finger char(32)
)charset=utf8;
-- Result table: movie title and its download link.
create table filmtab(
name varchar(200),
download varchar(500)
)charset=utf8;

Summary: 1. The principle of incremental crawling is actually very simple: store every crawled URL (as a fingerprint) in the database, and on the next run compare new URLs against the stored ones, skipping any that have already been crawled. This makes the crawl resumable after an interruption, which matters a great deal when crawling large amounts of data — if the machine crashes partway through, you do not have to start over from scratch, so the efficiency gain is significant.

2. The "fingerprint" is a unique string produced by MD5-hashing each crawled URL; it is what later URLs are compared against to detect duplicates.

Guess you like

Origin www.cnblogs.com/lattesea/p/11585648.html