Crawling Movie Paradise for the latest movie names and download links

The goal is to crawl the names and download links of the latest 200 movies on Movie Paradise (dytt8.net). Each download link sits on a second-level detail page, so we first match all the detail-page links on each index page, then request those detail pages one by one. The code is as follows:

"" " 
    Crawling movie Paradise 2019 movie name and link 
" "" 
Import Requests
 Import CSV
 from fake_useragent Import UserAgent
 from lxml Import etree
 Import Re
 Import Time
 Import Random 


class DianyingtiantangSpider(object):
    def __init__(self):
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        # self.url2 = 'https://www.dytt8.net/html/gndy/dyzz/20190918/59138.html'

    def get_headers(self):
        """
            Build the request headers
        :return:
        """
        ua = UserAgent()
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "UM_distinctid=16bdec86bc2679-07c211dd7aebc-15231708-1fa400-16bdec86bc3464; CNZZDATA1260535040=961678368-1562805532-https%253A%252F%252Fwww.baidu.com%252F%7C1562805532",
            "Host": "www.dytt8.net",
            "If-Modified-Since": "Thu, 19 Sep 2019 00:34:23 GMT",
            "If-None-Match": "80d1b3fb816ed51:326",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": ua.random
        }
        return headers

    def re_func(self, re_bds, html):
        """
            Parse the detail-page links out of the index page with a regular expression
        :param re_bds:
        :param html:
        :return:
        """
        pattern = re.compile(re_bds, re.S)
        r_list = pattern.findall(html)

        return r_list

    def parse_index_page(self, url):
        """
            Request an index page and parse out the detail-page links
        :param url:
        :return:
        """
        text = requests.get(url=url, headers=self.get_headers())
        text.encoding = 'GBK'
        # print(text.text)
        re_bds = r'<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">.*?</table>'
        # link = re.findall('<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">.*?</table>', text.text)
        # html = etree.HTML(text.text)
        # link = html.xpath("//ul/table[@class='tbspan'][1]/tbody/tr[4]/td/text()")
        link = self.re_func(re_bds, text.text)
        print(link)
        # print(text)
        return link

    def parse_two_page(self, url):
        """
            Request a detail page and parse out the movie name and download link
        :param url:
        :return:
        """
        text = requests.get(url=url, headers=self.get_headers())
        # print(text.text)
        text.encoding = 'GBK'

        html = etree.HTML(text.text)
        movie = html.xpath('//*[@id="header"]/div/div[3]/div[3]/div[1]/div[2]/div[1]/h1/font/text()')
        download = html.xpath('//tbody/tr/td/a/@href')
        # movie=re.findall("",text.text)
        print(movie)
        print(download)
        # print(html)
        return (movie[0], download[0])

    def save_csv(self, result):
        """
            Save the results to a CSV file
        :param result:
        :return:
        """
        with open('movie.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(result)

    def run(self):
        """
            Main function
        :return:
        """
        for i in range(1, 201):
            url1 = self.url.format(i)
            list_result = []
            link = self.parse_index_page(url1)
            for j in link:
                url2 = 'https://www.dytt8.net' + j
                try:
                    result = self.parse_two_page(url2)
                    print(result)
                    list_result.append(result)
                    time.sleep(random.uniform(1, 3))
                except Exception as err:
                    print(err)
            self.save_csv(list_result)
            print("Page %s saved successfully" % i)


if __name__ == '__main__':
    spider = DianyingtiantangSpider()
    spider.run()
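
Before kicking off the full 200-page crawl, it can help to smoke-test the detail-page parser on a single page. A minimal sketch, reusing the sample URL commented out in __init__ above (that exact page may have gone stale by now):

spider = DianyingtiantangSpider()
# One-off check of the detail-page parser before running the full crawl
print(spider.parse_two_page('https://www.dytt8.net/html/gndy/dyzz/20190918/59138.html'))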

First-level (index) page: (screenshot omitted)

Second-level (detail) page: (screenshots omitted)

To sum up:

1. While crawling Movie Paradise I found that the page source returned by the request was actually missing some tags, so at first nothing would match. XPath is not a panacea, especially on broken pages like this. That is when you reach for the killer tool, regular expressions: as long as something appears in the page source, a regex can extract it. Of course, a regex match sometimes returns duplicate values; in that case Python's set() takes care of the deduplication, as the sketch below shows.
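
A minimal sketch of the regex-plus-set() idea, using a simplified stand-in snippet rather than the site's real markup:

import re

html = '<a href="/a.html">A</a><a href="/a.html">A</a><a href="/b.html">B</a>'
links = re.findall(r'<a href="(.*?)">', html)
print(links)       # ['/a.html', '/a.html', '/b.html'] -- duplicates
print(set(links))  # {'/a.html', '/b.html'} -- duplicates removed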

2. One more problem: Movie Paradise is encoded as gb2312, so the Chinese in the requested pages was garbled at first. After some searching I found that adding the following line resolves it:

text = requests.get(url=url, headers=self.get_headers())
text.encoding = 'GBK'
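
An alternative, if you would rather not hard-code the encoding: requests can guess it from the response body via apparent_encoding. A sketch under the same request as above:

text = requests.get(url=url, headers=self.get_headers())
# Let requests detect the encoding (it reports GB2312/GBK for this site)
text.encoding = text.apparent_encoding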

 

Origin: www.cnblogs.com/lattesea/p/11580585.html