Scraping the hottest Maoyan movies with a Python web crawler

watch movie. . .

  • The genres and running times of the movies a person prefers are related to age. Popular movies cover a wider range of themes and genres, so to a certain extent they have a higher reference value.
  • Maoyan is a relatively good movie platform whose listings have overall reference value.

Anyone familiar with crawlers already knows the approach to analyzing web page data, so the code is not explained in detail here.
A raw script, however, is not easy for ordinary people to use.
It needs to be packaged into a program file that they can run directly,
which lowers the barrier to use.
The pyinstaller library can package Python code into an .exe file for use on the Windows platform.

Packaging process: Python .py files are packaged into .exe files (Windows platform, python 3.x)

The code is pasted below. If you don't understand it, read the comments carefully; if it still doesn't make sense, please study the basics of web crawling on your own.

# -*- coding: utf-8 -*-
"""
 @Time : 2020/7/26 9:51
 @File : maoyan.py 
 @Software: PyCharm
"""
import os
import time
import random
import requests
from lxml import etree
from fake_useragent import UserAgent


class MaoyanSpider(object):
    """Scrape the Maoyan film listing and append every entry to a local file.

    Each listing page is downloaded with ``requests``, parsed with ``lxml``
    and every ``<dd>`` entry found under the ``movie-list`` element is
    formatted, printed and appended to ``./最新猫眼电影.doc``.

    Attributes:
        url: Listing URL template; ``{}`` receives the result offset.
        ua: ``fake_useragent.UserAgent`` used to pick a User-Agent string.
        headers: HTTP headers sent with every request.
        page: 1-based counter of the page currently being processed.
    """

    # File that receives the scraped records (opened in append mode).
    _OUTPUT_FILE = './最新猫眼电影.doc'

    # Text layout of one record.  The 35 trailing spaces (and the missing
    # final newline) reproduce the original output format exactly.
    _RECORD_TEMPLATE = (
        '【即将上映】\n'
        '\n'
        '电影名字: %s\n'
        '主演:%s\n'
        '类型:%s\n'
        '详情链接:https://maoyan.com%s\n'
        '=========================================================\n'
        + ' ' * 35
    )

    def __init__(self):
        """Set up the URL template, the User-Agent source and the page counter."""
        self.url = 'https://maoyan.com/films?showType=2&offset={}'
        # Keep the generator so get_page can pick a new User-Agent per request.
        self.ua = UserAgent(verify_ssl=False)
        self.headers = {
            'User-Agent': self.ua.random,
        }
        # Page counter used for the progress message in main().
        self.page = 1

    def get_page(self, url):
        """Download *url* and hand the decoded HTML to :meth:`parse_page`.

        Args:
            url: Fully formatted listing URL.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                does not complete within the timeout.
        """
        # Pick a fresh random User-Agent for every request.
        self.headers['User-Agent'] = self.ua.random
        # A timeout keeps an unresponsive server from blocking the crawl forever.
        res = requests.get(url, headers=self.headers, timeout=10)
        res.encoding = 'utf-8'
        self.parse_page(res.text)

    @staticmethod
    def _text_at(nodes, index):
        """Return ``nodes[index]`` stripped, or ``''`` when it does not exist."""
        try:
            return nodes[index].strip()
        except IndexError:
            return ''

    def parse_page(self, html):
        """Extract every film entry from *html*, then print and save it.

        A field that cannot be found in an entry is written as an empty
        string instead of aborting the whole page.

        Args:
            html: HTML text of one listing page.
        """
        # Build the parse tree.
        parse_html = etree.HTML(html)
        # Base node list: one <dd> per film.
        dd_list = parse_html.xpath('//dl[@class="movie-list"]//dd')
        print(len(dd_list))
        # Walk the nodes one by one and pull out the fields.
        for dd in dd_list:
            name = self._text_at(
                dd.xpath('.//div[@class="movie-hover-title"]//span[@class="name noscore"]/text()'), 0)
            star = self._text_at(
                dd.xpath('.//div[@class="movie-hover-info"]//div[@class="movie-hover-title"][3]/text()'), 1)
            genre = self._text_at(
                dd.xpath('.//div[@class="movie-hover-info"]//div[@class="movie-hover-title"][2]/text()'), 1)
            link = self._text_at(
                dd.xpath('.//div[@class="movie-item-hover"]/a/@href'), 0)
            movie = self._RECORD_TEMPLATE % (name, star, genre, link)
            print(movie)
            # Save through this instance rather than a module-level global.
            self.file(movie)

    def file(self, movie):
        """Append *movie* to ``./最新猫眼电影.doc``.

        Args:
            movie: Record to store; it is converted with ``str()``.
        """
        # Append mode creates the file when it does not exist yet, so no
        # separate (platform-specific) creation step is needed.
        with open(self._OUTPUT_FILE, 'a', encoding='utf-8') as f:
            f.write(str(movie))

    def main(self, pages=3, page_size=30):
        """Crawl the listing pages one after another.

        Args:
            pages: Number of listing pages to fetch.
            page_size: Offset step between two consecutive pages.
        """
        for offset in range(0, pages * page_size, page_size):
            url = self.url.format(offset)
            self.get_page(url)
            print(url)
            print('第%d页完成' % self.page)
            # Random delay between requests.
            time.sleep(random.randint(1, 3))
            self.page += 1


if __name__ == '__main__':
    # Script entry point: build the spider and run the crawl.
    # NOTE(review): keep the instance bound to the module-level name
    # `spider`; other code in this file may refer to it by that name --
    # verify before renaming or inlining it.
    spider = MaoyanSpider()
    spider.main()

Guess you like

Origin blog.csdn.net/qq_43562262/article/details/107590039