电影天堂 (Movie Heaven) Mini Crawler

Purpose

Crawl the latest movie listings from 电影天堂 (ygdy8.net) with Python and take a quick look at the data. The movie information is first saved to a .csv file and then profiled with the third-party pandas_profiling library.
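
The script depends on requests, lxml, pandas, and pandas_profiling. As a quick illustration of the analysis step (toy data here, not scraped from the site), pandas_profiling turns any DataFrame into a standalone HTML report:

```python
import pandas as pd
import pandas_profiling

# Toy DataFrame standing in for the scraped movie table
df = pd.DataFrame({'year': ['2019', '2018'], 'category': ['剧情', '喜剧']})
report = pandas_profiling.ProfileReport(df)
report.to_file('demo.html')  # self-contained HTML overview: types, missing values, distributions
```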

Code

```python
from lxml import etree
import requests
import pandas_profiling
import pandas as pd

# Request headers: pretend to be a regular browser
HEADERS = {
    'User-Agent': ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) '
                   'Gecko/20100101 Firefox/68.0')
}
BASE_DOMAIN = 'https://www.ygdy8.net'
#url = BASE_DOMAIN + '/html/gndy/dyzz/index.html'


def get_detail_urls(url):
    # Fetch the list page; the site serves GBK, so ignore any stray bytes
    resp = requests.get(url, headers=HEADERS)
    text = resp.content.decode('gbk', errors='ignore')

    html = etree.HTML(text)
    # Collect the links to each movie's detail page and make them absolute
    details_urls = html.xpath("//table[@class='tbspan']//a/@href")
    details_urls = [BASE_DOMAIN + path for path in details_urls]
    return details_urls
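
# Example: get_detail_urls('https://www.ygdy8.net/html/gndy/dyzz/list_23_1.html')
# returns absolute detail-page URLs such as
# 'https://www.ygdy8.net/html/gndy/dyzz/...' (one per movie on that page).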


def spider(filename_csv):
    movies = []
    base_url = 'https://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
    for x in range(1, 2):  # only the first list page; widen the range for more
        url = base_url.format(x)
        details_urls = get_detail_urls(url)
        for details_url in details_urls:
            movie = parser_detail_page(details_url)
            movies.append(movie)
    # index=False keeps a spurious "Unnamed: 0" column out of the CSV
    pd.DataFrame(movies).to_csv(filename_csv, index=False)


def parser_detail_page(url):
    movie = {}
    resp = requests.get(url, headers=HEADERS)
    text = resp.content.decode('gbk', errors='ignore')  # detail pages are GBK too
    html = etree.HTML(text)
    title = html.xpath("//title/text()")[0]
    movie['title'] = title

    div_zoom = html.xpath("//div[@id='Zoom']")[0]
    # Most detail pages embed two images: the poster first, a screenshot second
    imgs = div_zoom.xpath(".//img/@src")
    movie['cover'] = imgs[0] if imgs else None
    movie['screenshot'] = imgs[1] if len(imgs) > 1 else None

    infos = div_zoom.xpath(".//text()")
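    # Metadata shows up as plain-text lines such as "◎年  代 2019" or
    # "◎类  别 剧情/犯罪" (with full-width spaces), hence the exact
    # startswith/replace strings below.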
    for info in infos:
        if info.startswith("◎年  代"):
            year = info.replace('◎年  代 ', '')
            movie['year'] = year
        if info.startswith("◎类  别 "):
            category = info.replace('◎类  别 ', '')
            movie['category'] = category
    return movie


if __name__ == "__main__":
    spider('dytt.csv')
    data = pd.read_csv('dytt.csv')
    report = pandas_profiling.ProfileReport(data)
    report.to_file('dytt.html')
```
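
If you widen the page range, it is worth being both polite and resilient when hitting dozens of detail pages. Below is a minimal sketch of a shared download helper; the name `fetch` and the retry/delay values are my own choices, not part of the original script:

```python
import time
import requests

def fetch(url, retries=3, delay=1.0):
    """GET a page with a timeout, simple retries, and a polite pause."""
    # HEADERS is the User-Agent dict defined in the script above
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            resp.raise_for_status()            # surface HTTP errors as exceptions
            time.sleep(delay)                  # don't hammer the server
            return resp.content.decode('gbk', errors='ignore')
        except requests.RequestException:
            if attempt == retries - 1:
                raise                          # out of retries: give up loudly
            time.sleep(delay * (attempt + 1))  # linear backoff between attempts
```

Both get_detail_urls and parser_detail_page could then call fetch(url) instead of calling requests.get directly.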

Reposted from blog.csdn.net/weixin_44048823/article/details/100057016