Web Scraping in Practice: Scraping Movie Detail Page Information from 电影天堂 (dytt8.net)

Approach

1. Get the URL of each listing page

URL of the first page: https://www.dytt8.net/html/gndy/dyzz/index.html
URL of every page after that: https://www.dytt8.net/html/gndy/dyzz/list_23_%d.html (where %d is replaced with the page number)

url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_%d.html' % i
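
For example, the full list of listing-page URLs can be built up front. The build_page_urls helper below is introduced here for illustration and does not appear in the original script:

def build_page_urls(end_index):
    # Page 1 lives at index.html; pages 2 and up follow the list_23_%d.html pattern
    urls = ['https://www.dytt8.net/html/gndy/dyzz/index.html']
    for i in range(2, end_index + 1):
        urls.append('https://www.dytt8.net/html/gndy/dyzz/list_23_%d.html' % i)
    return urls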

2. Get the URLs of the movie detail pages

Extract the relevant URLs with XPath; every movie title on the listing page is an <a class="ulink"> link:

def get_url(page_url):
    page_url_list = []
    try:
        response = requests.get(page_url, headers=headers)
    except requests.RequestException:
        print(page_url + " request failed!")
        return []
    html = etree.HTML(response.text)
    # Each movie title on the listing page is an <a class="ulink"> element
    a = html.xpath("//a[@class='ulink']")
    for it in a:
        href_list = it.xpath("@href")
        if len(href_list) > 0:
            href = href_list[0]
            page_url_list.append(HOST + href)
    print(page_url_list)
    return page_url_list

3. Fetch each movie detail page

Send a request to each detail-page URL collected in the previous step; the site serves its pages GBK-encoded, so the response encoding must be set explicitly, as the short sketch below shows.
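
A minimal sketch of this step, reusing the headers dict defined in the complete code; detail_url is a placeholder for a URL returned by get_url(), and the timeout is an assumed safeguard:

r = requests.get(detail_url, headers=headers, timeout=10)
r.encoding = 'gbk'  # dytt8.net serves GBK-encoded pages
html = etree.HTML(r.text)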

4. Extract the information from the page

Use XPath to pull out the fields we need; each field in the page body is a text line starting with a '◎' label, so we match on those prefixes:


def get_detail(page_url):
    try:
        r = requests.get(page_url, headers=headers)
    except requests.RequestException:
        print(page_url + " request failed")
        return {}
    r.encoding = 'gbk'  # detail pages are served GBK-encoded
    html = etree.HTML(r.text)
    p_list = html.xpath("//p")
    if len(p_list) == 0:
        return {}
    p = p_list[0]
    # Poster image and download link, with 'null' as a fallback when missing
    img_list = p.xpath("./img/@src")
    img = img_list[0] if img_list else 'null'
    download_url_list = p.xpath("./a/@href")
    download_url = download_url_list[0] if download_url_list else 'null'
    movie_info = {'image': img}
    info = p.xpath("./text()")
    if len(info) == 0:
        return {}
    # Every field line in the page body starts with a '◎' label
    for index, item in enumerate(info):
        if item.startswith('◎译  名'):
            movie_info['translated_names'] = parse_info('◎译  名', item)
        elif item.startswith('◎片  名'):
            movie_info['title'] = parse_info('◎片  名', item)
        elif item.startswith('◎年  代'):
            movie_info['year'] = parse_info('◎年  代', item)
        elif item.startswith('◎产  地'):
            movie_info['place'] = parse_info('◎产  地', item)
        elif item.startswith('◎类  别'):
            movie_info['category'] = parse_info('◎类  别', item)
        elif item.startswith('◎语  言'):
            movie_info['language'] = parse_info('◎语  言', item)
        elif item.startswith('◎上映日期'):
            movie_info['release_date'] = parse_info('◎上映日期', item)
        elif item.startswith('◎IMDb评分'):
            movie_info['IMDB_score'] = parse_info('◎IMDb评分', item)
        elif item.startswith('◎豆瓣评分'):
            movie_info['score'] = parse_info('◎豆瓣评分', item)
        elif item.startswith('◎片  长'):
            movie_info['duration'] = parse_info('◎片  长', item)
        elif item.startswith('◎导  演'):
            movie_info['director'] = parse_info('◎导  演', item)
        elif item.startswith('◎编  剧'):
            movie_info['screenwriter'] = parse_info('◎编  剧', item)
        elif item.startswith('◎主  演'):
            # The cast list spans several consecutive text nodes,
            # up to the next '◎' field
            actor = [parse_info('◎主  演', item)]
            for i in range(index + 1, len(info)):
                if info[i].startswith('◎'):
                    break
                actor.append(info[i].strip())
            movie_info['actor'] = actor
        elif item.startswith('◎标  签'):
            movie_info['label'] = parse_info('◎标  签', item)
        elif item.startswith('◎简  介'):
            # The synopsis likewise spans several text nodes
            information = ""
            for i in range(index + 1, len(info)):
                if info[i].startswith('◎'):
                    break
                information += info[i].strip()
            movie_info['information'] = information
    movie_info['download_url'] = download_url
    # 'image' and 'download_url' are always set, so fewer than 3 keys
    # means no field was actually parsed
    if len(movie_info) < 3:
        return {}
    print(movie_info)
    return movie_info
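
A quick usage sketch chaining the two functions (illustrative only):

detail_urls = get_url(index_url)    # detail-page URLs from the first listing page
movie = get_detail(detail_urls[0])  # dict of extracted fields for the first movie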

5. Save the extracted information to a file

Convert the list of dicts to a DataFrame and write it out as a CSV file; the mode and header parameters control whether we create the file with a header row or append to it:

def save_to_file(movie_list, mode, header):
    # Note: the ./result directory must already exist; to_csv will not create it
    df = pd.DataFrame(movie_list)
    df.to_csv('./result/Movie_Tian.csv', index=False, encoding='utf_8_sig', mode=mode, header=header)
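
The calling convention is write-then-append, exactly as spider() in the complete code does; first_batch and next_batch are placeholder names:

save_to_file(first_batch, 'w', True)   # first batch: create the file and write the header
save_to_file(next_batch, 'a', False)   # later batches: append rows without a header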

Complete code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time:    2020/2/1 21:36
# @Author:  Martin
# @File:    Movie_Tian.py
# @Software:PyCharm
import requests
from lxml import etree
import pandas as pd
HOST = 'https://www.dytt8.net'
index_url = 'https://www.dytt8.net/html/gndy/dyzz/index.html'
raw_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_%d.html'
headers = {
    'Referer': 'https://www.dytt8.net/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}


def get_url(page_url):
    page_url_list = []
    try:
        response = requests.get(page_url, headers=headers)
    except requests.RequestException:
        print(page_url + " request failed!")
        return []
    html = etree.HTML(response.text)
    # Each movie title on the listing page is an <a class="ulink"> element
    a = html.xpath("//a[@class='ulink']")
    for it in a:
        href_list = it.xpath("@href")
        if len(href_list) > 0:
            href = href_list[0]
            page_url_list.append(HOST + href)
    print(page_url_list)
    return page_url_list


def get_detail(page_url):
    try:
        r = requests.get(page_url, headers=headers)
    except requests.RequestException:
        print(page_url + " request failed")
        return {}
    r.encoding = 'gbk'  # detail pages are served GBK-encoded
    html = etree.HTML(r.text)
    p_list = html.xpath("//p")
    if len(p_list) == 0:
        return {}
    p = p_list[0]
    # Poster image and download link, with 'null' as a fallback when missing
    img_list = p.xpath("./img/@src")
    img = img_list[0] if img_list else 'null'
    download_url_list = p.xpath("./a/@href")
    download_url = download_url_list[0] if download_url_list else 'null'
    movie_info = {'image': img}
    info = p.xpath("./text()")
    if len(info) == 0:
        return {}
    # Every field line in the page body starts with a '◎' label
    for index, item in enumerate(info):
        if item.startswith('◎译  名'):
            movie_info['translated_names'] = parse_info('◎译  名', item)
        elif item.startswith('◎片  名'):
            movie_info['title'] = parse_info('◎片  名', item)
        elif item.startswith('◎年  代'):
            movie_info['year'] = parse_info('◎年  代', item)
        elif item.startswith('◎产  地'):
            movie_info['place'] = parse_info('◎产  地', item)
        elif item.startswith('◎类  别'):
            movie_info['category'] = parse_info('◎类  别', item)
        elif item.startswith('◎语  言'):
            movie_info['language'] = parse_info('◎语  言', item)
        elif item.startswith('◎上映日期'):
            movie_info['release_date'] = parse_info('◎上映日期', item)
        elif item.startswith('◎IMDb评分'):
            movie_info['IMDB_score'] = parse_info('◎IMDb评分', item)
        elif item.startswith('◎豆瓣评分'):
            movie_info['score'] = parse_info('◎豆瓣评分', item)
        elif item.startswith('◎片  长'):
            movie_info['duration'] = parse_info('◎片  长', item)
        elif item.startswith('◎导  演'):
            movie_info['director'] = parse_info('◎导  演', item)
        elif item.startswith('◎编  剧'):
            movie_info['screenwriter'] = parse_info('◎编  剧', item)
        elif item.startswith('◎主  演'):
            # The cast list spans several consecutive text nodes,
            # up to the next '◎' field
            actor = [parse_info('◎主  演', item)]
            for i in range(index + 1, len(info)):
                if info[i].startswith('◎'):
                    break
                actor.append(info[i].strip())
            movie_info['actor'] = actor
        elif item.startswith('◎标  签'):
            movie_info['label'] = parse_info('◎标  签', item)
        elif item.startswith('◎简  介'):
            # The synopsis likewise spans several text nodes
            information = ""
            for i in range(index + 1, len(info)):
                if info[i].startswith('◎'):
                    break
                information += info[i].strip()
            movie_info['information'] = information
    movie_info['download_url'] = download_url
    # 'image' and 'download_url' are always set, so fewer than 3 keys
    # means no field was actually parsed
    if len(movie_info) < 3:
        return {}
    print(movie_info)
    return movie_info


def parse_info(string, item):
    # Strip the field label (e.g. '◎片  名') and surrounding whitespace
    return item.replace(string, "").strip()


def spider():
    # Set the last page to scrape (the list currently has 208 pages); here we take the first 10
    end_index = 10
    # Page 1 (index.html): the first record creates the CSV and writes the header
    all_url = get_url(index_url)
    for index, page_url in enumerate(all_url):
        if index == 0:
            save_to_file([get_detail(page_url)], 'w', True)
        else:
            save_to_file([get_detail(page_url)], 'a', False)
    # Pages 2 through end_index: append without a header
    for i in range(2, end_index + 1):
        all_url = get_url(raw_url % i)
        for page_url in all_url:
            save_to_file([get_detail(page_url)], 'a', False)


def save_to_file(movie_list, mode, header):
    # Note: the ./result directory must already exist; to_csv will not create it
    df = pd.DataFrame(movie_list)
    df.to_csv('./result/Movie_Tian.csv', index=False, encoding='utf_8_sig', mode=mode, header=header)


if __name__ == '__main__':
    spider()

Results

[Screenshot of the resulting Movie_Tian.csv output]

Notes

1. When sending requests with the requests library, wrap the call in a try statement so that a flaky network connection cannot crash the program; a minimal retry sketch follows these notes.
2. When extracting data with XPath, not every page is guaranteed to share the same structure, so always handle the case where a query returns nothing, to avoid index-out-of-range errors.
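
As an illustration of note 1, a minimal retry wrapper; the function name, retry count, and timeout here are illustrative assumptions, not part of the original script:

def get_with_retry(url, retries=3, timeout=10):
    # Try the request a few times before giving up (retries/timeout are illustrative)
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            print("attempt %d failed for %s" % (attempt + 1, url))
    return None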



虐猫人薛定谔i, written at home on February 2, 2020

