爬取电影天堂的电影详情页信息
思路分析
1、获取每一个页面的URL地址
首页的URL:https://www.dytt8.net/html/gndy/dyzz/index.html
以后每一个页面的URL:https://www.dytt8.net/html/gndy/dyzz/list_23_%d.html(其中的%d应该替换为相应的页码)
url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_%d.html' % i
2、获取电影详情页面的URL
利用XPath提取相应的URL即可
def get_url(page_url):
    """Collect absolute detail-page URLs from one list page.

    Parameters
    ----------
    page_url : str
        URL of a movie list page.

    Returns
    -------
    list[str]
        Absolute URLs of the movie detail pages; [] when the request fails.
    """
    try:
        response = requests.get(page_url, headers=headers)
    except requests.RequestException:  # narrow catch: network/HTTP errors only
        print(page_url + "请求失败!")
        return []
    html = etree.HTML(response.text)
    anchors = html.xpath("//a[@class='ulink']")
    page_url_list = []
    for anchor in anchors:
        # Guard against anchors without an href attribute (avoids IndexError).
        hrefs = anchor.xpath("@href")
        if hrefs:
            page_url_list.append(HOST + hrefs[0])
    print(page_url_list)
    return page_url_list
3、爬取电影详情页面
根据拿到的电影详情页面的URL发送请求
4、从页面中提取信息
利用XPath提取需要的信息
def get_detail(page_url):
    """Fetch one movie detail page and extract its metadata.

    Parameters
    ----------
    page_url : str
        URL of the movie detail page.

    Returns
    -------
    dict
        Movie information keyed by field name; empty dict when the request
        fails or the page lacks the expected structure.
    """
    # Table mapping each single-line "◎" field label to its output key;
    # replaces a 13-branch elif chain of identical parse calls.
    simple_fields = {
        '◎译 名': 'translated_names',
        '◎片 名': 'title',
        '◎年 代': 'year',
        '◎产 地': 'place',
        '◎类 别': 'category',
        '◎语 言': 'language',
        '◎上映日期': 'release_date',
        '◎IMDb评分': 'IMDB_score',
        '◎豆瓣评分': 'score',
        '◎片 长': 'duration',
        '◎导 演 ': 'director',
        '◎编 剧': 'screenwriter',
        '◎标 签': 'label',
    }
    try:
        r = requests.get(page_url, headers=headers)
    except requests.RequestException:  # narrow catch: network/HTTP errors only
        print(page_url + "请求失败")
        return {}
    r.encoding = 'gbk'  # the site serves GBK-encoded pages
    html = etree.HTML(r.text)
    p_list = html.xpath("//p")
    if len(p_list) == 0:
        return {}
    p = p_list[0]
    img_list = p.xpath("./img/@src")
    img = img_list[0] if img_list else 'null'
    download_url_list = p.xpath("./a/@href")
    download_url = download_url_list[0] if download_url_list else "null"
    movie_info = {'image': img}
    info = p.xpath("./text()")
    if len(info) == 0:
        return {}
    for index, item in enumerate(info):
        matched_key = None
        for prefix, key in simple_fields.items():
            if item.startswith(prefix):
                movie_info[key] = parse_info(prefix, item)
                matched_key = key
                break
        if matched_key is not None:
            continue
        if item.startswith('◎主 演'):
            # Lead actor sits on the label line; the rest follow one per
            # text node until the next "◎" label.
            actor = [parse_info('◎主 演', item)]
            for i in range(index + 1, len(info)):
                if info[i].startswith('◎'):
                    break
                actor.append(info[i].strip())
            movie_info['actor'] = actor
        elif item.startswith('◎简 介 '):
            # The synopsis spans the following text nodes until the next
            # "◎" label; concatenate them into one string.
            information = ""
            for i in range(index + 1, len(info)):
                if info[i].startswith('◎'):
                    break
                information += info[i].strip()
            movie_info['information'] = information
    movie_info['download_url'] = download_url
    if len(movie_info) < 3:
        # Only image + download_url: the page yielded no real fields.
        return {}
    print(movie_info)
    return movie_info
5、将提取到的信息保存到文件
将列表保存成CSV文件即可
def save_to_file(movie_list, mode, header):
    """Write a batch of movie dicts to ./result/Movie_Tian.csv.

    Parameters
    ----------
    movie_list : list[dict]
        Rows to write.
    mode : str
        File mode: 'w' to (re)create the file, 'a' to append.
    header : bool
        Whether to write the column header row.
    """
    # Create the output directory up front; otherwise to_csv raises
    # FileNotFoundError on a fresh checkout.
    os.makedirs('./result', exist_ok=True)
    df = pd.DataFrame(movie_list)
    # utf_8_sig adds a BOM so Excel opens the Chinese text correctly.
    df.to_csv('./result/Movie_Tian.csv', index=False, encoding='utf_8_sig', mode=mode, header=header)
完整代码
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2020/2/1 21:36
# @Author: Martin
# @File: Movie_Tian.py
# @Software:PyCharm
import os

import pandas as pd
import requests
from lxml import etree
# Site root, used to turn relative detail-page hrefs into absolute URLs.
HOST = 'https://www.dytt8.net'
# First page of the movie listing.
index_url = 'https://www.dytt8.net/html/gndy/dyzz/index.html'
# Template for subsequent list pages; %d is the page number (2, 3, ...).
raw_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_%d.html'
# Request headers: Referer + desktop UA so requests resemble a normal browser visit.
headers = {
    'Referer': 'https://www.dytt8.net/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
def get_url(page_url):
    """Collect absolute detail-page URLs from one list page.

    Parameters
    ----------
    page_url : str
        URL of a movie list page.

    Returns
    -------
    list[str]
        Absolute URLs of the movie detail pages; [] when the request fails.
    """
    try:
        response = requests.get(page_url, headers=headers)
    except requests.RequestException:  # narrow catch: network/HTTP errors only
        print(page_url + "请求失败!")
        return []
    html = etree.HTML(response.text)
    anchors = html.xpath("//a[@class='ulink']")
    page_url_list = []
    for anchor in anchors:
        # Guard against anchors without an href attribute (avoids IndexError).
        hrefs = anchor.xpath("@href")
        if hrefs:
            page_url_list.append(HOST + hrefs[0])
    print(page_url_list)
    return page_url_list
def get_detail(page_url):
    """Fetch one movie detail page and extract its metadata.

    Parameters
    ----------
    page_url : str
        URL of the movie detail page.

    Returns
    -------
    dict
        Movie information keyed by field name; empty dict when the request
        fails or the page lacks the expected structure.
    """
    # Table mapping each single-line "◎" field label to its output key;
    # replaces a 13-branch elif chain of identical parse calls.
    simple_fields = {
        '◎译 名': 'translated_names',
        '◎片 名': 'title',
        '◎年 代': 'year',
        '◎产 地': 'place',
        '◎类 别': 'category',
        '◎语 言': 'language',
        '◎上映日期': 'release_date',
        '◎IMDb评分': 'IMDB_score',
        '◎豆瓣评分': 'score',
        '◎片 长': 'duration',
        '◎导 演 ': 'director',
        '◎编 剧': 'screenwriter',
        '◎标 签': 'label',
    }
    try:
        r = requests.get(page_url, headers=headers)
    except requests.RequestException:  # narrow catch: network/HTTP errors only
        print(page_url + "请求失败")
        return {}
    r.encoding = 'gbk'  # the site serves GBK-encoded pages
    html = etree.HTML(r.text)
    p_list = html.xpath("//p")
    if len(p_list) == 0:
        return {}
    p = p_list[0]
    img_list = p.xpath("./img/@src")
    img = img_list[0] if img_list else 'null'
    download_url_list = p.xpath("./a/@href")
    download_url = download_url_list[0] if download_url_list else "null"
    movie_info = {'image': img}
    info = p.xpath("./text()")
    if len(info) == 0:
        return {}
    for index, item in enumerate(info):
        matched_key = None
        for prefix, key in simple_fields.items():
            if item.startswith(prefix):
                movie_info[key] = parse_info(prefix, item)
                matched_key = key
                break
        if matched_key is not None:
            continue
        if item.startswith('◎主 演'):
            # Lead actor sits on the label line; the rest follow one per
            # text node until the next "◎" label.
            actor = [parse_info('◎主 演', item)]
            for i in range(index + 1, len(info)):
                if info[i].startswith('◎'):
                    break
                actor.append(info[i].strip())
            movie_info['actor'] = actor
        elif item.startswith('◎简 介 '):
            # The synopsis spans the following text nodes until the next
            # "◎" label; concatenate them into one string.
            information = ""
            for i in range(index + 1, len(info)):
                if info[i].startswith('◎'):
                    break
                information += info[i].strip()
            movie_info['information'] = information
    movie_info['download_url'] = download_url
    if len(movie_info) < 3:
        # Only image + download_url: the page yielded no real fields.
        return {}
    print(movie_info)
    return movie_info
def parse_info(string, item):
    """Remove the field label *string* from *item* and trim whitespace."""
    without_label = item.replace(string, "")
    return without_label.strip()
def spider():
    """Crawl the list pages, scrape every detail page, and stream rows to CSV.

    The first successfully scraped movie creates the CSV with a header;
    every later movie is appended without one.
    """
    # The listing currently has ~208 pages; only the first 10 are crawled here.
    end_index = 10
    first_write = True  # first successful row creates the file + header
    for page_num in range(1, end_index + 1):
        # Page 1 lives at index.html; later pages follow the list_23_%d pattern.
        list_url = index_url if page_num == 1 else raw_url % page_num
        for page_url in get_url(list_url):
            movie = get_detail(page_url)
            if not movie:
                # Skip failed/unparseable pages instead of writing blank rows
                # (the original wrote an empty dict, producing empty CSV rows).
                continue
            if first_write:
                save_to_file([movie], 'w', True)
                first_write = False
            else:
                save_to_file([movie], 'a', False)
def save_to_file(movie_list, mode, header):
    """Write a batch of movie dicts to ./result/Movie_Tian.csv.

    Parameters
    ----------
    movie_list : list[dict]
        Rows to write.
    mode : str
        File mode: 'w' to (re)create the file, 'a' to append.
    header : bool
        Whether to write the column header row.
    """
    # Create the output directory up front; otherwise to_csv raises
    # FileNotFoundError on a fresh checkout.
    os.makedirs('./result', exist_ok=True)
    df = pd.DataFrame(movie_list)
    # utf_8_sig adds a BOM so Excel opens the Chinese text correctly.
    df.to_csv('./result/Movie_Tian.csv', index=False, encoding='utf_8_sig', mode=mode, header=header)
# Script entry point: run the crawler only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    spider()
结果展示
注意事项
1、利用requests库发送请求时,为避免网络状况不佳导致程序崩溃,可以利用try语句来增强程序的健壮性。
2、利用XPath提取数据时,由于不能保证每个页面都完全一样,因此,应考虑提取不到数据的情况,避免出现列表索引越界的现象。
虐猫人薛定谔i 2020年2月2日 写于家中