版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
来张图片压压惊!!!
爬取电影天堂的电影信息。
import json

import requests
from lxml import etree
# Site root; prepended to the relative detail-page links returned by the
# list pages.
BASE = 'https://www.dytt8.net'
# Browser-like request headers. The Referer points at a list page —
# presumably to look like in-site navigation / avoid anti-scraping
# checks; TODO confirm whether the site actually requires it.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    'Referer': 'https://www.dytt8.net/html/gndy/dyzz/list_23_2.html'
}
def get_urls(url):
    """Fetch one paginated movie-list page and return absolute detail URLs.

    Args:
        url: URL of a list page (e.g. .../list_23_2.html).

    Returns:
        list[str]: absolute URLs of each movie's detail page.
    """
    response = requests.get(url, headers=HEADERS)
    # The site serves GBK-encoded pages; decode explicitly instead of
    # trusting response.text's charset guess (consistent with parse_page).
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # Relative hrefs inside each movie's table row.
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # Return a concrete list: the original returned a lazy map() object,
    # which can only be iterated once.
    return [BASE + href for href in hrefs]
def parse_page(url):
    """Fetch one movie detail page and extract its metadata.

    Args:
        url: absolute URL of a movie detail page on dytt8.net.

    Returns:
        dict: movie metadata (title, cover, year, country, category,
        douban_rating, duration, director, actors, profile,
        download_url). A key is present only when the corresponding
        field was found on the page.
    """
    movie = {}
    response = requests.get(url, headers=HEADERS)
    # Detail pages are GBK-encoded; decode explicitly.
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)

    # The title sits in the colored <font> of the page header.
    titles = html.xpath("//div[@class='title_all']//font[@color='#07519a']//text()")
    if titles:  # guard: the original raised IndexError on pages missing it
        movie['title'] = titles[0]

    zooms = html.xpath('//div[@id="Zoom"]')
    if not zooms:
        # No content area at all -- return whatever was collected so far.
        return movie
    zoom = zooms[0]

    imgs = zoom.xpath(".//img/@src")
    if imgs:
        movie['cover'] = imgs[0]  # first image is the poster

    _parse_info_lines(zoom.xpath(".//text()"), movie)

    download_urls = html.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
    if download_urls:  # guard against pages without a download link cell
        movie["download_url"] = download_urls[0]
    return movie


def _strip_label(info, label):
    """Drop the ◎ field-label prefix and surrounding whitespace."""
    return info.replace(label, "").strip()


def _collect_until_next_field(infos, index):
    """Collect stripped lines after ``infos[index]`` up to (not including)
    the next ◎-labelled field line."""
    lines = []
    for i in range(index + 1, len(infos)):
        line = infos[i].strip()
        if line.startswith("◎"):
            break
        lines.append(line)
    return lines


def _parse_info_lines(infos, movie):
    """Scan the ◎-labelled text lines of the Zoom div and fill *movie*."""
    # Single-line fields: (label prefix, movie key).
    simple_fields = (
        ("◎年 代", 'year'),
        ("◎产 地", 'country'),
        ("◎类 别", 'category'),
        ("◎豆瓣评分", 'douban_rating'),
        ("◎片 长", 'duration'),
        ("◎导 演", 'director'),
    )
    for index, info in enumerate(infos):
        matched = False
        for label, key in simple_fields:
            if info.startswith(label):
                movie[key] = _strip_label(info, label)
                matched = True
                break
        if matched:
            continue
        if info.startswith("◎主 演"):
            # Actors span multiple lines: the remainder of the label line
            # plus every following line until the next ◎ field.
            actors = [_strip_label(info, "◎主 演")]
            actors.extend(_collect_until_next_field(infos, index))
            movie["actors"] = actors
        elif info.startswith("◎简 介"):
            # Bug fix: the original loop overwrote movie["profile"] on
            # every following line and never stopped at the next ◎ field,
            # so it ended up storing the page's LAST text node instead of
            # the synopsis. Collect and join the synopsis lines instead.
            movie["profile"] = "\n".join(
                _collect_until_next_field(infos, index)
            )
def spider():
    """Crawl the first 7 list pages and dump movie metadata to disk.

    Writes one JSON object per line (JSON Lines) to the output file.
    Bug fix: the original wrote ``str(dict)`` -- Python repr with single
    quotes -- into a ``.json`` file, which no JSON parser can read.
    """
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for page in range(1, 8):  # list pages 1..7
        list_url = base_url.format(page)
        for detail_url in get_urls(list_url):
            movies.append(parse_page(detail_url))
    with open('D:/pachong/天堂movie.json', 'w', encoding='utf-8') as f:
        for movie in movies:
            # ensure_ascii=False keeps Chinese text human-readable.
            f.write(json.dumps(movie, ensure_ascii=False) + '\n')
    print("下载完成")


if __name__ == "__main__":
    spider()