# 电影天堂爬虫 (Movie Heaven / dytt8 crawler)

import requests
from lxml import etree

# Site root; relative detail-page hrefs scraped from list pages are joined onto it.
Basic_main = 'http://www.ygdy8.net'
# Browser-like User-Agent so the site does not reject the requests as a bot.
Headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
#解析总的页面数据
def parse_tatol_data(url):
    """Fetch one list page and return the absolute URLs of its movie detail pages.

    Args:
        url: Absolute URL of a list page (list_23_{page}.html).

    Returns:
        list[str]: Absolute detail-page URLs found in the page's results table.
    """
    response = requests.get(url, headers=Headers)
    # NOTE(review): the site serves GBK, so response.text may mojibake the
    # Chinese text — but the hrefs extracted below are plain ASCII, so the
    # URL extraction still works.
    text = response.text
    # etree.HTML builds an XPath-queryable tree and auto-repairs broken markup
    # (e.g. adds missing closing tags).
    html = etree.HTML(text)
    # Each movie row in the results table links to its detail page.
    part_detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # Join the site root onto each relative href. Return a real list rather
    # than a one-shot map iterator so callers can iterate it more than once.
    return [Basic_main + part for part in part_detail_urls]
#解析分支页面数据
def parse_branch_data(detail_url):
    """Fetch one movie detail page and scrape its metadata into a dict.

    Args:
        detail_url: Absolute URL of the movie's detail page.

    Returns:
        dict: Keys among 'title', 'cover', 'screenshot', 'year', 'place',
        'director', 'actors' (list of str), 'profile' and 'download_url' —
        whichever fields the page actually contains.
    """
    movie = {}
    response = requests.get(detail_url, headers=Headers)
    # The site is encoded in GBK; decode the raw bytes explicitly to avoid
    # the mojibake that response.text's guessed encoding would produce.
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    # xpath() returns a list; [0] unwraps the single match, text() gives the string.
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title

    # Cover image, screenshot, director and cast all live inside the #Zoom div,
    # so grab that element once and query relative to it.
    zoom = html.xpath("//div[@id='Zoom']")[0]

    # The leading dot makes the xpath relative to `zoom` instead of the document root.
    imgs = zoom.xpath(".//img/@src")
    # Guard against pages that ship fewer than two images (original code
    # would raise IndexError here).
    if imgs:
        movie['cover'] = imgs[0]
    if len(imgs) > 1:
        movie['screenshot'] = imgs[1]

    def parse_info(info, rule):
        # Strip the "◎..." label prefix and surrounding whitespace.
        return info.replace(rule, '').strip()

    infos = zoom.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith('◎年  代'):
            movie['year'] = parse_info(info, '◎年  代')
        elif info.startswith('◎产  地'):
            movie['place'] = parse_info(info, '◎产  地')
        elif info.startswith('◎导  演'):
            movie['director'] = parse_info(info, '◎导  演')
        elif info.startswith('◎主  演'):
            # The lead actor shares the label line; the rest follow one per line.
            actors = [parse_info(info, '◎主  演')]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith('◎标  签'):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith('◎简  介 '):
            # Collect every synopsis line up to (not including) the
            # download-link marker. The original stored only the last line
            # seen — which on break was the '【下载地址】' marker itself.
            profile_lines = []
            for x in range(index + 1, len(infos)):
                line = infos[x].strip()
                if line.startswith('【下载地址】'):
                    break
                profile_lines.append(line)
            movie['profile'] = ''.join(profile_lines)
    # The download link sits in the distinctly-colored table cell.
    download_url = html.xpath("//td[@bgcolor='#fdfddf']//a/@href")[0]
    movie['download_url'] = download_url
    return movie

#获取7页总的数据
def main():
    """Crawl the first seven list pages and collect every movie's metadata.

    Returns:
        list[dict]: One metadata dict per movie, as built by parse_branch_data.
        (The original built this list but never returned it.)
    """
    basic_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []

    # Pages 1..7 of the "latest movies" listing.
    for page in range(1, 8):
        url = basic_url.format(page)
        detail_urls = parse_tatol_data(url)
        # Scrape every detail page linked from this list page.
        for detail_url in detail_urls:
            movie = parse_branch_data(detail_url)
            movies.append(movie)
    return movies

# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()

# 猜你喜欢 (blog boilerplate — not part of the program)
# 转载自 (reposted from): blog.csdn.net/sdsc1314/article/details/89021452