电影天堂爬取详情页

爬取电影天堂最新电影,地址https://www.dytt8.net/html/gndy/dyzz/list_23_1.html

import re
from urllib.parse import urljoin

import requests
from lxml import etree

 4 BASE_DOMAIN = 'https://www.dytt8.net'
 5 HEADERS = {
 6     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
 7 }
 8 
 9 
def get_detail_urls(url, timeout=10):
    """Collect the absolute detail-page URLs linked from one listing page.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before requests gives up,
            so a stalled connection cannot hang the crawl indefinitely.

    Returns:
        A list of absolute URLs, in the order the links appear on the page.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Decode as GBK; bytes that do not decode are dropped instead of raising.
    text = response.content.decode(encoding='gbk', errors='ignore')
    html = etree.HTML(text)
    hrefs = html.xpath('//table[@class="tbspan"]//a/@href')
    # urljoin copes with hrefs that are already absolute or that lack a
    # leading slash, where plain string concatenation would build a bad URL.
    return [urljoin(BASE_DOMAIN, href) for href in hrefs]


# Single-line "◎label value" fields and the result key each one is stored under.
# \s* between the characters tolerates any padding inside the label, whether
# ASCII or full-width spaces.
_FIELD_LABELS = (
    (re.compile(r'◎\s*年\s*代'), 'year'),
    (re.compile(r'◎\s*产\s*地'), 'country'),
    (re.compile(r'◎\s*类\s*别'), 'category'),
    (re.compile(r'◎\s*豆\s*瓣\s*评\s*分'), 'douban_rating'),
    (re.compile(r'◎\s*片\s*长'), 'duration'),
    (re.compile(r'◎\s*导\s*演'), 'director'),
)
# The cast label; its value continues over the following unlabelled lines.
_ACTORS_LABEL = re.compile(r'◎\s*主\s*演')


def _first(items, index=0):
    """Return items[index], or None when the list is too short."""
    return items[index] if len(items) > index else None


def _parse_info_lines(infos):
    """Turn the "◎label value" text lines of a detail page into a dict."""
    fields = {}
    actors = None  # the cast list while we are inside the cast block, else None
    for info in infos:
        line = info.strip()
        if not line:
            continue
        if not line.startswith('◎'):
            # An unlabelled line only matters as a continuation of the cast.
            if actors is not None:
                actors.append(line)
            continue

        # Any new "◎" label closes a cast block that was being collected.
        actors = None
        found = _ACTORS_LABEL.match(line)
        if found:
            actors = [line[found.end():].strip()]
            # Stored right away so the cast is kept even when it is the
            # last section on the page.
            fields['actors'] = actors
            continue
        for pattern, key in _FIELD_LABELS:
            found = pattern.match(line)
            if found:
                fields[key] = line[found.end():].strip()
                break
    return fields


def get_detail_info(url, timeout=10):
    """Download one movie detail page and extract its data.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        A dict with the keys 'title', 'cover', 'screenshot', 'download' and
        'magnet', plus whichever of 'year', 'country', 'category',
        'douban_rating', 'duration', 'director' and 'actors' are present on
        the page. 'title', 'cover', 'screenshot' and 'magnet' are None when
        the page lacks the corresponding element; 'download' is a list.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Decode as GBK; bytes that do not decode are dropped instead of raising.
    text = response.content.decode(encoding='gbk', errors='ignore')
    html = etree.HTML(text)

    movie = {}
    movie['title'] = _first(html.xpath('//div[@class="title_all"]//font/text()'))
    # The first image in the content area is stored as the cover, the second
    # as the screenshot.
    img = html.xpath("//div[@id='Zoom']//img/@src")
    movie['cover'] = _first(img, 0)
    movie['screenshot'] = _first(img, 1)

    movie.update(_parse_info_lines(html.xpath('//div[@id="Zoom"]//p/text()')))

    movie['download'] = html.xpath("//div[@id='Zoom']//tbody//a/text()")
    # 'magnet' holds the href of the first link inside the content area.
    movie['magnet'] = _first(html.xpath("//div[@id='Zoom']//a/@href"))
    return movie


def spider(start_page=1, end_page=1):
    """Crawl listing pages and print the data of every movie found.

    Args:
        start_page: Number of the first listing page to crawl.
        end_page: Number of the last listing page to crawl (inclusive).
            The defaults crawl page 1 only.

    Returns:
        The list of movie dicts that was printed.
    """
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    for page in range(start_page, end_page + 1):
        # Visit every detail page linked from this listing page.
        for detail_url in get_detail_urls(base_url.format(page)):
            movies.append(get_detail_info(detail_url))
    print(movies)
    return movies


if __name__ == '__main__':
    spider()

学习的视频中代码有几处跟我的有不同,可以学习

一、

其中提取主演的代码不同,如下

for index, info in enumerate(infos):
    if info.startswith("◎年  代"):
        info = parse_info(info, "◎年  代")
        movie['year'] = info
    # ... (the other single-line fields are omitted here)
    elif info.startswith("◎主  演"):
        info = parse_info(info, "◎主  演")
        actors = [info]
        # The cast continues on the following lines; scan ahead by index
        # until the next "◎" label starts a new field.
        for x in range(index + 1, len(infos)):
            actor = infos[x].strip()
            # Testing against "" is always true and would stop the scan at
            # once, so the "◎" marker is the terminator.
            if actor.startswith("◎"):
                break
            actors.append(actor)
        movie['actors'] = actors

采用的是index的方式.

二、

还有

detail_urls = map(lambda url:BASE_DOMAIN+url,detail_urls)

这段代码没怎么用过.记录下.

三、

base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    for i in range(1, 10):
        url = base_url.format(i)

以前写的时候没这么写过,都是直接弄成

url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_'+str(i)+'.html'这样(i是整数,拼接前要先用str()转成字符串,否则会报TypeError).

四、

在自己写代码时有个错误,也需要记录下,在movie的字典赋值的时候,

如下

    for info in infos:
        # print(info)
        if info.startswith('◎年  代'):
            year = info.replace("◎年  代", "").strip()
        elif info.startswith('◎产  地'):
            country = info.replace("◎产  地", "").strip()
        elif info.startswith('◎类  别'):
            category = info.replace("◎类  别", "").strip()
        elif info.startswith('◎豆瓣评分'):
            douban_rating = info.replace("◎豆瓣评分", "").strip()

赋值的时候使用

 movie = {
        'year': year,
        'country': country,
        'category': category,
        'douban_rating': douban_rating
    }

会报错,因为有的电影详情页没有豆瓣评分这一行,douban_rating就不会被赋值,所以构造movie字典引用它的时候会出错.

UnboundLocalError: local variable 'douban_rating' referenced before assignment

 当然,个人觉得用正则可以更容易解决。

再记录下用xpath爬取豆瓣正在上映电影的代码

import requests
from lxml import etree

# Headers sent with the request: a Douban Referer and a desktop-browser User-Agent.
headers = {
    'Referer': 'https://movie.douban.com/',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}
url = "https://movie.douban.com/cinema/nowplaying/changsha/"

# Download the page and parse it into an element tree.
page = requests.get(url, headers=headers)
tree = etree.HTML(page.text)

# Result key -> XPath, relative to one movie <li>, that yields its value.
field_xpaths = {
    'title': "@data-title",
    'score': "@data-score",
    'duration': "@data-duration",
    'director': "@data-director",
    'actor': "@data-actors",
    'img': ".//img/@src",
}

# One dict per <li> in the "nowplaying" list; the first XPath hit is taken
# for every field.
movies = [
    {key: li.xpath(path)[0] for key, path in field_xpaths.items()}
    for li in tree.xpath("//div[@id='nowplaying']//ul[@class='lists']/li")
]

print(movies)

猜你喜欢

转载自www.cnblogs.com/weiwei2016/p/10411041.html