豆瓣电影分类排行

 1 import requests
 2 # 导入lxml使用xpath提取数据
 3 from lxml import etree
 4 def douban_movies(m_type,nums):
 5     '''
 6     豆瓣电影排行榜爬取
 7     '''
 8 
 9     url = "https://movie.douban.com/j/chart/top_list?"+m_type+"&interval_id=100%3A90&action=&start=0&limit="+nums
10     headers = {
11         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
12     }
13     response = requests.get(url=url,headers=headers)
14     datas = response.json()
15     for data in datas:
16         movies_info = {}
17         movies_info ['image'] = data['cover_url']
18         movies_info ['types'] = data['types']
19         movies_info ['regions'] = data['regions']
20         movies_info ['title'] = data['title']
21         movies_info ['url'] = data['url']
22         movies_info ['release_date'] = data['release_date']
23         movies_info ['score'] = data['score']
24         movies_info ['actors'] = data['actors']
25 
26         with open('./'+key_word+'豆瓣电影分类排行榜爬取.csv','a+',encoding='utf-8')as f:
27             f.writelines(str(movies_info))
28 
29 
def get_type():
    """Scrape the Douban genre-ranking page and map genre names to URL fragments.

    Returns:
        dict: For every ``span`` found in the page's ``types`` block, the text
        of its first ``<a>`` element mapped to the second ``&``-separated
        piece of the first ``href`` attribute found under that span.

    Raises:
        IndexError: If a span has no ``href``/``<a>`` or the ``href`` contains
            no ``&``.
    """
    page_url = 'https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action='
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    }
    page = requests.get(url=page_url, headers=request_headers)
    tree = etree.HTML(page.text)

    genre_params = {}
    for span in tree.xpath("//div[@class='article']/div[2]/div[@class='types']/span"):
        # Keep only the second query-string piece of the link target.
        link_target = span.xpath(".//@href")[0]
        fragment = link_target.split('&')[1]
        anchor = span.xpath(".//a")[0]
        genre_params[anchor.text] = fragment

    return genre_params
48 
if __name__ == '__main__':
    # Analysis notes:
    #   The ranking page shows 20 entries at a time.
    #   The JSON requests issued by the page look like:
    #     https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=0&limit=20
    #     https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20
    #     https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=40&limit=20
    #   start is the starting offset
    #   limit is the number of entries returned
    #   type  is the genre
    key_word = input('请输入查询分类排行榜>>')
    nums = input('请输入查询数据数量>>')

    # Look up the genres the site currently offers.
    movies_type = get_type()

    if key_word not in movies_type:
        print('输入电影分类不存在!!!')
    else:
        # Known genre: run the scrape with its query-string fragment.
        m_type = movies_type[key_word]
        douban_movies(m_type, nums)

猜你喜欢

转载自www.cnblogs.com/lizhihoublog/p/12550223.html