import csv
import os

import requests
# lxml supplies the XPath queries used to extract data from the HTML page
from lxml import etree

# User-Agent header sent with every request (a desktop Firefox string).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 10

# (output column, key in the JSON record) pairs, in output column order.
_FIELDS = (
    ('image', 'cover_url'),
    ('types', 'types'),
    ('regions', 'regions'),
    ('title', 'title'),
    ('url', 'url'),
    ('release_date', 'release_date'),
    ('score', 'score'),
    ('actors', 'actors'),
)


def _movie_row(data):
    """Convert one JSON movie record into a flat dict keyed by output column."""
    row = {}
    for column, source_key in _FIELDS:
        value = data.get(source_key, '')
        # List-valued fields are flattened into a single '/'-separated cell.
        if isinstance(value, (list, tuple)):
            value = '/'.join(str(item) for item in value)
        row[column] = value
    return row


def douban_movies(m_type, nums, key_word=''):
    """Fetch a Douban category chart and append its entries to a CSV file.

    The rows are appended to ``./<key_word>豆瓣电影分类排行榜爬取.csv``; a header
    row is written first when that file does not exist yet or is empty.

    Args:
        m_type: Query-string fragment selecting the category (e.g.
            ``"type=11"``), as returned in the values of :func:`get_type`.
        nums: Number of entries to request (the ``limit`` query parameter);
            an int or a numeric string.
        key_word: Category name used as the prefix of the output file name.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    url = (
        "https://movie.douban.com/j/chart/top_list?" + m_type
        + "&interval_id=100%3A90&action=&start=0&limit=" + str(nums)
    )
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    datas = response.json()

    path = './' + key_word + '豆瓣电影分类排行榜爬取.csv'
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0

    # Open the file once for the whole batch; newline='' lets the csv module
    # control line endings itself.
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=[column for column, _ in _FIELDS])
        if write_header:
            writer.writeheader()
        for data in datas:
            writer.writerow(_movie_row(data))


def get_type():
    """Scrape the category list from the Douban type-rank page.

    Returns:
        dict: Maps each category's link text to the second ``&``-separated
        fragment of that link's href (e.g. ``"type=11"``). Spans without a
        link, or whose href has fewer than two fragments, are skipped.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
    """
    movies_type = {}
    url = 'https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action='
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    douban_html = etree.HTML(response.text)
    spans = douban_html.xpath("//div[@class='article']/div[2]/div[@class='types']/span")

    for span in spans:
        hrefs = span.xpath(".//@href")
        anchors = span.xpath(".//a")
        if not hrefs or not anchors:
            continue
        fragments = hrefs[0].split('&')
        if len(fragments) < 2:
            continue
        movies_type[anchors[0].text] = fragments[1]

    return movies_type


def main():
    """Prompt for a category and an entry count, then scrape that chart."""
    # Analysis of the site:
    #   The page shows 20 entries at a time.
    #   The JSON request URLs look like:
    #     https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=0&limit=20
    #     https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20
    #     https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=40&limit=20
    #   start is the offset of the first entry
    #   limit is the number of entries returned
    #   type  is the movie category
    key_word = input('请输入查询分类排行榜>>').strip()
    nums = input('请输入查询数据数量>>').strip()
    # Fetch the available categories
    movies_type = get_type()
    if key_word in movies_type:
        # Run the scrape for the chosen category
        douban_movies(movies_type[key_word], nums, key_word)
    else:
        print('输入电影分类不存在!!!')


if __name__ == '__main__':
    main()
豆瓣电影分类排行
猜你喜欢
转载自www.cnblogs.com/lizhihoublog/p/12550223.html
今日推荐
周排行