豆瓣电影TOP250爬取,并获得相关类型的推荐

import requests
import random
from bs4 import BeautifulSoup
import lxml
'''
https://movie.douban.com/top250
https://movie.douban.com/top250?start=25
https://movie.douban.com/top250?start=50&filter=
'''
# Request headers for movie.douban.com. Three variants are kept, differing
# only in the User-Agent string, and are collected in ``header_list``.
_DOUBAN_HOST = "movie.douban.com"

# Chrome 78 on Windows 10
_CHROME_UA = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
)
# Edge 18 (the User-Agent carries an "Edge/18.18362" token)
_EDGE_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362"
)
# Firefox 71 on Windows 10
_FIREFOX_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) "
    "Gecko/20100101 Firefox/71.0"
)

header1 = {'User-Agent': _CHROME_UA, 'Host': _DOUBAN_HOST}
header2 = {'User-Agent': _EDGE_UA, 'Host': _DOUBAN_HOST}
header3 = {'User-Agent': _FIREFOX_UA, 'Host': _DOUBAN_HOST}

header_list = [header1, header2, header3]
# Accumulators filled by the scraping loop below. Each one maps a movie title
# to its record dict (keys: 'name', 'director', 'type', 'time', 'score').
datas = {}      # every movie found on the ten list pages
comedy = {}     # comedy (喜剧)
love = {}       # romance (爱情)
sci_fi = {}     # science fiction (科幻)
thriller = {}   # thriller (惊悚)
crime = {}      # crime (犯罪)
animation = {}  # animation (动画)

# Genre keyword as it appears in the page text -> accumulator that collects it.
genre_buckets = {
    '喜剧': comedy,
    '爱情': love,
    '科幻': sci_fi,
    '惊悚': thriller,
    '犯罪': crime,
    '动画': animation,
}
# Only movies rated at least this high go into the per-genre accumulators.
MIN_RECOMMEND_SCORE = 9.0
# Seconds to wait for the server; without a timeout requests.get() can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10
DIRECTOR_LABEL = '导演:'

# The list is spread over ten pages of 25 entries, addressed by ?start=<offset>.
for page in range(10):
    if page == 0:
        url = "https://movie.douban.com/top250"
    else:
        url = 'https://movie.douban.com/top250?start=%d&filter=' % (page * 25)
    # Pick one of the prepared header sets at random for each request.
    header = random.choice(header_list)
    req = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error status instead of parsing an error page
    # and silently ending up with empty result tables.
    req.raise_for_status()
    bf = BeautifulSoup(req.text, 'lxml')
    for item in bf.find_all('div', class_='info'):
        movie_name = item.find('a').find('span').string
        score_str = item.find('div', class_='star').find('span', class_='rating_num').string
        score = float(score_str)

        # The first <p> inside div.bd is read as text fragments separated by
        # its <br/>: presumably a credits line ("导演: ... 主演: ...") followed
        # by a "year / country / genres" line - verify against the live page.
        info_lines = list(item.find('div', class_='bd').find('p').stripped_strings)
        if len(info_lines) < 2:
            # Unexpected layout: skip this entry rather than store wrong fields.
            continue
        # Drop plain spaces so multi-word names and genres stay in one token,
        # then split on the remaining whitespace (non-breaking spaces).
        credit_tokens = info_lines[0].replace(' ', '').split()
        meta_tokens = info_lines[-1].replace(' ', '').split()
        if not credit_tokens or not meta_tokens:
            continue

        director = credit_tokens[0]
        if director.startswith(DIRECTOR_LABEL):
            director = director[len(DIRECTOR_LABEL):]
        # Year and genres are taken from the second line itself, so a missing
        # cast segment on the credits line cannot shift them.
        year = meta_tokens[0]
        genre = meta_tokens[-1]

        data = {
            'name': movie_name,
            'director': director,
            'type': genre,
            'time': year,
            'score': score,
        }
        datas[movie_name] = data
        if score >= MIN_RECOMMEND_SCORE:
            # A movie can carry several genres and is added to every match.
            for keyword, bucket in genre_buckets.items():
                if keyword in genre:
                    bucket[movie_name] = data
def _print_ranking(heading, movies, limit=10):
    """Print the best-rated entries of ``movies`` as a two-column table.

    Args:
        heading: Title printed above the table; ten '>' characters are
            appended to it.
        movies: Dict mapping a movie title to its record dict. Each record
            must provide the keys 'name' and 'score'.
        limit: Maximum number of rows to print.

    Returns:
        The full list of ``(title, record)`` pairs sorted by descending
        score (not truncated to ``limit``).
    """
    print(heading + '>' * 10)
    ranked = sorted(movies.items(), key=lambda pair: pair[1]['score'], reverse=True)
    # chr(12288) is the full-width (ideographic) space, used as the fill
    # character so that columns of CJK text line up.
    pad = chr(12288)
    row = "{0:{2}^10}\t\t\t{1:{2}<10}"
    print(row.format("电影名称", "评分", pad))
    for _, movie in ranked[:limit]:
        print(row.format(movie["name"], movie["score"], pad))
    print()
    return ranked


# Each name is rebound from its dict to the sorted list of (title, record)
# pairs, so the ordering stays available after printing.

# Overall top-rated movies
datas = _print_ranking("豆瓣评分最高", datas)

# Comedy
comedy = _print_ranking("喜剧电影推荐", comedy)

# Romance
love = _print_ranking("爱情电影推荐", love)

# Science fiction
sci_fi = _print_ranking("科幻电影推荐", sci_fi)

# Thriller
thriller = _print_ranking("惊悚电影推荐", thriller)

# Crime
crime = _print_ranking("犯罪电影推荐", crime)

# Animation
animation = _print_ranking("动画电影推荐", animation)
发布了84 篇原创文章 · 获赞 10 · 访问量 8567

猜你喜欢

转载自blog.csdn.net/AK47red/article/details/104152050