# 用 Python 爬取豆瓣 Top250 电影数据,并绘制折线图和饼状图分析

from bs4 import BeautifulSoup
import requests
import re

def getMovieDetails(link, timeout=10):
    """Fetch one movie detail page and extract its main fields.

    Args:
        link: URL of the movie detail page.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A dict with the keys ``title``, ``year``, ``rating_num``, ``votes``,
        ``types`` and ``summary``. ``types`` is a list of genre strings; every
        other value is the text taken from the page. ``summary`` is an empty
        string when the page has no summary element.

    Raises:
        requests.RequestException: On a timeout, a connection failure or an
            HTTP error status.
        AttributeError: If the title, year, rating or vote-count element is
            missing from the page.
    """
    response = requests.get(link, timeout=timeout)
    # Surface 4xx/5xx responses here instead of failing later with an opaque
    # "'NoneType' object has no attribute 'text'" while parsing an error page.
    response.raise_for_status()
    # Decode explicitly as UTF-8, the same way the list page is decoded.
    response.encoding = 'utf-8'
    details = BeautifulSoup(response.text, "lxml")

    result = {}
    # Movie title.
    result['title'] = details.find(
        'span', attrs={'property': 'v:itemreviewed'}).text
    # Release year; the page wraps it in parentheses, which are removed.
    year = details.find('span', attrs={'class': 'year'}).text
    result['year'] = re.sub(r"[()]", "", year)
    # Rating score, kept as the page's text.
    result['rating_num'] = details.find(
        'strong', attrs={'class': 'll rating_num'}).text
    # Number of votes, kept as the page's text.
    result['votes'] = details.find('span', attrs={'property': 'v:votes'}).text
    # Genres: a movie may carry several, so collect all of them.
    result['types'] = [
        node.text
        for node in details.find_all('span', attrs={'property': 'v:genre'})
    ]
    # Summary: split()/join removes ALL whitespace (spaces, newlines,
    # indentation), not only the leading and trailing part.
    summary_node = details.find('span', attrs={'property': 'v:summary'})
    summary = summary_node.text if summary_node is not None else ""
    result['summary'] = "".join(summary.split())
    return result
def getMovies(url, min_rating_num, timeout=10):
    """Scrape one list page and fetch details for the movies that qualify.

    Args:
        url: URL of a single list page.
        min_rating_num: Minimum rating (inclusive) a movie must have for its
            detail page to be fetched.
        timeout: Seconds to wait for the list-page response before giving up.
            It applies to the list-page request made here only.

    Returns:
        A list with one ``getMovieDetails`` result per qualifying movie, in
        page order.

    Raises:
        requests.RequestException: On a timeout, a connection failure or an
            HTTP error status for the list page.
        IndexError: If a qualifying item lacks the expected title or link
            element, or any item lacks a rating element.
        ValueError: If a rating's text cannot be parsed as a float.
    """
    resultList = []

    res = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error rather than silently parsing an error page
    # that contains no '.item' entries and returning an empty list.
    res.raise_for_status()
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'lxml')

    for movies in soup.select('.item'):
        rating_num = float(movies.select('.rating_num')[0].text)
        if rating_num < min_rating_num:
            # Below the threshold: skip before doing any further extraction.
            continue
        movie_name = movies.select('.info .title')[0].text
        movie_href = movies.select('.info a')[0]['href']
        resultList.append(getMovieDetails(movie_href))
        # Progress output: printed once the detail page has been fetched.
        print(movie_name)
    return resultList
    
    

# Crawl the list page by page; the `start` query parameter advances in
# steps of 25 across the 10 pages requested below.
url = 'https://movie.douban.com/top250?start={}'
# Only movies rated at or above this value are kept.
min_rating_num = 8

movies_total = []

for i in range(10):
    newUrl = url.format(i * 25)
    print('正在抓取第',(i+1),'页,请稍后...')
    newsary = getMovies(newUrl, min_rating_num)
    movies_total += newsary

# Arrange the scraped records into a table, one row per movie.
import pandas
df = pandas.DataFrame(data=movies_total)
# Bare expression: presumably meant to display the table as the last line of
# a notebook cell; it has no visible effect when run as a plain script.
df

# 猜你喜欢
#
# 转载自 blog.csdn.net/you943047219/article/details/84961019
# 今日推荐