python数据分析实战一:IMDB Top 250

This article deals with the top 250 movies in IMDB, including data scraping, data preparation, data cleaning, data analysis and visualization.

Data scraping

First, we need to scrape the data from the IMDb Top 250 chart page (https://www.imdb.com/chart/top/) and from the page of each listed movie.

# import package
import pandas as pd
import time
import urllib.request
from lxml.html import fromstring
from bs4 import BeautifulSoup

# download html
def download(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request carries a desktop-browser ``User-agent`` header so that it
    looks like an ordinary browser visit.

    Raises whatever ``urllib.request.urlopen`` raises when the request fails,
    and ``UnicodeDecodeError`` when the body is not valid UTF-8.
    """
    print('Downloading:', url)
    request = urllib.request.Request(url)
    # masquerade as a regular browser
    request.add_header('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36')
    # The context manager closes the HTTP response (and its socket) even if
    # reading or decoding fails.
    with urllib.request.urlopen(request) as resp:
        return resp.read().decode('utf-8')


# content to be scraped: one independent, initially empty list per output column
(Name, Year, Rate, Level, Directors, Writers, Stars,
 Genres, Runtime, Country, Language,
 Budget, Box_Office_USA, Box_Office_World) = ([] for _ in range(14))

# Fetch the chart page once; NOTE: despite its name, start_url holds the page HTML
start_url = download('https://www.imdb.com/chart/top/?ref_=nv_mv_250')
# prefix for the relative links found on the chart page
domain = 'https://www.imdb.com/'
# Name the parser explicitly: without it BeautifulSoup emits a warning and
# picks whichever parser happens to be installed. lxml is already a
# dependency of this script (lxml.html is imported above).
start_soup = BeautifulSoup(start_url, 'lxml')

# scrape every item
def _linked_names(soup, label, strip=False):
    """Return the '/'-joined text of every link in the block introduced by *label*.

    *label* is the exact text node to look for (e.g. 'Stars:'); the links are
    taken from the grandparent element of that text node. With ``strip=True``
    each link text is stripped of surrounding whitespace first.

    Raises AttributeError when *label* does not occur in *soup*.
    """
    block = soup.find(text=label).parent.parent
    names = [link.get_text() for link in block.find_all('a')]
    if strip:
        names = [n.strip() for n in names]
    return '/'.join(names)


def _labelled_value(soup, label):
    """Return the stripped text that follows *label*, or None.

    None is returned when the lookup fails with AttributeError, e.g. because
    *label* does not occur in *soup*.
    """
    try:
        return soup.find(text=label).parent.next_sibling.strip()
    except AttributeError:
        return None


# The chart page does not change during the run, so collect its links once
# instead of re-searching the whole document on each of the 250 iterations.
chart_links = start_soup.find_all('tbody')[0].find_all('a')

for k in range(250):
    # index 2*k+1: presumably each chart row holds two links and the second
    # one is the title link — TODO confirm against the page markup
    sub_html = chart_links[2*k+1].get('href')
    url = download(domain + sub_html)  # NOTE: holds the page HTML, not a URL
    time.sleep(3)  # pause between consecutive requests
    # The same page is parsed twice: lxml for the XPath lookups, BeautifulSoup
    # (with the parser named explicitly) for the text-label lookups.
    tree = fromstring(url)
    soup = BeautifulSoup(url, 'lxml')

    # The title is the text node just before the year span.
    name = soup.find('span', {'id': 'titleYear'}).previous_sibling
    # strip() removes only surrounding whitespace (including a trailing
    # non-breaking space); replacing name[-1] everywhere could also delete
    # that character from inside the title.
    Name.append(name.strip())
    Year.append(tree.xpath('//*[@id="titleYear"]/a')[0].text_content())
    Rate.append(tree.xpath('//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[1]/div[1]/div[1]/strong/span')[0].text_content())
    Level.append(soup.find('div', {'class': 'subtext'}).span.previous_sibling.strip())

    # Try the singular label first, then fall back to the plural one.
    try:
        Directors.append(soup.find(text='Director:').parent.parent.find('a').get_text())
    except AttributeError:
        Directors.append(_linked_names(soup, 'Directors:'))
    # Try the plural label first, then fall back to the singular one.
    try:
        Writers.append(_linked_names(soup, 'Writers:'))
    except AttributeError:
        Writers.append(soup.find(text='Writer:').parent.parent.find('a').get_text())

    Stars.append(_linked_names(soup, 'Stars:'))
    Genres.append(_linked_names(soup, 'Genres:', strip=True))
    Runtime.append(soup.find(text='Runtime:').parent.parent.time.get_text())
    Country.append(_linked_names(soup, 'Country:'))
    Language.append(_linked_names(soup, 'Language:'))

    # Money figures are optional; None is stored when a label is not found.
    Budget.append(_labelled_value(soup, 'Budget:'))
    Box_Office_USA.append(_labelled_value(soup, 'Gross USA:'))
    Box_Office_World.append(_labelled_value(soup, 'Cumulative Worldwide Gross:'))

    
# combine each column
# Build the table in one step from a column-name -> values mapping (the dict
# order is the column order). pandas raises ValueError if the lists differ in
# length, so an incomplete scrape fails loudly instead of being padded.
movie_data = pd.DataFrame({
    'Name': Name,
    'Year': Year,
    'Rate': Rate,
    'Level': Level,
    'Directors': Directors,
    'Writers': Writers,
    'Stars': Stars,
    'Genres': Genres,
    'Runtime': Runtime,
    'Country': Country,
    'Language': Language,
    'Budget': Budget,
    'Box_Office_USA': Box_Office_USA,
    'Box_Office_World': Box_Office_World,
})


# output
# NOTE: machine-specific location — change this path before running
outputpath = 'c:/Users/zxw/Desktop/修身/与自己/数据分析/数据分析/爬虫/movie.csv'
# Comma separator and a header row are the to_csv defaults; the row index is
# left out and the file is written with the utf_8_sig codec (UTF-8 with a BOM).
movie_data.to_csv(outputpath, encoding='utf_8_sig', index=False)

Data preparation

Then, we need to import packages and load the data.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
# load the scraped table; 'movie.csv' is resolved against the current working directory
movie_data = pd.read_csv('movie.csv')
# preview the first rows
movie_data.head()
Name Year Rate Level Directors Writers Stars Genres Runtime Country Language Budget Box_Office_USA Box_Office_World
0 The Shawshank Redemption 1994 9.3 R Frank Darabont Stephen King/Frank Darabont Tim Robbins/Morgan Freeman/Bob Gunton/See full... Drama 142 min USA English $25,000,000 $28,699,976 $28,815,291
1 The Godfather 1972 9.2 NaN Francis Ford Coppola Mario Puzo/Francis Ford Coppola/1 more credit Marlon Brando/Al Pacino/James Caan/See full ca... Crime/Drama 175 min USA English/Italian/Latin $6,000,000 $134,966,411 $246,120,974
2 The Godfather: Part II 1974 9.0 NaN Francis Ford Coppola Francis Ford Coppola/Mario Puzo/1 more credit Al Pacino/Robert De Niro/Robert Duvall/See ful... Crime/Drama 202 min USA English/Italian/Spanish/Latin/Sicilian $13,000,000 $47,834,595 $48,035,783
3 The Dark Knight 2008 9.0 PG-13 Christopher Nolan Jonathan Nolan/Christopher Nolan/3 more credits Christian Bale/Heath Ledger/Aaron Eckhart/See ... Action/Crime/Drama/Thriller 152 min USA/UK English/Mandarin $185,000,000 $535,234,033 $1,005,456,758
4 12 Angry Men 1957 8.9 NaN Sidney Lumet Reginald Rose/Reginald Rose Henry Fonda/Lee J. Cobb/Martin Balsam/See full... Crime/Drama 96 min USA English $350,000 NaN $576

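As we can see, the raw data is messy (missing values, extra text such as "1 more credit" and "See full cast & crew", and numbers stored as strings), so we need to clean it before analysis.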

Data cleaning

Missing value

# number of missing values in each column
movie_data.isnull().sum()
Name                  0
Year                  0
Rate                  0
Level               104
Directors             0
Writers               0
Stars                 0
Genres                0
Runtime               0
Country               0
Language              0
Budget               23
Box_Office_USA       37
Box_Office_World     10
dtype: int64
# fill na with 'Not Known'
# Assign the filled column back to the frame: calling fillna(inplace=True) on
# a column selection is chained assignment, which pandas warns about and which
# does not update movie_data when Copy-on-Write is enabled.
movie_data['Level'] = movie_data['Level'].fillna('Not Known')
movie_data['Directors'] = movie_data['Directors'].fillna('Not Known')

Data wrangling

# delete redundant information in 'Writers'
def replace_1(x):
    """Return *x* without any '/N more credit' or '/N more credits' part.

    *x* is a '/'-joined string of writer names. The optional trailing 's' is
    part of the pattern so that the plural form is removed completely rather
    than leaving a stray 's' on the last name. Strings without such a part
    are returned unchanged.
    """
    return re.sub(r'/[0-9]+ more credits?', '', x)
# run replace_1 on every value of the 'Writers' column
movie_data['Writers'] = movie_data['Writers'].apply(replace_1)
# delete redundant information in 'Stars'
def replace_2(x):
    """Return *x* without the '/See full cast & crew' part.

    *x* is a '/'-joined string of actor names; strings that do not contain
    the marker are returned unchanged.
    """
    # '&' has no special meaning in a regular expression, so it needs no
    # backslash; the raw string avoids invalid-escape warnings altogether.
    return re.sub(r'/See full cast & crew', '', x)
# run replace_2 on every value of the 'Stars' column
movie_data['Stars'] = movie_data['Stars'].apply(replace_2)
    
# extract runtime
def extract_runtime(x):
    """Return the first run of ASCII digits found in *x* as an int.

    Raises AttributeError when *x* contains no digit at all.
    """
    first_number = re.search('[0-9]+', x)
    return int(first_number.group())
# replace each 'Runtime' string with the integer produced by extract_runtime
movie_data['Runtime'] = movie_data['Runtime'].apply(extract_runtime)
# extract money (for simplicity, amounts in currencies other than the dollar are ignored)
def extract_number(x):
    """Convert a dollar amount such as '$25,000,000' to a float.

    Returns None when *x* is not a string (missing values arrive as NaN or
    None), when it does not start with '$', or when it contains no digit.
    Only the digits are kept, so thousands separators are dropped.
    """
    if not isinstance(x, str) or not x.startswith('$'):
        return None
    digits = ''.join(re.findall('[0-9]', x))
    # float('') raises ValueError, so a bare '$' is treated as missing
    return float(digits) if digits else None
# convert the three money columns with extract_number, one after another
for money_column in ('Budget', 'Box_Office_USA', 'Box_Office_World'):
    movie_data[money_column] = movie_data[money_column].apply(extract_number)
# preview the cleaned table
movie_data.head()
Name Year Rate Level Directors Writers Stars Genres Runtime Country Language Budget Box_Office_USA Box_Office_World
0 The Shawshank Redemption 1994 9.3 R Frank Darabont Stephen King/Frank Darabont Tim Robbins/Morgan Freeman/Bob Gunton Drama 142 USA English 25000000.0 28699976.0 2.881529e+07
1 The Godfather 1972 9.2 Not Known Francis Ford Coppola Mario Puzo/Francis Ford Coppola Marlon Brando/Al Pacino/James Caan Crime/Drama 175 USA English/Italian/Latin 6000000.0 134966411.0 2.461210e+08
2 The Godfather: Part II 1974 9.0 Not Known Francis Ford Coppola Francis Ford Coppola/Mario Puzo Al Pacino/Robert De Niro/Robert Duvall Crime/Drama 202 USA English/Italian/Spanish/Latin/Sicilian 13000000.0 47834595.0 4.803578e+07
3 The Dark Knight 2008 9.0 PG-13 Christopher Nolan Jonathan Nolan/Christopher Nolans Christian Bale/Heath Ledger/Aaron Eckhart Action/Crime/Drama/Thriller 152 USA/UK English/Mandarin 185000000.0 535234033.0 1.005457e+09
4 12 Angry Men 1957 8.9 Not Known Sidney Lumet Reginald Rose/Reginald Rose Henry Fonda/Lee J. Cobb/Martin Balsam Crime/Drama 96 USA English 350000.0 NaN 5.760000e+02

Now that the data is clean, we can analyze and visualize it.

Data analysis and visualization

Number of top250 movies every year

# value_counts() returns a Series (index = year, values = number of movies);
# a Series has no column labels, so none are assigned.
year_counts = movie_data['Year'].value_counts()
plt.figure(figsize=(15, 6.5))
# sort by the index so the line runs in year order rather than count order
year_counts.sort_index().plot.line(title='Number of top250 movies every year')

There are three peaks during the years.

The distribution of Rate

def get_histgram(x, n):
    """Plot a histogram of column *x* of the global ``movie_data`` with *n* bins."""
    chart_title = 'The histgram of {}'.format(x)
    column = movie_data[x]
    column.plot.hist(bins=n, title=chart_title, figsize=(15, 6.5))


# histogram of 'Rate' with 15 bins
get_histgram('Rate', 15)

As we can see, it is right skewed.

Percent of each level

# value_counts() returns a Series (index = level, values = number of movies);
# a Series has no column labels, so none are assigned.
level_counts = movie_data['Level'].value_counts()
level_counts.plot.pie(figsize=(8, 8), title='Pie chart of the Level', legend=True)

Top 10 directors

def get_bar_chart(x):
    """Plot a bar chart of the ten most frequent entries in column *x*.

    Every cell of the global ``movie_data[x]`` is expected to be a
    '/'-joined string (the scraper joins multiple values with '/'); each
    part is counted separately.
    """
    # The cells are split on '/' directly; no separate splitting helper is
    # defined in this file.
    items = [
        item.replace(" ", "")  # all spaces are removed, so 'Hong Kong' is counted as 'HongKong'
        for cell in movie_data[x]
        for item in cell.split('/')
    ]
    counts = pd.Series(items).value_counts()
    counts.head(10).plot.bar(figsize=(15, 6.5), title='Top 10 {}'.format(x))


get_bar_chart('Directors')

Top 10 Writers

# bar chart of the ten most frequent entries in the 'Writers' column
get_bar_chart('Writers')

Top 10 Stars

# bar chart of the ten most frequent entries in the 'Stars' column
get_bar_chart('Stars')

The distribution of runtime

# histogram of 'Runtime' with 20 bins
get_histgram('Runtime',20)

Percent of each genre

def get_pie_chart(x):
    """Plot a pie chart of how often each entry occurs in column *x*.

    Every cell of the global ``movie_data[x]`` is expected to be a
    '/'-joined string (the scraper joins multiple values with '/'); each
    part is counted separately.
    """
    # The cells are split on '/' directly; no separate splitting helper is
    # defined in this file.
    items = [
        item.replace(" ", "")  # all spaces are removed, so 'Hong Kong' is counted as 'HongKong'
        for cell in movie_data[x]
        for item in cell.split('/')
    ]
    # value_counts() returns a Series, which has no column labels to set
    item_counts = pd.Series(items).value_counts()
    item_counts.plot.pie(figsize=(10, 10), title='Pie chart of the {}'.format(x), legend=True)


get_pie_chart('Genres')

Percent of each country

# pie chart of the entry frequencies in the 'Country' column
get_pie_chart('Country')

Percent of each language

# pie chart of the entry frequencies in the 'Language' column
get_pie_chart('Language')

The distribution of budget

# histogram of 'Budget' with 20 bins
get_histgram('Budget',20)

The distribution of Box_Office_USA

# histogram of 'Box_Office_USA' with 20 bins
get_histgram('Box_Office_USA',20)

The distribution of Box_Office_World

# histogram of 'Box_Office_World' with 20 bins
get_histgram('Box_Office_World',20)

Correlation among quantitative features

# Correlate only the numeric columns: on current pandas, corr() raises when
# the frame still contains text columns such as 'Name' or 'Genres'.
movie_data.select_dtypes(include='number').corr()
Year Rate Runtime Budget Box_Office_USA Box_Office_World
Year 1.000000 0.021213 0.160196 0.478785 0.312610 0.391026
Rate 0.021213 1.000000 0.244455 0.114680 0.215052 0.208516
Runtime 0.160196 0.244455 1.000000 0.178348 0.138281 0.131039
Budget 0.478785 0.114680 0.178348 1.000000 0.790742 0.841518
Box_Office_USA 0.312610 0.215052 0.138281 0.790742 1.000000 0.949837
Box_Office_World 0.391026 0.208516 0.131039 0.841518 0.949837 1.000000

Top 5 movies by budget

# Name and Budget of the first rows after sorting by Budget, largest first
movie_data[['Name','Budget']].sort_values(by = 'Budget',ascending = False).head()
Name Budget
73 Avengers: Endgame 356000000.0
63 Avengers: Infinity War 321000000.0
70 The Dark Knight Rises 250000000.0
110 Toy Story 3 200000000.0
3 The Dark Knight 185000000.0

Top 5 movies by USA box office

# Name and Box_Office_USA of the first rows after sorting by Box_Office_USA, largest first
movie_data[['Name','Box_Office_USA']].sort_values(by = 'Box_Office_USA',ascending = False).head()
Name Box_Office_USA
73 Avengers: Endgame 858373000.0
63 Avengers: Infinity War 678815482.0
3 The Dark Knight 535234033.0
24 Star Wars 460998507.0
70 The Dark Knight Rises 448139099.0

Top 5 movies by worldwide box office

# Name and Box_Office_World of the first rows after sorting by Box_Office_World, largest first
movie_data[['Name','Box_Office_World']].sort_values(by = 'Box_Office_World',ascending = False).head()
Name Box_Office_World
73 Avengers: Endgame 2.797801e+09
63 Avengers: Infinity War 2.048360e+09
212 Harry Potter and the Deathly Hallows: Part 2 1.342193e+09
6 The Lord of the Rings: The Return of the King 1.142271e+09
70 The Dark Knight Rises 1.081141e+09

USA Box office Percent

# USA gross divided by worldwide gross; stored as a ratio (not multiplied by 100)
movie_data['USA_percent'] = movie_data['Box_Office_USA']/movie_data['Box_Office_World']
# histogram of the new column with 20 bins
get_histgram('USA_percent',20)

Most commercially successful movies

# worldwide gross divided by budget, minus 1; NaN where either value is missing
movie_data['Earning_rate'] = movie_data['Box_Office_World']/movie_data['Budget']-1
# Name and Earning_rate of the first rows after sorting by Earning_rate, largest first
movie_data[['Name','Earning_rate']].sort_values(by = 'Earning_rate',ascending = False).head()
Name Earning_rate
230 Rocky 121.134309
166 Gone with the Wind 100.169872
24 Star Wars 69.524447
113 Jodaeiye Nader az Simin 44.852152
1 The Godfather 40.020162

Movies related to China

# One boolean per row: True when the Country string mentions China or Hong Kong.
# Country cells keep the space inside names (e.g. 'Hong Kong/China'), so the
# test must look for 'Hong Kong'; 'HongKong' would never match.
logic_list = [
    'China' in movie or 'Hong Kong' in movie
    for movie in movie_data['Country']
]
# keep only the rows flagged True
Chinese_movie_list = movie_data.loc[logic_list]
Chinese_movie_list
Name Year Rate Level Directors Writers Stars Genres Runtime Country Language Budget Box_Office_USA Box_Office_World USA_percent Earning_rate
96 1917 2019 8.3 R Sam Mendes Sam Mendes/Krysty Wilson-Cairns Dean-Charles Chapman/George MacKay/Daniel Mays Drama/War 119 USA/UK/India/Spain/Canada/China English/French/German 95000000.0 159227644.0 384792488.0 0.413801 3.050447
129 Green Book 2018 8.2 PG-13 Peter Farrelly Nick Vallelonga/Brian Hayes Currie Viggo Mortensen/Mahershala Ali/Linda Cardellini Biography/Comedy/Drama/Music 130 USA/China English/Italian/Russian/German 23000000.0 85080171.0 321752656.0 0.264427 12.989246
236 Fa yeung nin wah 2000 8.1 PG Kar-Wai Wong Kar-Wai Wong Tony Chiu-Wai Leung/Maggie Cheung/Ping Lam Siu Drama/Romance 98 Hong Kong/China Cantonese/Shanghainese/French/Spanish NaN 2738980.0 12854953.0 0.213068 NaN

猜你喜欢

转载自blog.csdn.net/weixin_43084570/article/details/108986605