Based on movie data, use pyecharts for visual analysis.
Data introduction
import pandas as pd
# Load the movie dataset from the CSV file next to the notebook and
# preview the first rows.
data = pd.read_csv(filepath_or_buffer='./电影.csv')
data.head()
The first 5 rows of data are as follows:
python libraries that need to be installed
pip install pandas
pip install pyecharts
Article directory
data cleaning
View missing values
# Count the missing values in every column (isna is the same check as isnull).
data.isna().sum()
Some movies list no screenwriter or leading actors, so those fields could not be crawled. This does not affect the analysis and visualization of the data.
As the data introduction above shows, none of the fields in the acquired data currently needs cleaning, so this step can simply be skipped. (I wanted to add an emoticon here, but I couldn't find where to add one.)
data visualization
Release year and number of films
# Count how many films were released in each year; value_counts() orders
# the result by count, largest first.
# rename_axis() + reset_index(name=...) name both columns explicitly, so the
# frame always has the columns "上映年份" and "电影数量". Renaming the default
# "index" column only works on pandas 1.x: pandas 2.0 changed the column
# names produced by value_counts().reset_index(), which left the frame
# without a "上映年份" column.
Year = (
    data['上映年份']
    .value_counts()
    .rename_axis('上映年份')
    .reset_index(name='电影数量')
)
Year.head()
I am running this in Jupyter Notebook. If you run it in another editor, change the last line, bar.render_notebook(), to bar.render("xxx.html"). If it runs successfully, a file named xxx.html is generated; open it to see the chart. The same applies to all the code that follows.
import pyecharts
from pyecharts.charts import Bar,Pie,Line
import pyecharts.options as opts
# Horizontal bar chart: one bar per release year, bar length = film count.
# Year is ordered by count (largest first), so both lists are reversed to
# plot the years in ascending order of count.
bar = (
    Bar(init_opts=opts.InitOpts(height='700px', theme='light'))
    .add_xaxis(Year['上映年份'].tolist()[::-1])
    .add_yaxis(
        "电影数量",
        Year['电影数量'].tolist()[::-1],
        label_opts=opts.LabelOpts(is_show=False),
    )
    .set_series_opts(
        itemstyle_opts=opts.ItemStyleOpts(border_color='#5C3719'),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title='上映年份及电影数量',
            subtitle='截止2023年3月',
            title_textstyle_opts=opts.TextStyleOpts(
                font_family='Microsoft YaHei',
                font_weight='bold',
                font_size=22,
            ),
            pos_top='1%',
        ),
        legend_opts=opts.LegendOpts(is_show=True),
        # After reversal_axis() the x axis carries the counts, so it is
        # capped at the largest film count.
        xaxis_opts=opts.AxisOpts(
            is_show=True,
            max_=int(Year['电影数量'].max()),
            axislabel_opts=opts.LabelOpts(
                font_family='Microsoft YaHei',
                font_weight='bold',
                font_size=14,  # label text size (a number, like the title's)
            ),
        ),
        # After reversal_axis() the y axis carries the release years.
        yaxis_opts=opts.AxisOpts(
            is_show=True,
            axislabel_opts=opts.LabelOpts(
                # interval=0,  # uncomment to force every y-axis label to show
                font_family='Microsoft YaHei',
                font_weight='bold',
                font_size=14,  # label text size
            ),
        ),
        tooltip_opts=opts.TooltipOpts(
            is_show=True,
            trigger='axis',
            # Fixed typo: was 'mousemove|clike', which never matched "click".
            trigger_on='mousemove|click',
            axis_pointer_type='shadow',
        ),
        # Toolbox reduced to the "save as image" button only.
        toolbox_opts=opts.ToolboxOpts(
            is_show=True,
            pos_left="right",
            pos_top="center",
            feature={"saveAsImage": {}},
        ),
    )
    .reversal_axis()
)
bar.render_notebook()
Here I did not force all Y-axis labels to be displayed; the code contains a commented-out option that does so if you need it. According to the chart, 2010 had the most releases, with 14 films.
Top 10 Directors and Number of Movies
# Ten directors with the most films; value_counts() orders by count,
# largest first.
# The slice is [0:10] (rows 0-9): the previous [0:11] returned eleven rows
# for a chart labelled TOP10.
# rename_axis() + reset_index(name=...) name both columns explicitly so the
# frame has "导演" and "电影数量" on pandas 1.x and 2.x alike (renaming the
# default "index" column only works on pandas 1.x).
Director = (
    data['导演']
    .value_counts()
    .iloc[0:10]
    .rename_axis('导演')
    .reset_index(name='电影数量')
)
Director.head()
# Ring (donut) chart of the top directors and their film counts.
pie = (
    Pie(init_opts=opts.InitOpts(theme='light'))
    .add(
        # Series label fixed: it was '电影类型' ("movie genre"), copied from
        # another chart, although the slices here are directors.
        series_name='导演',
        # pyecharts expects [name, value] pairs.
        data_pair=[
            [director, count]
            for director, count in zip(
                Director['导演'].to_list(),
                Director['电影数量'].to_list(),
            )
        ],
        radius=["40%", "75%"],  # inner / outer radius
    )
    # Optional custom palette:
    # .set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"])
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="导演及电影数量",
            subtitle='TOP10',
            title_textstyle_opts=opts.TextStyleOpts(
                font_family='Microsoft YaHei',
                font_weight='bold',
                font_size=22,
            ),
        ),
        legend_opts=opts.LegendOpts(
            pos_left="left",
            pos_top="center",
            orient='vertical',
            is_show=True,
        ),
        # Toolbox reduced to the "save as image" button only.
        toolbox_opts=opts.ToolboxOpts(
            is_show=True,
            pos_left="right",
            pos_top="center",
            feature={"saveAsImage": {}},
        ),
    )
    # Label each slice as "<name>: <value>".
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
pie.render_notebook()
This only fetches the TOP 10 data. If you want to fetch more, change the slice in this line:
Director=data['导演'].value_counts()[0:11].reset_index()
A slice [0:n] fetches index 0 through index n-1; for example, [0:10] means fetching from index 0 to index 9 (ten rows). You can change the end index yourself.
Top 10 Screenwriters and Number of Movies
# Ten screenwriters with the most films; value_counts() orders by count,
# largest first.
# The slice is [0:10] (rows 0-9): the previous [0:11] returned eleven rows
# for a chart labelled TOP10.
# rename_axis() + reset_index(name=...) name both columns explicitly so the
# frame has "编剧" and "电影数量" on pandas 1.x and 2.x alike (renaming the
# default "index" column only works on pandas 1.x).
Screenwriter = (
    data['编剧']
    .value_counts()
    .iloc[0:10]
    .rename_axis('编剧')
    .reset_index(name='电影数量')
)
Screenwriter.head()
# Rose (Nightingale) pie chart of screenwriters and their film counts.
c = Pie(init_opts=opts.InitOpts(theme='light'))
c.add(
    "",
    # [name, value] pairs, one per screenwriter.
    [
        [writer, count]
        for writer, count in zip(
            Screenwriter['编剧'].to_list(),
            Screenwriter['电影数量'].to_list(),
        )
    ],
    radius=["30%", "75%"],
    rosetype="radius",
)
c.set_global_opts(
    title_opts=opts.TitleOpts(
        title="编剧及电影数量",
        subtitle='TOP10',
    ),
    legend_opts=opts.LegendOpts(
        pos_left="left",
        pos_top="center",
        orient='vertical',
        is_show=True,
    ),
)
c.render_notebook()
Movie length and number
# Number of films per running time, ordered by running time (sort_index).
# rename_axis() + reset_index(name=...) name both columns explicitly so the
# frame has "片长" and "电影数量" on pandas 1.x and 2.x alike (renaming the
# default "index" column only works on pandas 1.x).
Film_length = (
    data['片长']
    .value_counts()
    .sort_index()
    .rename_axis('片长')
    .reset_index(name='电影数量')
)
Film_length
# Bar chart: running time on the x axis, number of films on the y axis.
c = Bar()
c.add_xaxis(Film_length['片长'].to_list())
c.add_yaxis(
    "电影数量",
    Film_length['电影数量'].to_list(),
    label_opts=opts.LabelOpts(is_show=False),  # hide per-bar value labels
)
c.set_global_opts(title_opts=opts.TitleOpts(title="电影片长及数量"))
c.render_notebook()
Movie lengths range from 45 minutes (the shortest) to 238 minutes (the longest), and are mainly concentrated around 98-132 minutes. I think the quality of a movie has nothing to do with its duration; what matters is the content.
Movie Rating and Quantity
# Number of films per Douban score, ordered from the highest score down.
# rename_axis() + reset_index(name=...) name both columns explicitly so the
# frame has "豆瓣评分" and "电影数量" on pandas 1.x and 2.x alike (renaming the
# default "index" column only works on pandas 1.x).
Douban_score = (
    data['豆瓣评分']
    .value_counts()
    .sort_index(ascending=False)
    .rename_axis('豆瓣评分')
    .reset_index(name='电影数量')
)
Douban_score
# Line chart: Douban score on the x axis, number of films on the y axis.
# The chart is assigned to `c`; without the `c = (` opener the snippet was a
# syntax error and c.render_notebook() would have re-rendered the previous
# chart.
c = (
    Line()
    .set_global_opts(
        title_opts=opts.TitleOpts(title="电影豆瓣评分及数量"),
        xaxis_opts=opts.AxisOpts(type_="category"),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
    )
    # pyecharts needs plain Python lists, not pandas Series. The scores are
    # also converted to strings so that they are used as category labels
    # rather than as numeric positions on the category axis.
    .add_xaxis(xaxis_data=Douban_score['豆瓣评分'].astype(str).tolist())
    .add_yaxis(
        series_name="电影数量",
        y_axis=Douban_score['电影数量'].tolist(),
        symbol="emptyCircle",
        is_symbol_show=True,
        label_opts=opts.LabelOpts(is_show=False),
        itemstyle_opts=opts.ItemStyleOpts(color="red"),
        # Mark the score shared by the largest number of films.
        markpoint_opts=opts.MarkPointOpts(
            data=[opts.MarkPointItem(type_="max", name="最大值")]
        ),
    )
)
c.render_notebook()
It can be seen that the most common Douban score is 8.9 points: a total of 41 movies have received this score. Everyone has different preferences. What you think is a good movie may not be so good for others. This also reflects the value of maintaining high-rated movies.
This article is written here first, and I will update it later. There are many configurations about pyecharts. Specifically, you can change the chart to the style you need according to the official website documents.