(一)字符串离散化案例
1.1 实例过程拆分(动手)
对于这一组电影数据(IMDB-Movie-Data),如果我们希望统计电影分类(Genre)的情况,应该如何处理数据?
思路:
重新构造一个全为0的DataFrame(每行对应一部电影),列名为各个分类;如果某一条数据中出现了某个分类,就把该行对应列的0变为1。
(IMDB-Movie-Data)
数据链接:https://pan.baidu.com/s/1KuJB1ZyDCaIgUFopBM5h3Q
提取码:9cg0
- 查看Genre:
# -*- coding: utf-8 -*-
import pandas as pd
from matplotlib import pyplot as plt
# Absolute path to the IMDB movie CSV; change it to wherever your copy lives.
file_path = "C:/Users/Administrator/PycharmProjects/数据分析作业/课时五/day05/code/IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)
# Look at the raw Genre column before transforming it.
genre_column = df["Genre"]
print(genre_column)
OUT:
0 Action,Adventure,Sci-Fi
1 Adventure,Mystery,Sci-Fi
2 Horror,Thriller
3 Animation,Comedy,Family
4 Action,Adventure,Fantasy
...
995 Crime,Drama,Mystery
996 Horror
997 Drama,Music,Romance
998 Adventure,Comedy
999 Comedy,Family,Fantasy
Name: Genre, Length: 1000, dtype: object
- 统计分类的列表
# Split every Genre string on "," to get one list of categories per movie,
# i.e. a nested list of the form [[...], [...], ..., [...]].
# (A Series offers both tolist() and to_list(); either spelling works here.)
temp_list = df["Genre"].str.split(",").tolist()
# Flatten the nested lists through a set so that each category appears once,
# then convert back to a list.
genre_list = list({genre for genres in temp_list for genre in genres})
print(genre_list)
OUT:
['Sport', 'Animation', 'Comedy', 'Drama', 'Action', 'Thriller', 'Western', 'Crime', 'Fantasy', 'Musical', 'Sci-Fi', 'Romance', 'Horror', 'War', 'Music', 'Biography', 'Adventure', 'History', 'Mystery', 'Family']
- 构造全为0的数组
# NumPy provides np.zeros, which is used below to create the all-zero matrix.
import numpy as np

# Build an all-zero table with one row per movie and one column per genre.
# df.shape[0] is the number of rows in df.
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)
print(zeros_df)
OUT:
Action Musical Crime History ... War Mystery Fantasy Western
0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
.. ... ... ... ... ... ... ... ... ...
995 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
996 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
997 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
998 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
999 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
- 给每个电影出现分类的位置赋值1
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Absolute path to the IMDB movie CSV; change it to wherever your copy lives.
file_path = "C:/Users/Administrator/PycharmProjects/数据分析作业/课时五/day05/code/IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)
print(df["Genre"].head(3))

# Build the per-movie category lists: [[...], [...], ...]
temp_list = df["Genre"].str.split(",").tolist()
# Unique categories across all movies (set removes duplicates).
genre_list = list(set([i for j in temp_list for i in j]))

# All-zero table: one row per movie, one column per genre.
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)

# Set the cells of every genre a movie belongs to from 0 to 1.
# zeros_df keeps the default 0..n-1 row labels, so the position in temp_list
# is also the row label expected by .loc.
for i, genres in enumerate(temp_list):
    # Passing a list of column labels sets all of them at once,
    # e.g. zeros_df.loc[0, ["Sci-Fi", "Musical"]] = 1
    zeros_df.loc[i, genres] = 1
print(zeros_df.head(3))
OUT:
0 Action,Adventure,Sci-Fi
1 Adventure,Mystery,Sci-Fi
2 Horror,Thriller
Name: Genre, dtype: object
Music Sport Crime Drama ... Comedy Thriller Horror Sci-Fi
0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0
1 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0
2 0.0 0.0 0.0 0.0 ... 0.0 1.0 1.0 0.0
[3 rows x 20 columns]
- 统计每个电影分类的数量和
# Add up each genre column (summing down the rows) to get the number of
# movies per genre.
genre_count = zeros_df.sum(axis="index")
print(genre_count)
OUT:
Adventure 259.0
Music 16.0
Family 51.0
Romance 141.0
Mystery 106.0
History 29.0
Western 7.0
Thriller 195.0
Musical 5.0
Comedy 279.0
Sport 18.0
Crime 150.0
Sci-Fi 120.0
Animation 49.0
Fantasy 101.0
Drama 513.0
Action 303.0
Horror 119.0
War 13.0
Biography 81.0
dtype: float64
- 排序
# Order the genres from the fewest movies to the most.
genre_count = genre_count.sort_values(ascending=True)
print(genre_count)
OUT:
Musical 5.0
Western 7.0
War 13.0
Music 16.0
Sport 18.0
History 29.0
Animation 49.0
Family 51.0
Biography 81.0
Fantasy 101.0
Mystery 106.0
Horror 119.0
Sci-Fi 120.0
Romance 141.0
Crime 150.0
Thriller 195.0
Adventure 259.0
Comedy 279.0
Action 303.0
Drama 513.0
dtype: float64
- 作图
# The labels below are Chinese, so an explicit font file is supplied for them.
# Point fname at a font that exists on your system.
from matplotlib import font_manager

my_font = font_manager.FontProperties(fname="C:/Windows/Fonts/simsun.ttc")

# Sort ascending so the bars run from the rarest genre to the most common one.
genre_count = genre_count.sort_values()
print(genre_count)
_x = genre_count.index
_y = genre_count.values

# Draw the bar chart: bars are placed at integer positions and then labelled
# with the genre names.
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(_x)), _y)
# The keyword must be the lowercase `fontproperties`; recent Matplotlib
# releases no longer accept the camel-case spelling.
plt.xticks(range(len(_x)), _x, fontproperties=my_font)
plt.xlabel("电影类型", fontproperties=my_font)
plt.ylabel("每种类型电影的总数", fontproperties=my_font)
plt.title("电影Genre的统计", fontproperties=my_font)
# Save before show(), then display the figure.
plt.savefig("./电影Genre的统计.png")
plt.show()
1.2 源码
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Absolute path to the IMDB movie CSV; change it to wherever your copy lives.
FILE_PATH = "C:/Users/Administrator/PycharmProjects/数据分析作业/课时五/day05/code/IMDB-Movie-Data.csv"
# Font file used to render the Chinese axis labels and title.
FONT_PATH = "C:/Windows/Fonts/simsun.ttc"


def count_genres(df):
    """Count how many movies fall into each genre.

    Args:
        df: DataFrame with a "Genre" column whose values are comma-separated
            genre names (e.g. "Action,Adventure,Sci-Fi") and with the default
            0..n-1 row index.

    Returns:
        A Series indexed by genre name (alphabetical order) holding the number
        of movies in that genre as floats.
    """
    # One list of genres per movie: [[...], [...], ...]
    temp_list = df["Genre"].str.split(",").tolist()
    # Unique genres; sorted so the column order is the same on every run.
    genre_list = sorted({genre for genres in temp_list for genre in genres})

    # All-zero table: one row per movie, one column per genre.
    zeros_df = pd.DataFrame(
        np.zeros((df.shape[0], len(genre_list))), columns=genre_list
    )

    # Flip the cells of every genre a movie belongs to from 0 to 1.
    # A list of column labels sets all of them at once,
    # e.g. zeros_df.loc[0, ["Sci-Fi", "Musical"]] = 1
    # NOTE: df["Genre"].str.get_dummies(sep=",") builds the same indicator
    # table in one call; the explicit loop is kept to show the idea.
    for i, genres in enumerate(temp_list):
        zeros_df.loc[i, genres] = 1

    # Summing each column gives the number of movies per genre.
    return zeros_df.sum(axis=0)


def plot_genre_counts(genre_count, my_font, save_path="./电影Genre的统计.png"):
    """Draw, save and show a bar chart of movies per genre.

    Args:
        genre_count: Series indexed by genre name with the count per genre.
        my_font: matplotlib FontProperties used for the tick labels, axis
            labels and title.
        save_path: Where the PNG is written.
    """
    _x = genre_count.index
    _y = genre_count.values

    plt.figure(figsize=(20, 8), dpi=80)
    # Bars sit at integer positions and are then labelled with genre names.
    plt.bar(range(len(_x)), _y)
    # The keyword must be the lowercase `fontproperties`; recent Matplotlib
    # releases no longer accept the camel-case spelling.
    plt.xticks(range(len(_x)), _x, fontproperties=my_font)
    plt.xlabel("电影类型", fontproperties=my_font)
    plt.ylabel("每种类型电影的总数", fontproperties=my_font)
    plt.title("电影Genre的统计", fontproperties=my_font)
    # Save before show(), then display the figure.
    plt.savefig(save_path)
    plt.show()


def main():
    """Load the movie data, print the genre statistics and plot them."""
    my_font = font_manager.FontProperties(fname=FONT_PATH)
    df = pd.read_csv(FILE_PATH)

    genre_count = count_genres(df)
    # The index of genre_count is exactly the list of distinct genres.
    print(list(genre_count.index))
    print("$" * 250)
    print(genre_count)

    # Sort ascending so the bars run from the rarest genre to the most common.
    genre_count = genre_count.sort_values()
    print("$" * 50)
    print(genre_count)

    plot_genre_counts(genre_count, my_font)


if __name__ == "__main__":
    main()
源码下载链接:https://pan.baidu.com/s/1Y3O8t9Fjtg4k55iuoUB6FA
提取码:3i2k