数据科学库(五 · 一)数据的合并与分组聚合(太具有逻辑性,多复习)

(一)字符串离散化案例

1.1 实例过程拆分(动手)

对于这一组电影数据(IMDB-Movie-Data),如果我们希望统计电影分类(Genre)的情况,应该如何处理数据?
思路:重新构造一个全为0的数组(行数与电影条数相同),列名为各个分类;如果某一条数据中出现过某个分类,就把该行对应分类列的0变为1。
(IMDB-Movie-Data)数据链接:https://pan.baidu.com/s/1KuJB1ZyDCaIgUFopBM5h3Q
提取码:9cg0

  • 查看Genre
# -*- coding: utf-8 -*-
import pandas as pd
from matplotlib import pyplot as plt

# Location of the IMDB-Movie-Data CSV on the author's machine; adjust as needed.
file_path = "C:/Users/Administrator/PycharmProjects/数据分析作业/课时五/day05/code/IMDB-Movie-Data.csv"

df = pd.read_csv(file_path)

# Show the Genre column: each cell lists a movie's genres, separated by commas.
print(df["Genre"])

OUT:

0       Action,Adventure,Sci-Fi
1      Adventure,Mystery,Sci-Fi
2               Horror,Thriller
3       Animation,Comedy,Family
4      Action,Adventure,Fantasy
                 ...           
995         Crime,Drama,Mystery
996                      Horror
997         Drama,Music,Romance
998            Adventure,Comedy
999       Comedy,Family,Fantasy
Name: Genre, Length: 1000, dtype: object
  • 统计分类的列表
# Split every comma-separated Genre string into its parts; Series.tolist()
# then yields a plain nested list of the form [[...], [...], ..., [...]].
split_genres = df["Genre"].str.split(",")
temp_list = split_genres.tolist()

# Flatten the nested lists through a set so each genre name is kept exactly
# once (duplicates are dropped), then turn the set back into a list.
unique_genres = {genre for genres in temp_list for genre in genres}
genre_list = list(unique_genres)
print(genre_list)

OUT:

['Sport', 'Animation', 'Comedy', 'Drama', 'Action', 'Thriller', 'Western', 'Crime', 'Fantasy', 'Musical', 'Sci-Fi', 'Romance', 'Horror', 'War', 'Music', 'Biography', 'Adventure', 'History', 'Mystery', 'Family']
  • 构造全为0的数组
# Build an all-zero DataFrame: one row per movie, one column per genre.
import numpy as np  # required for np.zeros below

# df.shape[0] is the number of rows of df, i.e. the number of movies.
n_movies = df.shape[0]
zeros_df = pd.DataFrame(np.zeros((n_movies, len(genre_list))), columns=genre_list)
print(zeros_df)

OUT:

 Action  Musical  Crime  History  ...  War  Mystery  Fantasy  Western
0       0.0      0.0    0.0      0.0  ...  0.0      0.0      0.0      0.0
1       0.0      0.0    0.0      0.0  ...  0.0      0.0      0.0      0.0
2       0.0      0.0    0.0      0.0  ...  0.0      0.0      0.0      0.0
3       0.0      0.0    0.0      0.0  ...  0.0      0.0      0.0      0.0
4       0.0      0.0    0.0      0.0  ...  0.0      0.0      0.0      0.0
..      ...      ...    ...      ...  ...  ...      ...      ...      ...
995     0.0      0.0    0.0      0.0  ...  0.0      0.0      0.0      0.0
996     0.0      0.0    0.0      0.0  ...  0.0      0.0      0.0      0.0
997     0.0      0.0    0.0      0.0  ...  0.0      0.0      0.0      0.0
998     0.0      0.0    0.0      0.0  ...  0.0      0.0      0.0      0.0
999     0.0      0.0    0.0      0.0  ...  0.0      0.0      0.0      0.0
  • 给每个电影出现分类的位置赋值1
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Location of the IMDB-Movie-Data CSV on the author's machine; adjust as needed.
file_path = "C:/Users/Administrator/PycharmProjects/数据分析作业/课时五/day05/code/IMDB-Movie-Data.csv"

df = pd.read_csv(file_path)
print(df["Genre"].head(3))

# Collect the genre names: split each Genre string on commas (giving a nested
# list [[...], [...], ...]) and keep every distinct name once.
temp_list = df["Genre"].str.split(",").tolist()
genre_list = list({genre for genres in temp_list for genre in genres})

# All-zero table: one row per movie, one column per genre.
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)

# Set the cell to 1 for every genre a movie belongs to. For one row this is
# equivalent to e.g. zeros_df.loc[0, ["Sci-Fi", "Musical"]] = 1.
for row, genres in enumerate(temp_list):
    zeros_df.loc[row, genres] = 1

print(zeros_df.head(3))

OUT:

0     Action,Adventure,Sci-Fi
1    Adventure,Mystery,Sci-Fi
2             Horror,Thriller
Name: Genre, dtype: object
   Music  Sport  Crime  Drama  ...  Comedy  Thriller  Horror  Sci-Fi
0    0.0    0.0    0.0    0.0  ...     0.0       0.0     0.0     1.0
1    0.0    0.0    0.0    0.0  ...     0.0       0.0     0.0     1.0
2    0.0    0.0    0.0    0.0  ...     0.0       1.0     1.0     0.0

[3 rows x 20 columns]
  • 统计每个电影分类的数量和
# Sum down the rows so that each genre column collapses to a single total:
# the number of movies tagged with that genre.
genre_count = zeros_df.sum(axis="index")
print(genre_count)

OUT:

Adventure    259.0
Music         16.0
Family        51.0
Romance      141.0
Mystery      106.0
History       29.0
Western        7.0
Thriller     195.0
Musical        5.0
Comedy       279.0
Sport         18.0
Crime        150.0
Sci-Fi       120.0
Animation     49.0
Fantasy      101.0
Drama        513.0
Action       303.0
Horror       119.0
War           13.0
Biography     81.0
dtype: float64
  • 排序
# Order the genres from the least to the most frequent.
genre_count = genre_count.sort_values(ascending=True)
print(genre_count)

OUT:

Musical        5.0
Western        7.0
War           13.0
Music         16.0
Sport         18.0
History       29.0
Animation     49.0
Family        51.0
Biography     81.0
Fantasy      101.0
Mystery      106.0
Horror       119.0
Sci-Fi       120.0
Romance      141.0
Crime        150.0
Thriller     195.0
Adventure    259.0
Comedy       279.0
Action       303.0
Drama        513.0
dtype: float64

  • 作图

from matplotlib import font_manager

# Font used for the Chinese labels below (SimSun from the Windows font folder).
my_font = font_manager.FontProperties(fname="C:/Windows/Fonts/simsun.ttc")

# Sort the per-genre totals in ascending order so the bars grow left to right.
genre_count = genre_count.sort_values()
print(genre_count)
_x = genre_count.index   # genre names
_y = genre_count.values  # number of movies per genre

# Draw the bar chart. The keyword must be spelled `fontproperties` (all
# lower-case): matplotlib >= 3.5 rejects mixed-case property names.
plt.figure(figsize=(20, 8), dpi=80)
positions = range(len(_x))
plt.bar(positions, _y)
plt.xticks(positions, _x, fontproperties=my_font)
plt.xlabel("电影类型", fontproperties=my_font)
plt.ylabel("每种类型电影的总数", fontproperties=my_font)
plt.title("电影Genre的统计", fontproperties=my_font)

# Save the figure before showing it.
plt.savefig("./电影Genre的统计.png")

plt.show()

(图:电影Genre的统计柱状图)

1.2 源码

# -*- coding: utf-8 -*-
"""Count how many movies in IMDB-Movie-Data.csv fall under each genre and plot it."""
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Machine-specific locations of the dataset and of a font with Chinese glyphs.
file_path = "C:/Users/Administrator/PycharmProjects/数据分析作业/课时五/day05/code/IMDB-Movie-Data.csv"
my_font = font_manager.FontProperties(fname="C:/Windows/Fonts/simsun.ttc")

df = pd.read_csv(file_path)

# Collect the genre names: split each Genre string on commas (giving a nested
# list [[...], [...], ...]) and keep every distinct name once.
temp_list = df["Genre"].str.split(",").tolist()
genre_list = list({genre for genres in temp_list for genre in genres})
print(genre_list)

# All-zero table: one row per movie, one column per genre.
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)

# Set the cell to 1 for every genre a movie belongs to. For one row this is
# equivalent to e.g. zeros_df.loc[0, ["Sci-Fi", "Musical"]] = 1.
for row, genres in enumerate(temp_list):
    zeros_df.loc[row, genres] = 1

# Sum each genre column to get the number of movies per genre.
genre_count = zeros_df.sum(axis=0)
print("$"*250)
print(genre_count)

# Sort the totals in ascending order so the bars grow left to right.
genre_count = genre_count.sort_values()
print("$"*50)
print(genre_count)
_x = genre_count.index   # genre names
_y = genre_count.values  # number of movies per genre

# Draw the bar chart. The keyword must be spelled `fontproperties` (all
# lower-case): matplotlib >= 3.5 rejects mixed-case property names.
plt.figure(figsize=(20, 8), dpi=80)
positions = range(len(_x))
plt.bar(positions, _y)
plt.xticks(positions, _x, fontproperties=my_font)
plt.xlabel("电影类型", fontproperties=my_font)
plt.ylabel("每种类型电影的总数", fontproperties=my_font)
plt.title("电影Genre的统计", fontproperties=my_font)

# Save the figure before showing it.
plt.savefig("./电影Genre的统计.png")

plt.show()

(图:电影Genre的统计柱状图)

源码下载链接:https://pan.baidu.com/s/1Y3O8t9Fjtg4k55iuoUB6FA
提取码:3i2k

Guess you like

Origin blog.csdn.net/weixin_54546190/article/details/119133845