For example, suppose I have the following box office data:
Type | Box office |
---|---|
Drama, disaster | 2913118 |
War, history | 3094524 |
Drama, comedy | 3099961 |
Drama | 3176119 |
Based on this box office data, I want to know what share of the total box office each type of film accounts for. This can be computed as follows:
import pandas as pd
def split_and_sum(dataframe: pd.DataFrame, column: str, sum_column: str) -> pd.Series:
    """Score each row by the combined share of the types it lists.

    Every value in ``column`` is a comma-separated list of type labels. The
    row's ``sum_column`` amount is credited in full to each listed type, the
    per-type totals are normalised so that they sum to 1, and each row is
    then scored as the sum of the shares of its own types.

    Labels are stripped of surrounding whitespace and empty labels (for
    example the one produced by a trailing comma) are ignored, so ``'a, b'``
    and ``'a,b'`` name the same two types. Cells that are not strings, such
    as missing values, are treated as listing no types.

    Args:
        dataframe: Source table; it is not modified.
        column: Name of the column holding the comma-separated labels.
        sum_column: Name of the numeric column to aggregate.

    Returns:
        A float Series aligned with ``dataframe.index``. Rows without any
        usable label score 0.0. If the per-type totals add up to zero the
        shares are undefined, and rows that list a type score NaN.
    """

    def parse_labels(cell) -> list:
        """Return the clean, non-empty labels contained in one cell."""
        if not isinstance(cell, str):
            return []
        stripped = (part.strip() for part in cell.split(','))
        return [label for label in stripped if label]

    # Parse once and reuse for both the aggregation and the scoring pass.
    labels_per_row = [parse_labels(cell) for cell in dataframe[column]]

    # Credit each row's amount to every type the row lists.
    totals: dict = {}
    for labels, amount in zip(labels_per_row, dataframe[sum_column]):
        for label in labels:
            totals[label] = totals.get(label, 0) + amount

    # Normalise by the sum of the per-type totals (not the sum of the rows),
    # so the shares of all types add up to 1.
    grand_total = sum(totals.values())
    if grand_total:
        shares = {label: subtotal / grand_total for label, subtotal in totals.items()}
    else:
        # Avoid dividing by zero; the shares are undefined in this case.
        shares = dict.fromkeys(totals, float('nan'))

    scores = [sum((shares[label] for label in labels), 0.0) for labels in labels_per_row]
    return pd.Series(scores, index=dataframe.index, dtype=float)
if __name__ == '__main__':
    # Demo data: comma-separated type labels paired with a box-office figure.
    # Some labels carry a space after the comma and one ends with a trailing
    # comma.
    sample_types = ['剧情, 灾难', '战争, 历史', '剧情, 喜剧', '剧情,']
    sample_box_office = [2913118, 3094524, 3099961, 3176119]
    df: pd.DataFrame = pd.DataFrame({
        'type': sample_types,
        'box_office': sample_box_office,
    })
    # Store the value computed for each row in a new column.
    df['type_value'] = split_and_sum(df, 'type', 'box_office')