day13 Black Friday case

Code, notes, and data set files are placed on gitee.
Click here to enter gitee
https://gitee.com/jiangxun07/python-data-analysis.git

Black Friday (the Christmas-season sales in American malls) is a day when Americans make big purchases. Let us analyze the data in BlackFriday.csv and draw conclusions from it.

1. Analysis direction

1.1 Sales portrait

  1. sales portrait
    • Sales
    • User situation
    • Product condition
  2. User portrait
    • Gender distribution and consumption situation
    • Age distribution and consumption situation
    • User distribution and consumption in different cities
  3. Product portrait
    • Distribution and consumption of the most popular products
    • Distribution and consumption of products preferred by different age groups
    • Distribution and consumption of products favored by men and women

2. Raw data

import pandas as pd

# Load the Black Friday records from the CSV file in the working directory.
df = pd.read_csv('./BlackFriday.csv')
df

Insert image description here
The dataset contains more than 500,000 rows in total.

2.1 Data information

# Dimensions of the loaded table as a (rows, columns) tuple.
df.shape
# Print each column's dtype and non-null count.
df.info()

Insert image description here

2.2 Data cleaning

Rename the column headers

# Mapping from the original English column headers to the Chinese names
# that every later cell uses to address the columns.
column_names = {
    'User_ID': '用户ID',
    'Product_ID': '商品ID',
    'Gender': '性别',
    'Age': '年龄',
    'Occupation': '职业',
    'City_Category': '城市类别',
    'Stay_In_Current_City_Years': '居住城市年数',
    'Marital_Status': '婚姻状况',
    'Product_Category_1': '产品类别1',
    'Product_Category_2': '产品类别2',
    'Product_Category_3': '产品类别3',
    'Purchase': '采购额',
}
df = df.rename(columns=column_names)
df

Insert image description here

"""查看数据缺失值所占的比重"""
df.dropna().shape[0] / df.shape[0]  # 非空值所占比重 30%左右  缺失值所占比重达到了 70% 左右
"""重复数据"""
df.drop_duplicates()  # 无重复数据

Insert image description here

3.Data analysis

3.1 Sales situation

"""销售总额"""
df['采购额'].sum()
"""用户总人数"""
df.drop_duplicates('用户ID').shape[0]
df.drop_duplicates('用户ID')['用户ID'].count()
"""人均消费"""
df['采购额'].sum() / df.drop_duplicates('用户ID')['用户ID'].count()
"""商品类别统计"""
print('商品类别数量:', df.drop_duplicates('商品ID')['商品ID'].count())

# print('商品类目的销售数量:\n', df.groupby('商品ID')['商品ID'].count().sort_values(ascending=False))
print('商品类目的销售数量:\n', df['商品ID'].value_counts())

Insert image description here

4. User portrait analysis

# Build a one-row-per-user table: keep the first record of every user ID,
# select the columns of interest, then order the rows by user ID.
df_dd = df.drop_duplicates('用户ID')[
    ['用户ID', '商品ID', '性别', '年龄', '职业', '城市类别', '居住城市年数', '婚姻状况']
].sort_values('用户ID')
# Display the per-user table that was just built.
df_dd

Insert image description here

# Sum the purchase amount of every user, then attach it to the per-user table.
# The totals are matched by user ID rather than by position, so the result
# does not depend on the row order of df_dd.
user_totals = df.groupby(by='用户ID')['采购额'].sum()
df_dd['采购额'] = df_dd['用户ID'].map(user_totals)
df_dd

Insert image description here

# Total purchase amount of user 1000002, computed directly from the raw rows
# (presumably a spot check against the per-user totals in df_dd).
df.groupby(by='用户ID').get_group(1000002)['采购额'].sum()

Insert image description here

4.1 The impact of gender on consumption power

# Number of users per gender code; value_counts() orders by frequency,
# so the labels are derived from the index instead of being hard-coded.
gender_counts = df_dd['性别'].value_counts()
gender_labels = {'M': '男性', 'F': '女性'}
x_data = [gender_labels.get(code, code) for code in gender_counts.index]
sex_data = gender_counts.tolist()
# (label, count) pairs in the form the pie chart expects.
data = list(zip(x_data, sex_data))
data

Insert image description here

from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.faker import Faker

# Pie chart of the gender split; each label shows name, count and percentage.
c = Pie()
c.add("", data)
c.set_global_opts(title_opts=opts.TitleOpts(title="男女性别所占比例"))
c.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}--{d}%"))
c.render_notebook()

Insert image description here

"""不同性别的采购额"""
s_gender = df_dd.groupby('性别')['采购额'].sum()
s_gender

Insert image description here

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker

# Display names for the gender codes found in the grouped index.
gender_labels = {'F': '女', 'M': '男'}

# Bar chart of total purchase amount per gender.
c = (
    Bar()
    # Labels come from the grouped index so bars and labels cannot get out of step.
    .add_xaxis([gender_labels.get(code, code) for code in s_gender.index])
    .add_yaxis("采购额", s_gender.values.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title="不同性别的采购额"))
)
c.render_notebook()

Insert image description here

## Effect of marital status on spending power
# Number of users per marital-status code, ordered by frequency.
df_cc = df_dd['婚姻状况'].value_counts()
df_cc
# Share of each marital status.
from pyecharts import options as opts
from pyecharts.charts import Pie

# Code 0 is labelled unmarried and code 1 married; the labels follow the
# index order of df_cc instead of assuming which group is larger.
marital_labels = {0: '未婚', 1: '已婚'}
x = [marital_labels.get(code, code) for code in df_cc.index]
y = df_cc.values.tolist()
data = list(zip(x, y))

c = (
    Pie()
    .add("", data)
    .set_global_opts(title_opts=opts.TitleOpts(title="婚姻状况占比"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}--{d}%"))
)
c.render_notebook()

Insert image description here

"""不同婚姻状况的采购额"""

# 女性情况
nv_dd = df_dd[df_dd['性别'] == 'F']
nv_result = nv_dd.groupby('婚姻状况')['采购额'].sum()
nv_result

# 男性情况
nan_dd = df_dd[df_dd['性别'] == 'M']
nan_dd_result = nan_dd.groupby('婚姻状况')['采购额'].sum()
nan_dd_result
"""婚姻状况对采购的影响"""
from pyecharts import options as opts
from pyecharts.charts import Line


c = (
    Line()
    .add_xaxis(['未婚', '已婚'])
    .add_yaxis("女性", nv_result.values.tolist())
    .add_yaxis("男性", nan_dd_result.values.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title="不同婚姻状况不同性别的采购额"))
)
c.render_notebook()

Insert image description here

4.2 The impact of age on consumption

"""统计每个年龄阶段的人数"""

# 性别女数据, 分年龄段的数据
nv_dd = df_dd[df_dd['性别'] == 'F']
nv_result = nv_dd.groupby('年龄')['用户ID'].count()
nv_result

# 性别男数据, 分年龄段的数据
nan_dd = df_dd[df_dd['性别'] == 'M']
nan_dd_result = nan_dd.groupby('年龄')['用户ID'].count()
nan_dd_result

Insert image description here

"""每个年龄阶段男女人数统计"""
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker


c = (
    Bar()
    .add_xaxis(nan_dd_result.index.tolist())
    .add_yaxis("女性", nv_result.values.tolist())
    .add_yaxis("男性", nan_dd_result.values.tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title="每个年龄阶段男女人数统计"),
        toolbox_opts= opts.ToolboxOpts()
    )
)
c.render_notebook()

Insert image description here

"""每个年龄阶段的消费能力"""
xiaofei = df_dd.groupby('年龄')['采购额'].sum()
xiaofei

Insert image description here

"""每个年龄阶段男女人数统计"""
from pyecharts import options as opts
from pyecharts.charts import Bar


c = (
    Bar()
    .add_xaxis(xiaofei.index.tolist())
    .add_yaxis("", xiaofei.values.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title="每个年龄阶段男女人数统计"))
)
c.render_notebook()

Insert image description here

4.3 City field analysis

# Number of users per city category, ordered by category label.
df_city = df_dd['城市类别'].value_counts().sort_index()
df_city

Insert image description here

"""城市所占比重"""
from pyecharts import options as opts
from pyecharts.charts import Pie

x = df_city.index.tolist()
y = df_city.values.tolist()
data = list(zip(x, y))

c = (
    Pie()
    .add("", data)
    .set_global_opts(title_opts=opts.TitleOpts(title="不同城市人数比重"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}--{d}%"))
)
c.render_notebook()

Insert image description here

"""城市消费采购额统计"""
city_sum = df_dd.groupby('城市类别')['采购额'].sum()
city_sum

Insert image description here

"""城市消费采购额柱状图"""
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker


c = (
    Bar()
    .add_xaxis(city_sum.index.tolist())
    .add_yaxis("消费水平", city_sum.values.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title="城市消费采购额柱状图"))
)
c.render_notebook()

Insert image description here

"""每个城市平均消费能力"""
city_sum / df_city

# 人数来说: C城市
# 消费总额: B城市
# 人均消费: A城市
city_live = df_dd.groupby('居住城市年数')['用户ID'].count()
city_live

Insert image description here

"""不同居住年限人数饼图"""
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.faker import Faker

x = ['游客', '1年', '2年', '3年', '4年以上']
y = city_live.values.tolist()
data = list(zip(x, y))

c = (
    Pie()
    .add("", data)
    .set_global_opts(title_opts=opts.TitleOpts(title="不同居住年限人数饼图"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}--{d}%"))
)
c.render_notebook()

Insert image description here

"""不同居住年份的采购额"""
city_xiaofei = df_dd.groupby('居住城市年数')['采购额'].sum()
city_xiaofei

Insert image description here

"""不同居住年限采购额饼图"""
from pyecharts import options as opts
from pyecharts.charts import Pie

x = ['游客', '1年', '2年', '3年', '4年以上']
y = city_xiaofei.values.tolist()
data = list(zip(x, y))

c = (
    Pie()
    .add("", data)
    .set_global_opts(title_opts=opts.TitleOpts(title="不同居住年限采购额饼图"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}--{d}%"))
)
c.render_notebook()

Insert image description here

4.4 Career analysis

# Number of users per occupation code, ordered from the rarest occupation
# to the most common one.
work = df_dd['职业'].value_counts().sort_values()
work

Insert image description here

"""职业分布柱状图"""
from pyecharts import options as opts
from pyecharts.charts import Bar


c = (
    Bar()
    .add_xaxis(work.index.tolist())
    .add_yaxis("消费水平", work.values.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title="职业分布柱状图"))
)
c.render_notebook()

Insert image description here

"""不同职业的消费能力"""
ocp_data = df_dd.groupby('职业')['采购额'].sum()
ocp_data

ocp_data_list = []
for i in work.index:
    # print(ocp_data[i])
    ocp_data_list.append(ocp_data[i])
    

data_ser = pd.Series(index=work.index.tolist(), data=ocp_data_list)

from pyecharts import options as opts
from pyecharts.charts import Bar

x = work.index.tolist()

c = (
    Bar()
    .add_xaxis(data_ser.index.tolist())
    .add_yaxis("消费水平", data_ser.values.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title="不同职业的消费能力"))
)
c.render_notebook()

Insert image description here

# Occupation codes in the order used for the charts.
work.index.tolist()
# Purchase totals per occupation, in that same order.
data_ser

Insert image description here

5. Summary

  • In terms of gender: men have higher spending power than women
  • Marital status: Among the buyers, there are more unmarried people than married people, and unmarried people spend more in total than married people. Marital status affects men's purchase amounts more than women's.
  • Age: People in the 18-45 age range have strong spending power
  • City: City B is guessed to be a medium to large city with higher consumption power than other cities.
  • Length of residence: People who have lived in their current city for about one year have higher spending power. As the length of residence increases, spending power decreases.
  • Occupation: The consumption power of different occupations varies greatly, so the focus of marketing planning can be on [14 20 12 17 1 7 0 4]

Guess you like

Origin blog.csdn.net/m0_73678713/article/details/134574572