day05-pandas进阶课件代码

01-作业：多个箱线图绘制案例

import matplotlib.pyplot as plt
import numpy as np


# 构建数据
def build_data():
    """
    Load the column names and the data matrix from the local ``.npz`` archive.

    Reads the ``columns`` and ``values`` arrays stored in
    ``./国民经济核算季度数据.npz``, prints both, and returns them.

    :return: tuple ``(columns, values)`` holding the two arrays from the archive
    """
    # NOTE(review): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code -- only load archives that come from a trusted source.
    # np.load on an .npz keeps the archive file open until it is closed, so
    # read both arrays inside a context manager to release the handle.
    with np.load("./国民经济核算季度数据.npz", allow_pickle=True) as res:
        columns = res["columns"]
        values = res["values"]
    # Echo what was loaded
    print("columns:\n", columns)
    print("values:\n", values)
    return columns, values


def show_data(columns, values):
    """
    Draw two stacked box-plot panels and save them as one PNG file.

    The upper panel plots value columns 3-5, the lower panel plots every
    column from index 6 onward. Tick labels come from the matching entries
    of ``columns``, cut to their first 4 and first 2 characters respectively.

    :param columns: column names
    :param values: data array, indexed as ``values[:, column]``
    :return: None
    """
    # Create the canvas first, then enable CJK glyphs and a plain minus sign
    canvas = plt.figure(figsize=(20, 12), dpi=120)
    plt.rcParams['font.sans-serif'] = 'SimHei'
    plt.rcParams['axes.unicode_minus'] = False

    # Extra vertical room between the two panels
    plt.subplots_adjust(hspace=0.3)

    # One entry per panel:
    # (subplot position, column slice, label length, title, x-axis caption)
    panels = (
        (1, slice(3, 6), 4, "2000-2017年各个产业生产总值箱线图", "产业"),
        (2, slice(6, None), 2, "2000-2017年各个行业生产总值箱线图", "行业"),
    )
    for position, cols, label_len, title, x_caption in panels:
        canvas.add_subplot(2, 1, position)
        tick_names = [name[:label_len] for name in columns[cols]]
        plt.boxplot(values[:, cols], notch=True, labels=tick_names, meanline=True, showmeans=True)
        plt.title(title)
        plt.xlabel(x_caption)
        plt.ylabel("生产总值（亿元）")

    # Persist the figure before showing it
    plt.savefig("./2000-2017年各个产业、行业生产总值箱线子图.png")
    plt.show()


def main():
    """Load the dataset, then render the box plots."""
    # build_data() returns (columns, values), matching show_data's parameter order
    show_data(*build_data())


if __name__ == '__main__':
    main()

02-pandas的统计分析

import pandas as pd
import numpy as np

# Load the order-detail workbook, then print the frame, its column labels
# and its per-column dtypes, followed by a separator line
data = pd.read_excel("./meal_order_detail.xlsx")
for caption, payload in (
    ('data：\n', data),
    ('data 的列索引：\n', data.columns),
    ('data 的数据类型：\n', data.dtypes),
):
    print(caption, payload)
print('*' * 100)

# 对 amounts 、counts 进行统计指标
# print('amounts  counts 列的最小值：\n',data.loc[:,['amounts','counts']].min())
# print(type(data.loc[:,['amounts','counts']].min()))  # <class 'pandas.core.series.Series'>

# 对 amounts 进行统计指标
# print('amounts 列 最小值：\n', data.loc[:, 'amounts'].min())
# print('amounts 列 最大值：\n', data.loc[:, 'amounts'].max())
# print('amounts 列 均值：\n', data.loc[:, 'amounts'].mean())
# print('amounts 列 方差：\n', data.loc[:, 'amounts'].var())
# print('amounts 列 标准差：\n', data.loc[:, 'amounts'].std())


# print('amounts 列 极差：\n', data.loc[:, 'amounts'].ptp())
# 数据里面 非空数值的 数量
# print('amounts 列 非空数值的数量：\n', data.loc[:, 'amounts'].count())
# print('cost 列 非空数值的数量：\n', data.loc[:, 'cost'].count())

# print('amounts 列最大值的下标：\n', data.loc[:, 'amounts'].idxmax()) # np.argmax
# print('amounts 列最小值的下标：\n', data.loc[:, 'amounts'].idxmin()) # np.argmin

#  统计 amounts 的众数--出现次数最多的数 ---返回众数的series
# print('amounts 列的众数 为：\n',data.loc[:,'amounts'].mode())
# print('amounts 列的众数 为的类型为：\n',type(data.loc[:,'amounts'].mode()))

# 可以对非数值型数据进行统计众数
# print('dishes_name 列的众数为：\n',data.loc[:,'dishes_name'].mode())
# 统计amounts 的分位数 ---默认获取的是中位数
# 给参数q 传你想要的的 占比，就能得到指定的分位数
# print('amounts 列的中位数：\n',data.loc[:,'amounts'].median())
# print('amounts 列的分位数：\n',data.loc[:,'amounts'].quantile())
# print('amounts 列的分位数：\n', data.loc[:, 'amounts'].quantile(q=np.arange(0, 1 + 1 / 4, 1 / 4)))
# print('amounts 列的分位数：\n', data.loc[:, 'amounts'].quantile(q=np.arange(0, 1 + 1 / 5, 1 / 5)))

# describe对于数值型的统计结果
# 返回8种结果
# 非空数据的数量
# 均值
# 标准差
# 分位数--min  25% 50% 75% max
# 对 amounts 列进行 describe描述
# print('amounts 列进行describe 描述：\n',data.loc[:,'amounts'].describe())


# describe 对于非数值型数据进行统计分析
# 返回4种 结果
# 非空数据的数量
# 去重之后结果的数量
# 众数
# 众数出现的次数
# 先将 不确定的列 转化 为 非数值型数据 ---类别型数据--category
# data.loc[:, 'dishes_name'] = data.loc[:, 'dishes_name'].astype('category')
# data.loc[:, 'dishes_name'] = data.loc[:, 'dishes_name'].astype('object')
# # 对于 dishes_name  进行统计描述
# print('dishes_name 列的describe 描述：\n', data.loc[:, 'dishes_name'].describe())
#
# print('data 的数据类型：\n',data.dtypes)

03-案例：剔空案例

import pandas as pd

# 删除掉 detail 中整列都为空的这些列

# Load the order-detail workbook
data = pd.read_excel("./meal_order_detail.xlsx")
print('data:\n', data)

# DataFrame.count() returns the number of non-null cells of every column in a
# single vectorized call; a count of 0 means the whole column is empty.
non_null_counts = data.count()
drop_list = non_null_counts[non_null_counts == 0].index.tolist()

# Remove the all-empty columns in place
data.drop(labels=drop_list, axis=1, inplace=True)
print('删除之后的结果：\n',data.shape)

04-案例：统计分析案例

import pandas as pd

# 统计饭店的销售最火菜品是？而且卖出了多少份？ ---白饭/大碗 ---不算菜品，算主食

# Read the order-detail workbook and show its contents and column labels
data = pd.read_excel("./meal_order_detail.xlsx")
print('data:\n', data)
print('data 的列索引：\n', data.columns)

# '白饭/大碗' is counted as a staple, not a dish: look up the index labels of
# its rows and remove those rows in place
is_rice = data['dishes_name'] == '白饭/大碗'
rice_labels = data.loc[is_rice].index
data.drop(index=rice_labels, inplace=True)
# # 求解出众数
# mode = data.loc[:, 'dishes_name'].mode()[0]
# print('mode:\n', mode)
#
# # 查看 凉拌菠菜  份
# # 选中 凉拌菠菜 所在的行， 统计 counts这一列的 sum
# bool_id = data.loc[:, 'dishes_name'] == mode
#
# #
# data = data.loc[bool_id, :]
#
# # 对data 里面的counts列求sum
# res_sum = data.loc[:, 'counts'].sum()
# print('res_sum:\n',res_sum)



# #直接使用describe
# # 将 dishes_name 转化为 category
# data.loc[:,"dishes_name"] = data.loc[:,"dishes_name"].astype('category')
# # 再去进行统计describe
# res = data.loc[:,"dishes_name"].describe()
# print('res:\n',res)

05-pandas的时间操作

import pandas as pd

"""
pandas 里面默认支持的时间点类型：Timestamp
pandas 里面默认支持的时间序列类型： DatetimeIndex
numpy 默认支持的时间点的类型 ：datetime64[ns]
"""

# '2020-1-10'
# '2020/1/10'


# res = pd.to_datetime('2020-1-10')
# res = pd.to_datetime('2020/1/10')
# print('res：\n',res)
# print('res 的类型:\n',type(res))

# 多个时间点 ---时间序列
# res = pd.to_datetime(['2020-1-10','2020-1-11','2020-1-12'])
# res = pd.DatetimeIndex(['2020-1-10', '2020-1-11', '2020-1-12'])
# print("res:\n", res)
# print('res 的类型：\n', type(res))

# 加载 数据
# data = pd.read_excel("./meal_order_detail.xlsx")
# print('data:\n', data)
# print('data 的列索引：\n', data.columns)

# 将 place_order_time  转化为 pandas默认支持的时间序列
# data.loc[:, 'place_order_time'] = pd.to_datetime(data.loc[:, 'place_order_time'])
#
# print('data 的元素的类型：\n', data.dtypes)

# 可以通过列表推导式的形式 ---来获取时间属性
# year = [i.year for i in data.loc[:, 'place_order_time']]
# print('year:\n', year)
# print('year:\n', len(year))
#
# month = [i.month for i in data.loc[:, 'place_order_time']]
# print('month:\n', month)
# print('month:\n', len(month))
#
# day = [i.day for i in data.loc[:, 'place_order_time']]
# print('day:\n', day)
# print('day:\n', len(day))

# 时--hour 、 分--minute  秒 --second

# weekday = [i.weekday for i in data.loc[:, 'place_order_time']]
# # print('weekday:\n', weekday)
# # # print('weekday:\n', len(weekday))

# weekday_name = [i.weekday_name for i in data.loc[:, 'place_order_time']]
# print('weekday_name:\n', weekday_name)
# print('weekday_name:\n', len(weekday_name))


# 进行时间的相加减 --- Timedelta 类型
# res = pd.to_datetime('2020-1-11') - pd.to_datetime('2020-1-10')
# print('res:\n',res)
# print('res 的类型：\n',type(res))

# 5天之后
# res = pd.to_datetime('2020-1-10') + pd.Timedelta(days=5)
# 5天之前
# res = pd.to_datetime('2020-1-10') + pd.Timedelta(days=-5)
# res = pd.to_datetime('2020-1-10') - pd.Timedelta(days=5)
# Timedelta可以加减的时间为：[weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds]
# res = pd.to_datetime('2020-1-10') + pd.Timedelta(years=1)
# print('res:\n', res)


# 查看电脑支持的最早的时间 与 最晚的时间
# print('最早的时间：',pd.Timestamp.min)
# print('最晚的时间：',pd.Timestamp.max)


# 生成时间序列数据  -了解
# start 开始日期
# end 结束日期
# freq --时间间隔
# res = pd.date_range(start='2020-1-10',end='2020-2-1',freq='2D')
# res = pd.date_range(start='2020-1-10',end='2020-2-1',freq='10H')
# periods -- how many timestamps the generated sequence contains in total
res = pd.date_range('2020-1-10', freq='36D', periods=10)
print('res:\n', res)

06-分组聚合

import pandas as pd
import numpy as np

# Build the sample student table row by row; rows are labelled stu_1 .. stu_8
students = [
    ('zs', 178.0, 99, 'A', '1'),
    ('ls', 179, 98, 'A', '1'),
    ('ww', 165.5, 97, 'A', '2'),
    ('oo', 160.5, 90, 'A', '2'),
    ('hh', 168.6, 89, 'B', '1'),
    ('jj', 172.0, 99, 'B', '1'),
    ('gg', 182, 96.5, 'B', '2'),
    ('kk', 189.0, 88.5, 'B', '2'),
]
df = pd.DataFrame(
    students,
    columns=['name', 'high', 'score', 'class', 'group'],
    index=[f"stu_{n}" for n in range(1, 9)],
)
print("df:\n", df)
print("df 的类型：\n", type(df))
print("*" * 100)

# 统计所有同学的平均成绩
# print('所有同学的平均成绩为:\n', df['score'].mean())

# 按照单列 进行行分组，统计单列的指标
# 将同学们 按照班级分组，然后统计各个班级的平均成绩
# res = df.groupby(by='class')['score'].mean()
# print('res:\n',res)

# 按照单列 进行行分组，统计 多列的 指标
# 按照 班级进行分组，统计各个班级的 平均身高 与平均成绩
# res = df.groupby(by='class')[['score', 'high']].mean()
# print('res:\n', res)

# 按照 多列 进行行 分组，统计 多列指标
# res = df.groupby(by=['class','group'])[['score', 'high']].mean()
# print('res:\n', res)

# agg 与 aggregate 是一样的
# 对多列 同时求取多个指标
# 对 df 中 的 high score  同时求取 最大值与均值
# res = df.loc[:, ["high", 'score']].agg([np.mean, np.max])
# print('res:\n',res)

# 对不同的列 求取不同的指标
# 对df 中的 high  求取最大值，对 df 中的 score 求取平均值
# res = df.agg({'high': [np.max], 'score': [np.mean]})
# print('res:\n',res)

# 对不同的列 求取不同个数的指标
# 对df 中 high 求取最大值， 对df中的 score 求取 平均值与 最小值
# res = df.agg({'high': [np.max], 'score': [np.mean, np.min]})
# print('res:\n',res)

# 可以进行自定义方法
# apply 自定义
# res = df.loc[:, ['high', 'score']].apply(lambda x: x + 100)
# def hh(data):
#     return data + 100
#
#
# res = df.loc[:, ['high', 'score']].apply(hh)
# print('res:\n', res)

# agg 自定义
# res = df.loc[:, ["high",'score']].agg(lambda x: x + 100)
# print("res:\n",res)

# transform 自定义
# res = df.loc[:, ['high', 'score']].transform(lambda x: x + 100)
# print('res:\n',res)

# 区别：apply 基本跟 agg  、transform 都可以实现自定义方法，agg 可以对不同列 求取不同的指标
# 注意：不能进行列之间的运算，只能在列之内进行运算

07-案例：营业额案例

import pandas as pd

# 计算detail 表中店铺 每日的营业额数据
# 利用 counts  amounts
# 利用下单时间 ---分组聚合


# Load the order-detail workbook
data = pd.read_excel("./meal_order_detail.xlsx")
print('data:\n', data)
print('data 的列索引：\n', data.columns)

# Per-row revenue: counts * amounts
data['money'] = data['counts'] * data['amounts']

# Convert the order time to pandas datetimes. Assigning the whole column
# replaces it, so the converted dtype is kept; writing through
# data.loc[:, col] = ... sets values in place on pandas 2.x and can leave the
# column with its old dtype.
data['place_order_time'] = pd.to_datetime(data['place_order_time'])

# Day-of-month via the vectorized .dt accessor instead of a per-row Python loop.
# NOTE(review): grouping on day-of-month merges equal day numbers that belong
# to different months -- correct only if the workbook spans one month; confirm.
data['day'] = data['place_order_time'].dt.day

# Sum the revenue of each day
res = data.groupby(by='day')['money'].sum()
print('res:\n',res)

08-交叉表

import pandas as pd
import numpy as np

# Build the sample student DataFrame
df = pd.DataFrame(
    data={
        "name": ['zs', 'ls', 'ww', 'oo', 'hh', 'jj', 'gg', 'kk'],
        "high": [178.0, 179, 165.5, 160.5, 168.6, 172.0, 182, 189.0],
        "score": [99, 98, 97, 90, 89, 99, 96.5, 88.5],
        'class': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
        'group': ['1', '1', '2', '2', '1', '1', '2', '2']
    },
    index=["stu_1", "stu_2", "stu_3", "stu_4", "stu_5", "stu_6", "stu_7", "stu_8"]
)
print("df:\n", df)
print("df 的类型：\n", type(df))
print('*' * 100)

# Plain cross-tabulation: counts how often each (row, column) value pair occurs
# res = pd.crosstab(
#     index=df['class'],  # row labels
#     columns=df['score']  # column labels
# )

# Group rows by `index` and columns by `columns`, then aggregate `values`
# with `aggfunc`.
# Note: `index` and `columns` are mandatory for a cross-tabulation, and
# `values` / `aggfunc` must be passed together.
# The aggregation is given by name: recent pandas releases emit a
# FutureWarning when a NumPy callable such as np.max is passed instead.
res = pd.crosstab(
    index=df['class'],
    columns=df['group'],
    values=df['score'],
    aggfunc='max'
)
print('res:\n', res)

09-透视表

import pandas as pd
import  numpy as np

# Build the sample student table row by row; rows are labelled stu_1 .. stu_8
students = [
    ('zs', 178.0, 99, 'A', '1'),
    ('ls', 179, 98, 'A', '1'),
    ('ww', 165.5, 97, 'A', '2'),
    ('oo', 160.5, 90, 'A', '2'),
    ('hh', 168.6, 89, 'B', '1'),
    ('jj', 172.0, 99, 'B', '1'),
    ('gg', 182, 96.5, 'B', '2'),
    ('kk', 189.0, 88.5, 'B', '2'),
]
df = pd.DataFrame(
    students,
    columns=['name', 'high', 'score', 'class', 'group'],
    index=[f"stu_{n}" for n in range(1, 9)],
)
print("df:\n", df)
print("df 的类型：\n", type(df))
print("*" * 100)

# 透视表 是一种plus 版本的 分组聚合
# res = df.groupby(by='class')['high'].mean()
# 创建 透视表
# res = pd.pivot_table(
#     data=df,  # 创建透视表 所需要的dataframe
#     index='class',  # 行分组
#     values='high', # 统计指标的主体
# )

# res = df.groupby(by=['class','group'])[['high','score']].mean()
# res = pd.pivot_table(
# #     data=df,  # 创建透视表 所需要的dataframe
# #     index=['class', 'group'],  # 行分组
# #     values=['high', 'score'],  # 统计指标的主体
# # )

#  可以进行分组，列分组，统计多列的指标
# res = pd.pivot_table(
#     data=df,  # 创建透视表 所需要的dataframe
#     index='class',  # 行分组
#     columns='group',  # 列分组
#     values=['high', 'score'],  # 统计指标的主体
#     aggfunc=np.max
# )
# print('res:\n', res)

10-案例：连锁超市数据分析案例

import pandas as pd

# Load the order data.
# 'ansi' is a Windows-only codec alias (the active ANSI code page), so passing
# it raises LookupError on Linux/macOS; name the encoding explicitly instead.
# NOTE(review): assumes the CSV was saved on a Simplified-Chinese Windows
# system (code page 936 / GBK) -- confirm against the actual file.
data = pd.read_csv("./order.csv", encoding='gbk')
print('data:\n', data)
print('data 的列索引：\n', data.columns)

# The '销量' column was found to contain values equal to and below 0;
# treat those rows as anomalies and keep only rows with a positive value.
bool_index = data['销量'] > 0

data = data.loc[bool_index, :]
# 1、哪些类别的商品比较畅销？
# 按照类别 进行分组，求各个类别销量之和，排序
#  sort_values ---排序 ---默认为升序，ascending=False，变为降序
# res = data.groupby(by='类别ID')['销量'].sum().sort_values(ascending=False)
# print('res:\n', res)
# res = pd.pivot_table(
#     data=data,
#     index='类别ID',
#     values='销量',
#     aggfunc=np.sum
# ).sort_values(by='销量',ascending=False)
# print('res:\n',res)

# 2、哪些商品比较畅销？
# res = data.groupby(by='商品ID')['销量'].sum().sort_values(ascending=False)
# print('res:\n',res)
# res = pd.pivot_table(
#     data=data,
#     index='商品ID',
#     values='销量',
#     aggfunc=np.sum,
# ).sort_values(by='销量', ascending=False)
# print('res:\n', res)

# 3、求不同门店的销售额占比
# 计算单个商品的 销售额
# data.loc[:, 'money'] = data.loc[:, '销量'] * data.loc[:, '单价']
#
# # 按照门店编号 进行分组，统计 商品的销售额之和
# res = data.groupby(by='门店编号')['money'].sum()
#
# print(res)
#
# print('各门店的占比为：', (res / res.sum()).apply(lambda x: format(x, '.2%')))


# 4. Which hours of the day are the store's peak traffic periods?
# Keep one row per order ID so every order is counted once
data.drop_duplicates(subset='订单ID', inplace=True)
# Time periods are split by hour

# Convert the transaction time to pandas datetimes. Assigning the whole column
# replaces it, so the converted dtype is kept (writing through
# data.loc[:, col] = ... sets values in place on pandas 2.x).
data['成交时间'] = pd.to_datetime(data['成交时间'])
# Hour of day via the vectorized .dt accessor instead of a per-row Python loop
data['小时'] = data['成交时间'].dt.hour

# Count the orders in each hour and list the busiest hours first
res = data.groupby(by="小时")['订单ID'].count().sort_values(ascending=False)
print('res:\n',res)
# 早上 8、9、10 点 ---老大爷、老大娘 ---出门买菜

# 排序---整个样本跟着 排序那一列 一块移动

return_min

发布了128 篇原创文章 · 获赞 24 · 访问量 4257

私信关注

day05-pandas进阶 课件代码

猜你喜欢

day05-pandas进阶课件代码