pandas data analysis (2)

DataFrame data processing and analysis

Part of the data is as follows.
insert image description here
This dataset can be found by searching on Baidu; a sample of it is shown below.
insert image description here

Read data in Excel file

import pandas as pd
# Count ambiguous-width and East Asian wide characters as double width when
# computing display widths, so columns containing Chinese text line up.
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)

# usecols selects the columns to read, either by position or by name.
df = pd.read_excel(
    r'C:\Users\dell\Desktop\超市营业额2.xlsx',
    usecols=['工号', '姓名', '时段', '交易额'],
)
# Print the first 10 rows, followed by an empty line.
print(df.head(10), end='\n\n')

insert image description here

# Read every column of the first worksheet.
# Rows 1, 3 and 5 of the sheet are skipped, and the column at position 1
# supplies the row index labels of the DataFrame.
df = pd.read_excel(
    r'C:\Users\dell\Desktop\超市营业额2.xlsx',
    skiprows=[1, 3, 5],
    index_col=1,
)
print(df.head(10))

insert image description here

Filter data that matches certain criteria

# Read all of the data and keep the default integer index.
df = pd.read_excel(r'C:\Users\dell\Desktop\超市营业额2.xlsx')
# Rows at positions 5 to 10: the slice is half-open, so the stop value is 11.
df.iloc[5:11]

insert image description here

# iloc indexes by integer position.
df.iloc[5]  # the row at position 5

insert image description here

# Rows at positions 3, 5 and 10.
df.iloc[[3, 5, 10]]

insert image description here

# Rows at positions 3, 5 and 10, restricted to the columns at positions 0, 1 and 4.
df.iloc[[3, 5, 10], [0, 1, 4]]

insert image description here

# First 5 rows of the selected columns.
df[['姓名', '时段', '交易额']].head(5)

insert image description here

# Take the first 10 rows, then keep only the selected columns.
df.head(10)[['姓名', '日期', '柜台']]

insert image description here

# Rows labelled 3, 5 and 10, restricted to the named columns.
df.loc[[3, 5, 10], ['姓名', '交易额']]

insert image description here

# Single value: the 姓名 column of the row labelled 3.
df.at[3, '姓名']

'赵六'
# If this raises an error, check whether text from the 柜台 column has ended
# up in the 交易额 column: a few 交易额 cells are empty, so data that was
# copied and pasted by hand may be shifted out of position.
# Rows whose 交易额 is greater than 1700.
print(df[df['交易额'].gt(1700)])

insert image description here

df['交易额'].sum()  # total of the 交易额 column

327257.0
# Total 交易额 for the afternoon shift.
# The original note says the 时段 values in the data use a Chinese
# (full-width) colon.
# NOTE(review): the literal below appears to use ASCII colons; it has to
# match the characters stored in the data exactly - confirm against the file.
df.loc[df['时段'] == '14:00-21:00', '交易额'].sum()

151228.0
# Afternoon-shift records for 张三: both conditions must hold.
df[(df['姓名'] == '张三') & (df['时段'] == '14:00-21:00')]

insert image description here

# Total 交易额 for the 日用品 counter.
df.loc[df['柜台'] == '日用品', '交易额'].sum()

88162.0
# Combined 交易额 of the two employees 张三 and 李四.
df.loc[df['姓名'].isin(['张三', '李四']), '交易额'].sum()

116860.0
# Records whose 交易额 lies in the given range
# (between() includes both end points by default).
df[df['交易额'].between(800, 850)]

insert image description here

View data characteristics and statistics

# Summary statistics of the 交易额 column.
df['交易额'].describe()

insert image description here

# Quartiles of 交易额 (minimum, 25%, 50%, 75% and maximum).
df['交易额'].quantile([0, 0.25, 0.5, 0.75, 1.0])

insert image description here

# Median of the 交易额 column.
df['交易额'].median()

1259.0
# The 3 records with the smallest 交易额.
df.nsmallest(n=3, columns='交易额')

insert image description here

# The 3 records with the largest 交易额.
df.nlargest(n=3, columns='交易额')

insert image description here

# The latest date in the 日期 column.
df['日期'].max()

Timestamp('2019-03-31 00:00:00')
# The smallest employee number in the 工号 column.
df['工号'].min()

1001
# Row label of the first occurrence of the minimum 交易额.
index = df['交易额'].idxmin()
print(index)
# The minimum 交易额 itself, looked up through that label.
print(df.at[index, '交易额'])

76
53.0
# Row label of the first occurrence of the maximum 交易额.
index = df['交易额'].idxmax()
print(index)
# The maximum 交易额 itself, looked up through that label.
print(df.at[index, '交易额'])

105
12100.0

Sort data by different criteria

# Sort by 交易额, then 工号, both in descending order.
df.sort_values(by=['交易额', '工号'], ascending=False)

insert image description here

# Sort by 交易额 descending, breaking ties by 工号 ascending.
df.sort_values(by=['交易额', '工号'], ascending=[False, True])

insert image description here

# Sort by 工号 in ascending order. na_position chooses whether missing
# values go at the start ('first') or at the end ('last').
df.sort_values(by='工号', na_position='last')

insert image description here

# Sort by the 姓名 column in ascending order (the default direction).
# Chinese characters are ordered by their Unicode code points.
df.sort_values(by='姓名')

insert image description here

Summarize employee performance using grouping and aggregation

# Group rows by their index label modulo 5, then sum 交易额 per group.
df.groupby(by=lambda label: label % 5)['交易额'].sum()

insert image description here

# Group by a dict: its keys are matched against the index labels and its
# values become the group names. Only the rows labelled 7 and 35 are mapped.
df.groupby(by={7: '下标为7的行', 35: '下标为35的行'})['交易额'].sum()

insert image description here

# Total 交易额 for each 时段 (shift).
df.groupby('时段')['交易额'].sum()

One row of the data is malformed, but that is harmless here; the important thing is the method.
insert image description here

# Total 交易额 for each 柜台 (counter).
df.groupby('柜台')['交易额'].sum()

insert image description here

# Check whether work is spread evenly across employees by counting the
# non-missing 日期 entries of each one; the result is named 上班次数.
ddf = df.groupby('姓名')['日期'].count().rename('上班次数')
ddf

insert image description here

# Mean 交易额 of each employee, rounded to 2 decimals and sorted ascending.
(
    df.groupby('姓名')['交易额']
    .mean()
    .round(2)
    .sort_values()
)

insert image description here

# Total 交易额 of each employee, converted to integers.
# 交易额 is selected before summing: summing every column first is wasted
# work, and pandas 2.x raises TypeError when sum() reaches a datetime column.
df.groupby(by='姓名')['交易额'].sum().apply(int)

insert image description here

# Median 交易额 of each employee.
df.groupby('姓名')['交易额'].median()

insert image description here

# Rank employees by their median 交易额 (rank 1 = highest median).
# numeric_only=True restricts the median to numeric columns; without it,
# pandas 2.x raises TypeError on the text columns instead of dropping them.
dff = df.groupby(by='姓名').median(numeric_only=True)
dff['排名'] = dff['交易额'].rank(ascending=False)
dff[['交易额', '排名']]

insert image description here

# Total 交易额 of each employee within each 时段 (shift).
df.groupby(['姓名', '时段'])['交易额'].sum()

insert image description here

# Aggregate 时段 and 交易额 per employee, each with its own function:
# 交易额 is summed, while 时段 is replaced by a fixed label.
# The columns are selected with a list ([[...]]): indexing a GroupBy with a
# bare tuple of names was deprecated and raises an error in pandas 2.x.
df.groupby(by=['姓名'])[['时段', '交易额']].aggregate({
    '交易额': ['sum'],
    '时段': lambda x: '各时段累计',
})

insert image description here

# Aggregate selected columns with DataFrame.agg(): several statistics for
# 交易额, and the earliest and latest values for 日期.
df.agg({
    '交易额': ['sum', 'mean', 'min', 'max', 'median'],
    '日期': ['min', 'max'],
})

insert image description here

# Several aggregations of 工号 and 交易额 for each employee.
# The two columns are selected before aggregating: applying 'mean' and
# 'median' to every column first raises TypeError in pandas 2.x when it
# reaches non-numeric columns, and computes results that are thrown away.
df.groupby(by='姓名')[['工号', '交易额']].agg(['max', 'min', 'mean', 'median'])

insert image description here

Guess you like

Origin blog.csdn.net/weixin_46322367/article/details/129388378