import pandas as pd
import numpy as np
df=pd.read_excel('./pandas中的数据分组与透视表/data.xlsx')
df
|
日期 |
名称 |
类别 |
单价 |
数量 |
金额 |
0 |
2018-07-01 |
商品A |
服装 |
20 |
2 |
40 |
1 |
2018-07-02 |
商品B |
服装 |
200 |
3 |
600 |
2 |
2018-07-03 |
商品C |
食品 |
1200 |
4 |
4800 |
3 |
2018-07-04 |
商品A |
服装 |
22 |
5 |
110 |
4 |
2018-07-05 |
商品B |
服装 |
220 |
6 |
1320 |
5 |
2018-07-06 |
商品C |
食品 |
1000 |
7 |
7000 |
6 |
2018-07-07 |
商品A |
服装 |
30 |
3 |
90 |
7 |
2018-07-08 |
商品A |
服装 |
800 |
1 |
800 |
8 |
2018-07-09 |
商品C |
食品 |
1300 |
4 |
5200 |
9 |
2018-07-10 |
商品B |
服装 |
230 |
3 |
690 |
10 |
2018-07-11 |
商品A |
服装 |
28 |
1 |
28 |
grouped=df.groupby('类别')
grouped
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021B20F06888>
for name, data in grouped:
print(name)
print(data)
服装
日期 名称 类别 单价 数量 金额
0 2018-07-01 商品A 服装 20 2 40
1 2018-07-02 商品B 服装 200 3 600
3 2018-07-04 商品A 服装 22 5 110
4 2018-07-05 商品B 服装 220 6 1320
6 2018-07-07 商品A 服装 30 3 90
7 2018-07-08 商品A 服装 800 1 800
9 2018-07-10 商品B 服装 230 3 690
10 2018-07-11 商品A 服装 28 1 28
食品
日期 名称 类别 单价 数量 金额
2 2018-07-03 商品C 食品 1200 4 4800
5 2018-07-06 商品C 食品 1000 7 7000
8 2018-07-09 商品C 食品 1300 4 5200
grouped['数量','金额'].sum()
|
数量 |
金额 |
类别 |
|
|
服装 |
24 |
3678 |
食品 |
15 |
17000 |
grouped['单价'].max()
类别
服装 800
食品 1300
Name: 单价, dtype: int64
grouped=df.groupby(['类别','名称'])
for i,k in grouped:
print(i,k)
('服装', '商品A') 日期 名称 类别 单价 数量 金额
0 2018-07-01 商品A 服装 20 2 40
3 2018-07-04 商品A 服装 22 5 110
6 2018-07-07 商品A 服装 30 3 90
7 2018-07-08 商品A 服装 800 1 800
10 2018-07-11 商品A 服装 28 1 28
('服装', '商品B') 日期 名称 类别 单价 数量 金额
1 2018-07-02 商品B 服装 200 3 600
4 2018-07-05 商品B 服装 220 6 1320
9 2018-07-10 商品B 服装 230 3 690
('食品', '商品C') 日期 名称 类别 单价 数量 金额
2 2018-07-03 商品C 食品 1200 4 4800
5 2018-07-06 商品C 食品 1000 7 7000
8 2018-07-09 商品C 食品 1300 4 5200
grouped['单价'].max()
类别 名称
服装 商品A 800
商品B 230
食品 商品C 1300
Name: 单价, dtype: int64
grouped[['单价']].mean()
|
|
单价 |
类别 |
名称 |
|
服装 |
商品A |
180.000000 |
商品B |
216.666667 |
食品 |
商品C |
1166.666667 |
grouped=df.groupby('类别')
for name, data in grouped:
print(name)
print(data['名称'].unique())
服装
['商品A' '商品B']
食品
['商品C']
time datetime
import time
time.time()
1585909836.4156868
m=time.localtime(1585835029)
l=time.strftime('%Y--%m--%d %H:%M:%S',time.localtime(1585835029))
time.mktime(m)
1585835029.0
from datetime import datetime
datetime.now()
datetime.datetime(2020, 4, 3, 18, 30, 36, 462421)
datetime.now().year
2020
m=datetime.now().strftime('%Y-%m-%d')
type(m)
str
l=datetime.strptime('2020-04-02','%Y-%m-%d')
datetime.fromtimestamp(1585835029.0)
datetime.datetime(2020, 4, 2, 21, 43, 49)
datetime.now().timestamp()
1585909836.51427
time.mktime(m)
1585835029.0
datetime.now().timestamp()
1585910834.269616
delta=datetime.now()-datetime(2018,5,21)
delta.days
683
pamdas 时间序列
pd.date_range('2020-3-1','2020-4-3')
DatetimeIndex(['2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04',
'2020-03-05', '2020-03-06', '2020-03-07', '2020-03-08',
'2020-03-09', '2020-03-10', '2020-03-11', '2020-03-12',
'2020-03-13', '2020-03-14', '2020-03-15', '2020-03-16',
'2020-03-17', '2020-03-18', '2020-03-19', '2020-03-20',
'2020-03-21', '2020-03-22', '2020-03-23', '2020-03-24',
'2020-03-25', '2020-03-26', '2020-03-27', '2020-03-28',
'2020-03-29', '2020-03-30', '2020-03-31', '2020-04-01',
'2020-04-02', '2020-04-03'],
dtype='datetime64[ns]', freq='D')
pd.date_range('2020-3-1','2020-4-3',freq='w')
DatetimeIndex(['2020-03-01', '2020-03-08', '2020-03-15', '2020-03-22',
'2020-03-29'],
dtype='datetime64[ns]', freq='W-SUN')
pd.date_range('2020-3-1',freq='Q',periods=10)
DatetimeIndex(['2020-03-31', '2020-06-30', '2020-09-30', '2020-12-31',
'2021-03-31', '2021-06-30', '2021-09-30', '2021-12-31',
'2022-03-31', '2022-06-30'],
dtype='datetime64[ns]', freq='Q-DEC')
data={
'time':pd.date_range('2019-1-1',freq='T',periods=200000),
'cpu':np.random.randn(200000) + 10
}
df=pd.DataFrame(data,columns= ['time','cpu'])
df
|
time |
cpu |
0 |
2019-01-01 00:00:00 |
10.608832 |
1 |
2019-01-01 00:01:00 |
10.107891 |
2 |
2019-01-01 00:02:00 |
9.502407 |
3 |
2019-01-01 00:03:00 |
10.453344 |
4 |
2019-01-01 00:04:00 |
10.025292 |
... |
... |
... |
199995 |
2019-05-19 21:15:00 |
9.354853 |
199996 |
2019-05-19 21:16:00 |
10.311234 |
199997 |
2019-05-19 21:17:00 |
10.764722 |
199998 |
2019-05-19 21:18:00 |
10.994048 |
199999 |
2019-05-19 21:19:00 |
10.775006 |
200000 rows × 2 columns
df.head()
|
time |
cpu |
0 |
2019-01-01 00:00:00 |
10.608832 |
1 |
2019-01-01 00:01:00 |
10.107891 |
2 |
2019-01-01 00:02:00 |
9.502407 |
3 |
2019-01-01 00:03:00 |
10.453344 |
4 |
2019-01-01 00:04:00 |
10.025292 |
df.tail()
|
time |
cpu |
199995 |
2019-05-19 21:15:00 |
9.354853 |
199996 |
2019-05-19 21:16:00 |
10.311234 |
199997 |
2019-05-19 21:17:00 |
10.764722 |
199998 |
2019-05-19 21:18:00 |
10.994048 |
199999 |
2019-05-19 21:19:00 |
10.775006 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 2 columns):
time 200000 non-null datetime64[ns]
cpu 200000 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 3.1 MB
s=pd.to_datetime(df.time)
df.index=s
df.head()
|
time |
cpu |
time |
|
|
2019-01-01 00:00:00 |
2019-01-01 00:00:00 |
10.608832 |
2019-01-01 00:01:00 |
2019-01-01 00:01:00 |
10.107891 |
2019-01-01 00:02:00 |
2019-01-01 00:02:00 |
9.502407 |
2019-01-01 00:03:00 |
2019-01-01 00:03:00 |
10.453344 |
2019-01-01 00:04:00 |
2019-01-01 00:04:00 |
10.025292 |
df.drop('time',axis=1)
|
cpu |
time |
|
2019-01-01 00:00:00 |
10.608832 |
2019-01-01 00:01:00 |
10.107891 |
2019-01-01 00:02:00 |
9.502407 |
2019-01-01 00:03:00 |
10.453344 |
2019-01-01 00:04:00 |
10.025292 |
... |
... |
2019-05-19 21:15:00 |
9.354853 |
2019-05-19 21:16:00 |
10.311234 |
2019-05-19 21:17:00 |
10.764722 |
2019-05-19 21:18:00 |
10.994048 |
2019-05-19 21:19:00 |
10.775006 |
200000 rows × 1 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 200000 entries, 2019-01-01 00:00:00 to 2019-05-19 21:19:00
Data columns (total 2 columns):
time 200000 non-null datetime64[ns]
cpu 200000 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 9.6 MB
df['2019-02-01 08:00:00':'2019-02-01 09:10:00']
|
time |
cpu |
time |
|
|
2019-02-01 08:00:00 |
2019-02-01 08:00:00 |
10.016300 |
2019-02-01 08:01:00 |
2019-02-01 08:01:00 |
11.500588 |
2019-02-01 08:02:00 |
2019-02-01 08:02:00 |
9.639299 |
2019-02-01 08:03:00 |
2019-02-01 08:03:00 |
9.365297 |
2019-02-01 08:04:00 |
2019-02-01 08:04:00 |
9.016025 |
... |
... |
... |
2019-02-01 09:06:00 |
2019-02-01 09:06:00 |
10.236108 |
2019-02-01 09:07:00 |
2019-02-01 09:07:00 |
10.644256 |
2019-02-01 09:08:00 |
2019-02-01 09:08:00 |
10.251236 |
2019-02-01 09:09:00 |
2019-02-01 09:09:00 |
10.343902 |
2019-02-01 09:10:00 |
2019-02-01 09:10:00 |
11.026916 |
71 rows × 2 columns
df['2019-01-01']
|
time |
cpu |
time |
|
|
2019-01-01 00:00:00 |
2019-01-01 00:00:00 |
10.608832 |
2019-01-01 00:01:00 |
2019-01-01 00:01:00 |
10.107891 |
2019-01-01 00:02:00 |
2019-01-01 00:02:00 |
9.502407 |
2019-01-01 00:03:00 |
2019-01-01 00:03:00 |
10.453344 |
2019-01-01 00:04:00 |
2019-01-01 00:04:00 |
10.025292 |
... |
... |
... |
2019-01-01 23:55:00 |
2019-01-01 23:55:00 |
9.533162 |
2019-01-01 23:56:00 |
2019-01-01 23:56:00 |
10.536986 |
2019-01-01 23:57:00 |
2019-01-01 23:57:00 |
11.243809 |
2019-01-01 23:58:00 |
2019-01-01 23:58:00 |
8.733646 |
2019-01-01 23:59:00 |
2019-01-01 23:59:00 |
9.542733 |
1440 rows × 2 columns
df.groupby(df.index.date).mean()
|
cpu |
2019-01-01 |
9.957422 |
2019-01-02 |
9.935621 |
2019-01-03 |
10.005468 |
2019-01-04 |
9.992855 |
2019-01-05 |
9.995959 |
... |
... |
2019-05-15 |
10.031266 |
2019-05-16 |
9.973673 |
2019-05-17 |
10.045151 |
2019-05-18 |
10.023991 |
2019-05-19 |
9.994609 |
139 rows × 1 columns
df.groupby(df.index.hour).mean()
|
cpu |
time |
|
0 |
10.007654 |
1 |
9.992322 |
2 |
9.987562 |
3 |
9.987208 |
4 |
10.008922 |
5 |
10.022125 |
6 |
9.978566 |
7 |
9.997908 |
8 |
10.012145 |
9 |
9.987106 |
10 |
9.988722 |
11 |
9.990424 |
12 |
10.002575 |
13 |
10.002844 |
14 |
10.001017 |
15 |
10.006581 |
16 |
9.993640 |
17 |
10.015496 |
18 |
10.001964 |
19 |
9.995674 |
20 |
9.972376 |
21 |
9.998824 |
22 |
10.000342 |
23 |
9.998308 |
df.groupby(df.index.week).mean()
|
cpu |
time |
|
1 |
9.977973 |
2 |
9.997662 |
3 |
9.998770 |
4 |
9.994222 |
5 |
9.999088 |
6 |
9.997922 |
7 |
9.995254 |
8 |
10.010069 |
9 |
10.004777 |
10 |
10.007423 |
11 |
10.001689 |
12 |
10.001328 |
13 |
9.999866 |
14 |
9.997371 |
15 |
9.995390 |
16 |
10.003722 |
17 |
9.991836 |
18 |
9.988586 |
19 |
9.979943 |
20 |
10.013066 |
df.resample('5T').mean()
|
cpu |
time |
|
2019-01-01 00:00:00 |
10.139553 |
2019-01-01 00:05:00 |
9.544466 |
2019-01-01 00:10:00 |
10.198618 |
2019-01-01 00:15:00 |
10.135603 |
2019-01-01 00:20:00 |
9.613786 |
... |
... |
2019-05-19 20:55:00 |
10.348141 |
2019-05-19 21:00:00 |
10.104422 |
2019-05-19 21:05:00 |
9.483435 |
2019-05-19 21:10:00 |
9.783999 |
2019-05-19 21:15:00 |
10.439973 |
40000 rows × 1 columns
df.groupby(df.index.date)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021B265E5748>