文章目录
索引
import numpy as np
import pandas as pd
# 指定索引的Series
s = pd.Series(np.arange(6), index=list('BABCDA'))
s
Out[6]:
B 0
A 1
B 2
C 3
D 4
A 5
dtype: int32
# 无重复索引吗?
s.index.is_unique
Out[7]: False
# 去掉重复的索引为
s.index.unique()
Out[9]: Index(['B', 'A', 'C', 'D'], dtype='object')
# 按索引分组求和
s.groupby(s.index).sum()
Out[11]:
A 6
B 2
C 3
D 4
dtype: int32
# 构建一个二级索引所需参数,元组形式
a = [['a','a','a','b','b','c','c'],['1','2','3','1','2','2','3']]
t = list(zip(*a))
t
Out[15]:
[('a', '1'),
('a', '2'),
('a', '3'),
('b', '1'),
('b', '2'),
('c', '2'),
('c', '3')]
# 构建多级索引
index = pd.MultiIndex.from_tuples(t,names=['level1','level2'])
index
Out[18]:
MultiIndex([('a', '1'),
('a', '2'),
('a', '3'),
('b', '1'),
('b', '2'),
('c', '2'),
('c', '3')],
names=['level1', 'level2'])
# 构建Series使用多级索引
s = pd.Series(np.random.rand(7), index=index)
s
Out[20]:
level1 level2
a 1 0.442665
2 0.864886
3 0.563471
b 1 0.301778
2 0.387837
c 2 0.190012
3 0.901903
dtype: float64
# 通过索引取
s['b':'c']
Out[22]:
level1 level2
b 1 0.301778
2 0.387837
c 2 0.190012
3 0.901903
dtype: float64
s[['a','c']]
Out[23]:
level1 level2
a 1 0.442665
2 0.864886
3 0.563471
c 2 0.190012
3 0.901903
dtype: float64
s[:,'2']
Out[28]:
level1
a 0.864886
b 0.387837
c 0.190012
dtype: float64
DataFrame 多层索引
df = pd.DataFrame(np.random.randint(1, 10, (4, 3)),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns=[['one', 'one', 'two'], ['blue', 'red', 'blue']])
df.index.names = ['row-1', 'row-2']
df.columns.names = ['col-1', 'col-2']
df
one | two | three | four | five | six | |
---|---|---|---|---|---|---|
A | -0.049437 | -0.526499 | 1.780662 | 1.154747 | 2.434957 | -1.579278 |
D | -0.075226 | 0.552163 | -0.462732 | -0.936051 | -0.590041 | 0.484505 |
F | 1.486168 | 0.725907 | 0.598127 | -0.704809 | -2.815687 | -0.062462 |
H | -0.900819 | -0.177751 | -0.232796 | 0.234088 | -1.758574 | 1.255955 |
df2 = df.reindex(index=list('ABCDEFGH'))
df2
one | two | three | four | five | six | |
---|---|---|---|---|---|---|
A | -0.049437 | -0.526499 | 1.780662 | 1.154747 | 2.434957 | -1.579278 |
B | NaN | NaN | NaN | NaN | NaN | NaN |
C | NaN | NaN | NaN | NaN | NaN | NaN |
D | -0.075226 | 0.552163 | -0.462732 | -0.936051 | -0.590041 | 0.484505 |
E | NaN | NaN | NaN | NaN | NaN | NaN |
F | 1.486168 | 0.725907 | 0.598127 | -0.704809 | -2.815687 | -0.062462 |
G | NaN | NaN | NaN | NaN | NaN | NaN |
H | -0.900819 | -0.177751 | -0.232796 | 0.234088 | -1.758574 | 1.255955 |
df.loc['A']['one'] = 100
df
one | two | three | four | five | six | |
---|---|---|---|---|---|---|
A | 100.000000 | -0.526499 | 1.780662 | 1.154747 | 2.434957 | -1.579278 |
D | -0.075226 | 0.552163 | -0.462732 | -0.936051 | -0.590041 | 0.484505 |
F | 1.486168 | 0.725907 | 0.598127 | -0.704809 | -2.815687 | -0.062462 |
H | -0.900819 | -0.177751 | -0.232796 | 0.234088 | -1.758574 | 1.255955 |
分组计算
分组计算三步曲:拆分 -> 应用 -> 合并
拆分:根据什么进行分组?
应用:每个分组进行什么样的计算?
合并:把每个分组的计算结果合并起来。
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
'key2': ['one', 'two', 'one', 'two', 'one'],
'data1': np.random.randint(1, 10, 5),
'data2': np.random.randint(1, 10, 5)})
df
data1 | data2 | key1 | key2 | |
---|---|---|---|---|
0 | 1 | 6 | a | one |
1 | 5 | 9 | a | two |
2 | 4 | 7 | b | one |
3 | 3 | 7 | b | two |
4 | 3 | 5 | a | one |
grouped = df['data1'].groupby(df['key1'])
grouped.mean()
key1
a 3.0
b 3.5
Name: data1, dtype: float64
df['data1'].groupby([df['key1'], df['key2']]).mean()
key1 key2
a one 2
two 5
b one 4
two 3
Name: data1, dtype: float64
df.groupby('key1').mean()
data1 data2
key1
a 3.0 6.666667
b 3.5 7.000000
means = df.groupby(['key1', 'key2']).mean()['data1']
means
key1 key2
a one 2
two 5
b one 4
two 3
Name: data1, dtype: float64
means.unstack()
key2 one two
key1
a 2 5
b 4 3
for name, group in df.groupby('key1'):
    print(name)
    print(group)
a
data1 data2 key1 key2
0 1 6 a one
1 5 9 a two
4 3 5 a one
b
data1 data2 key1 key2
2 4 7 b one
3 3 7 b two
# 转化字典
d = dict(list(df.groupby('key1')))
d
{'a': data1 data2 key1 key2
0 1 6 a one
1 5 9 a two
4 3 5 a one, 'b': data1 data2 key1 key2
2 4 7 b one
3 3 7 b two}
d['a']
data1 data2 key1 key2
0 1 6 a one
1 5 9 a two
4 3 5 a one
按列分组
df.dtypes
data1 int32
data2 int32
key1 object
key2 object
dtype: object
grouped = df.groupby(df.dtypes, axis=1)
dict(list(grouped))
{dtype('int32'): data1 data2
0 1 6
1 5 9
2 4 7
3 3 7
4 3 5, dtype('O'): key1 key2
0 a one
1 a two
2 b one
3 b two
4 a one}
通过字典进行分组
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)),
columns=['a', 'b', 'c', 'd', 'e'],
index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])
df
a b c d e
Alice 7 2 7 9 9
Bob 8 7 7 6 8
Candy 4 2 6 6 5
Dark 6 3 9 4 8
Emily 8 4 9 6 5
df.iloc[1, 1:3] = np.NaN
df
a b c d e
Alice 7 2 7 9 9
Bob 8 NaN NaN 6 8
Candy 4 2 6 6 5
Dark 6 3 9 4 8
Emily 8 4 9 6 5
# 定义字典映射关系
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'orange', 'e': 'blue'}
grouped = df.groupby(mapping, axis=1)
grouped.sum()
blue orange red
Alice 16 9 9
Bob 8 6 8
Candy 11 6 6
Dark 17 4 9
Emily 14 6 12
grouped.count()
blue orange red
Alice 2 1 2
Bob 1 1 1
Candy 2 1 2
Dark 2 1 2
Emily 2 1 2
grouped.size()
blue 2
orange 1
red 2
dtype: int64
通过函数来分组
当函数作为分组依据时,数据表里的每个索引(可以是行索引,也可以是列索引)都会调用一次函数,函数的返回值作为分组的索引,即相同的返回值分在同一组。
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)),
columns=['a', 'b', 'c', 'd', 'e'],
index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])
df
Out[23]:
a b c d e
Alice 7 9 1 9 1
Bob 6 6 7 1 5
Candy 7 8 5 3 8
Dark 3 4 6 8 1
Emily 1 2 2 1 2
def _dummy_group(idx):
print idx
return idx
df.groupby(_dummy_group)
Alice
Bob
Candy
Dark
Emily
Out[24]:
<pandas.core.groupby.DataFrameGroupBy object at 0x07525650>
多级索引数据根据索引级别来分组
columns = pd.MultiIndex.from_arrays([['China', 'USA', 'China', 'USA', 'China'],
['A', 'A', 'B', 'C', 'B']], names=['country', 'index'])
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
df
country China USA China USA China
index A A B C B
0 9 6 9 6 2
1 5 6 1 8 7
2 2 5 4 5 2
3 4 8 9 4 9
4 7 2 9 1 8
# 默认行分组,指定列分组
df.groupby(level='country', axis=1).count()
country China USA
0 3 2
1 3 2
2 3 2
3 3 2
4 3 2
df.groupby(level='country', axis=1).sum()
country China USA
0 20 12
1 13 14
2 8 10
3 22 12
4 24 3
聚合
分组运算,先根据一定规则拆分后的数据,然后对数据进行聚合运算,如前面见到的 mean(), sum() 等就是聚合的例子。聚合时,拆分后的第一个索引指定的数据都会依次传给聚合函数进行运算。最后再把运算结果合并起来,生成最终结果。
聚合函数除了内置的 sum(), min(), max(), mean() 等等之外,还可以自定义聚合函数。自定义聚合函数时,使用 agg() 或 aggregate() 函数。
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
'key2': ['one', 'two', 'one', 'two', 'one'],
'data1': np.random.randint(1, 10, 5),
'data2': np.random.randint(1, 10, 5)})
df
data1 data2 key1 key2
0 9 3 a one
1 3 8 a two
2 9 5 b one
3 8 5 b two
4 9 2 a one
def peak_verbose(s):
    """Return the range (max - min) of `s`, printing its type first.

    The print shows that agg() passes each group in as a pandas Series,
    one call per (group, column) pair.
    """
    print(type(s))  # Python 3 print function (the original used the Python 2 statement form)
    return s.max() - s.min()
def peak(s):
    """Range of the series: largest value minus smallest value."""
    highest = s.max()
    lowest = s.min()
    return highest - lowest
grouped = df.groupby('key1')
grouped.agg(peak_verbose)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
Out[38]:
data1 data2
key1
a 6 6
b 1 0
# 应用多个聚合函数
grouped['data1', 'data2'].agg(['mean', 'std', peak])
data1 data2
mean std peak mean std peak
key1
a 7.0 3.464102 6 4.333333 3.21455 6
b 8.5 0.707107 1 5.000000 0.00000 0
# 给聚合后的列取名
grouped['data1'].agg([('average', 'mean'), ('max-range', peak)])
average max-range
key1
a 7.0 6
b 8.5 1
给不同的列应用不同的聚合函数
使用 dict 作为参数来实现
d = {'data1': ['mean', peak, 'max', 'min'],
'data2': 'sum'}
grouped.agg(d)
data1 | data2 | ||||
---|---|---|---|---|---|
mean | peak | max | min | sum | |
key1 | |||||
a | 7.0 | 6 | 9 | 3 | 13 |
b | 8.5 | 1 | 9 | 8 | 10 |
分组运算和转换
groupby 是特殊的分组运算。更一般的分组运算包括 “拆分 - 应用 - 合并”。这里介绍 transform() 和 apply() 来实现分组运算。
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
'key2': ['one', 'two', 'one', 'two', 'one'],
'data1': np.random.randint(1, 10, 5),
'data2': np.random.randint(1, 10, 5)})
df
Out[44]:
data1 data2 key1 key2
0 4 9 a one
1 5 2 a two
2 1 9 b one
3 3 9 b two
4 1 8 a one
# 给 df 每行都添加一个以 key1 分组后的平均值
k1_mean = df.groupby('key1').mean().add_prefix('mean_')
k1_mean
mean_data1 mean_data2
key1
a 3.333333 6.333333
b 2.000000 9.000000
pd.merge(df, k1_mean, left_on='key1', right_index=True)
data1 data2 key1 key2 mean_data1 mean_data2
0 4 9 a one 3.333333 6.333333
1 5 2 a two 3.333333 6.333333
4 1 8 a one 3.333333 6.333333
2 1 9 b one 2.000000 9.000000
3 3 9 b two 2.000000 9.000000
# 使用 transform 简化处理
k1_mean = df.groupby('key1').transform(np.mean).add_prefix('mean_')
k1_mean
mean_data1 mean_data2
0 3.333333 6.333333
1 3.333333 6.333333
2 2.000000 9.000000
3 2.000000 9.000000
4 3.333333 6.333333
df[k1_mean.columns] = k1_mean
df
data1 data2 key1 key2 mean_data1 mean_data2
0 4 9 a one 3.333333 6.333333
1 5 2 a two 3.333333 6.333333
2 1 9 b one 2.000000 9.000000
3 3 9 b two 2.000000 9.000000
4 1 8 a one 3.333333 6.333333
距平化
与平均值的差异值
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)),
columns=['a', 'b', 'c', 'd', 'e'],
index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])
df
a b c d e
Alice 4 8 1 7 6
Bob 4 4 4 9 7
Candy 6 2 2 4 6
Dark 4 2 1 4 5
Emily 4 3 4 2 4
def demean(s):
    """De-mean the series: subtract the group mean from every element."""
    return s.sub(s.mean())
key = ['one', 'one', 'two', 'one', 'two']
demeaned = df.groupby(key).transform(demean)
demeaned
a b c d e
Alice 0 3.333333 -1 0.333333 0
Bob 0 -0.666667 2 2.333333 1
Candy 1 -0.500000 -1 1.000000 1
Dark 0 -2.666667 -1 -2.666667 -1
Emily -1 0.500000 1 -1.000000 -1
demeaned.groupby(key).mean()
a b c d e
one 0 -2.960595e-16 0 -2.960595e-16 0
two 0 0.000000e+00 0 0.000000e+00 0
apply 函数
我们介绍过 DataFrame 的 apply 函数是逐行或逐列来处理数据。GroupBy 的 apply 函数对每个分组进行计算。
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a', 'a', 'a', 'b', 'b', 'a'],
'key2': ['one', 'two', 'one', 'two', 'one', 'one', 'two', 'one', 'two', 'one'],
'data1': np.random.randint(1, 10, 10),
'data2': np.random.randint(1, 10, 10)})
df
data1 data2 key1 key2
0 3 9 a one
1 5 9 a two
2 7 4 b one
3 7 6 b two
4 9 7 a one
5 3 7 a one
6 3 3 a two
7 4 5 b one
8 8 2 b two
9 7 4 a one
# 根据 column 排序,输出其最大的 n 行数据
def top(df, n=2, column='data1'):
    """Return the `n` rows with the largest values in `column`, descending."""
    ordered = df.sort_values(by=column, ascending=False)
    return ordered.head(n)
top(df, n=5)
data1 data2 key1 key2
4 9 7 a one
8 8 2 b two
2 7 4 b one
3 7 6 b two
9 7 4 a one
df.groupby('key1').apply(top)
data1 data2 key1 key2
key1
a 4 9 7 a one
9 7 4 a one
b 8 8 2 b two
2 7 4 b one
# 传递参数
df.groupby('key1').apply(top, n=3, column='data2')
data1 data2 key1 key2
key1
a 0 3 9 a one
1 5 9 a two
4 9 7 a one
b 3 7 6 b two
7 4 5 b one
2 7 4 b one
# 禁用分组键
df.groupby('key1', group_keys=False).apply(top)
data1 data2 key1 key2
4 9 7 a one
9 7 4 a one
8 8 2 b two
2 7 4 b one
载入数据到 Pandas
- 索引:将一个列或多个列读取出来构成 DataFrame,其中涉及是否从文件中读取索引以及列名
- 类型推断和数据转换:包括用户自定义的转换以及缺失值标记
- 日期解析
- 迭代:针对大文件进行逐块迭代。这个是Pandas和Python原生的csv库的最大区别
- 不规整数据问题:跳过一些行,或注释等等
读取CSV文件,文件内容:
a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
df = pd.read_csv('ex1.csv')
df
# 列索引是文件第一行读出来的,行索引是pandas分配的
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
# 完成相同功能
df = pd.read_table('data/ex1.csv', sep=',')
# 不将第一行作为列索引
pd.read_csv('ex1.csv', header=None)
# 指定列索引
pd.read_csv('ex1.csv', header=None, names=['a','b','c','d','msg'])
# 指定某一列作为行索引
pd.read_csv('data/ex2.csv', header=None, names=['a', 'b', 'c', 'd', 'msg'], index_col='msg')
a b c d
msg
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
# 多层行索引
pd.read_csv('data/ex2.csv', header=None, names=['a', 'b', 'c', 'd', 'msg'], index_col=['msg', 'a'])
b c d
msg a
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
处理不规则的分隔符
数据样式为:
A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
有多个空格和单个空格组成分隔符
# 正则表达式
pd.read_table('data/ex3.csv', sep='\s+')
# pd.read_table('data/ex3.csv', sep=' ')
# pd.read_csv('data/ex3.csv')
A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
注意,pandas自动将第一列作为行索引
缺失值处理
数据样式:
something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
- pandas自动将空值位置和NA位置赋值
NaN
pd.read_csv('data/ex5.csv')
something a b c d message
0 one 1 2 3 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11 12 foo
- 指定需要被识别为NaN的值
pd.read_csv('data/ex5.csv', na_values=['NA', 'NULL', 'foo'])
something a b c d message
0 one 1 2 3 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11 12 NaN
- 针对不同列指定不同的缺失值
pd.read_csv('data/ex5.csv', na_values={'message': ['foo', 'NA'], 'something': ['two']})
something a b c d message
0 one 1 2 3 4 NaN
1 NaN 5 6 NaN 8 world
2 three 9 10 11 12 NaN
逐块读取数据
有10000行数据的文件,读10行,格式如下
pd.read_csv('data/ex6.csv', nrows=10)
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
5 1.817480 0.742273 0.419395 -2.251035 Q
6 -0.776764 0.935518 -0.332872 -1.875641 U
7 -0.913135 1.530624 -0.572657 0.477252 K
8 0.358480 -0.497572 -0.367016 0.507702 S
9 -1.740877 -1.160417 -1.637830 2.172201 G
统计每个 key 出现的次数
tr = pd.read_csv('data/ex6.csv', chunksize=1000)
key_count = pd.Series([], dtype='float64')
for pieces in tr:
key_count = key_count.add(pieces['key'].value_counts(), fill_value=0)
key_count = key_count.sort_values(ascending=False)
key_count[:10]
E 368
X 364
L 346
O 343
Q 340
M 338
J 337
F 335
K 334
H 330
dtype: float64
保存数据到磁盘
df.to_csv('data/ex5_out.csv')
df = pd.read_csv('data/ex5_out.csv')
df
Unnamed: 0 something a b c d message
0 0 one 1 2 3 4 NaN
1 1 two 5 6 NaN 8 world
2 2 three 9 10 11 12 foo
# 直接写入存在问题,会把行索引当作数据再次读入
# 不写索引
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False)
# 不写列名称
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False, header=None)
# 指定分隔符
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False, sep='|')
# 只写出一部分列
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False, columns=['a', 'b', 'message'])
二进制格式
二进制的优点是容量小,读取速度快。缺点是可能在不同版本间不兼容。比如 Pandas 版本升级后,早期版本保存的二进制数据可能无法正确地读出来。
df = pd.read_csv('data/ex1.csv')
df
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
pd.to_pickle(df, 'data/ex1_pickle.bin')
pd.read_pickle('data/ex1_pickle.bin')
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
pd.to_pickle(pd.read_csv('data/ex6.csv'), 'data/ex6_pickle.bin')
其他格式
- HDF5: HDF5是个C语言实现的库,可以高效地读取磁盘上的二进制存储的科学数据。
- Excel文件: pd.read_excel/pd.ExcelFile/pd.ExcelWriter
- JSON: 通过 json 模块转换为字典,再转换为 DataFrame
- SQL 数据库:通过 pd.io.sql 模块来从数据库读取数据
- NoSQL (MongoDB) 数据库:需要结合相应的数据库模块,如 pymongo 。再通过游标把数据读出来,转换为 DataFrame