本次介绍 Pandas 总结归纳的8节至15节内容
import pandas as pd
import numpy as np
import matplotlib as plt
import math
8. Pandas分组(GroupBy)
ipl_data = { 'Team' : [ 'Riders' , 'Riders' , 'Devils' , 'Devils' , 'Kings' ,
'kings' , 'Kings' , 'Kings' , 'Riders' , 'Royals' , 'Royals' , 'Riders' ] ,
'Rank' : [ 1 , 2 , 2 , 3 , 3 , 4 , 1 , 1 , 2 , 4 , 1 , 2 ] ,
'Year' : [ 2014 , 2015 , 2014 , 2015 , 2014 , 2015 , 2016 , 2017 , 2016 , 2014 , 2015 , 2017 ] ,
'Points' : [ 876 , 789 , 863 , 673 , 741 , 812 , 756 , 788 , 694 , 701 , 804 , 690 ] }
df = pd. DataFrame( ipl_data)
df. groupby( 'Team' ) . groups
df. groupby( [ 'Team' , 'Year' ] ) . groups
df. groupby( 'Year' ) . get_group( 2014 )
{‘Devils’: Int64Index([2, 3], dtype=‘int64’), ‘Kings’: Int64Index([4, 6, 7], dtype=‘int64’), ‘Riders’: Int64Index([0, 1, 8, 11], dtype=‘int64’), ‘Royals’: Int64Index([9, 10], dtype=‘int64’), ‘kings’: Int64Index([5], dtype=‘int64’)}
{(‘Devils’, 2014): Int64Index([2], dtype=‘int64’), (‘Devils’, 2015): Int64Index([3], dtype=‘int64’), (‘Kings’, 2014): Int64Index([4], dtype=‘int64’), (‘Kings’, 2016): Int64Index([6], dtype=‘int64’), (‘Kings’, 2017): Int64Index([7], dtype=‘int64’), (‘Riders’, 2014): Int64Index([0], dtype=‘int64’), (‘Riders’, 2015): Int64Index([1], dtype=‘int64’), (‘Riders’, 2016): Int64Index([8], dtype=‘int64’), (‘Riders’, 2017): Int64Index([11], dtype=‘int64’), (‘Royals’, 2014): Int64Index([9], dtype=‘int64’), (‘Royals’, 2015): Int64Index([10], dtype=‘int64’), (‘kings’, 2015): Int64Index([5], dtype=‘int64’)}
Points
Rank
Team
Year
0
876
1
Riders
2014
2
863
2
Devils
2014
4
741
3
Kings
2014
9
701
4
Royals
2014
df. groupby( 'Year' ) [ 'Points' ] . agg( np. mean)
df. groupby( 'Team' ) . agg( np. size)
df1= df. groupby( 'Team' )
df1[ 'Points' ] . agg( [ np. sum , np. mean, np. std] )
Year 2014 795.25 2015 769.50 2016 725.00 2017 739.00 Name: Points, dtype: float64
Points
Rank
Year
Team
Devils
2
2
2
Kings
3
3
3
Riders
4
4
4
Royals
2
2
2
kings
1
1
1
sum
mean
std
Team
Devils
1536
768.000000
134.350288
Kings
2285
761.666667
24.006943
Riders
3049
762.250000
88.567771
Royals
1505
752.500000
72.831998
kings
812
812.000000
NaN
df. groupby( 'Team' ) . filter ( lambda x: len ( x) >= 4 )
Points
Rank
Team
Year
0
876
1
Riders
2014
1
789
2
Riders
2015
8
694
2
Riders
2016
11
690
2
Riders
2017
9. Pandas合并/连接
pd.merge(left, right, how=‘inner’, on=None, left_on=None, right_on=None,left_index=False, right_index=False, sort=True)
left - 一个DataFrame对象。
right - 另一个DataFrame对象。
on - 列(名称)连接,必须在左和右DataFrame对象中存在(找到)。
left_on - 左侧DataFrame中的列用作键,可以是列名或长度等于DataFrame长度的数组。
right_on - 来自右的DataFrame的列作为键,可以是列名或长度等于DataFrame长度的数组。
left_index - 如果为True,则使用左侧DataFrame中的索引(行标签)作为其连接键。 在具有MultiIndex(分层)的DataFrame的情况下,级别的数量必须与来自右DataFrame的连接键的数量相匹配。
right_index - 与右DataFrame的left_index具有相同的用法。
how - 它是left, right, outer以及inner之中的一个,默认为内inner。 下面将介绍每种方法的用法。
sort - 按照字典顺序通过连接键对结果DataFrame进行排序。默认为True,设置为False时,在很多情况下大大提高性能。
leftdata = pd. DataFrame( {
'id' : [ 1 , 2 , 3 , 4 , 5 ] ,
'Name' : [ 'Alex' , 'Amy' , 'Allen' , 'Alice' , 'Ayoung' ] ,
'subject_id' : [ 'sub1' , 'sub2' , 'sub4' , 'sub6' , 'sub5' ] } )
rightdata = pd. DataFrame(
{ 'id' : [ 1 , 2 , 3 , 4 , 5 ] ,
'Name' : [ 'Billy' , 'Brian' , 'Bran' , 'Bryce' , 'Betty' ] ,
'subject_id' : [ 'sub2' , 'sub4' , 'sub3' , 'sub6' , 'sub5' ] } )
pd. merge( leftdata, rightdata, on= 'id' )
pd. merge( leftdata, rightdata, on= [ 'id' , 'subject_id' ] )
Name_x
id
subject_id_x
Name_y
subject_id_y
0
Alex
1
sub1
Billy
sub2
1
Amy
2
sub2
Brian
sub4
2
Allen
3
sub4
Bran
sub3
3
Alice
4
sub6
Bryce
sub6
4
Ayoung
5
sub5
Betty
sub5
Name_x
id
subject_id
Name_y
0
Alice
4
sub6
Bryce
1
Ayoung
5
sub5
Betty
pd. merge( leftdata, rightdata, on= 'subject_id' , how= 'left' )
pd. merge( leftdata, rightdata, on= 'subject_id' , how= 'right' )
Name_x
id_x
subject_id
Name_y
id_y
0
Alex
1
sub1
NaN
NaN
1
Amy
2
sub2
Billy
1.0
2
Allen
3
sub4
Brian
2.0
3
Alice
4
sub6
Bryce
4.0
4
Ayoung
5
sub5
Betty
5.0
Name_x
id_x
subject_id
Name_y
id_y
0
Amy
2.0
sub2
Billy
1
1
Allen
3.0
sub4
Brian
2
2
Alice
4.0
sub6
Bryce
4
3
Ayoung
5.0
sub5
Betty
5
4
NaN
NaN
sub3
Bran
3
pd. merge( leftdata, rightdata, on= 'subject_id' , how= 'inner' )
pd. merge( leftdata, rightdata, on= 'subject_id' , how= 'outer' )
Name_x
id_x
subject_id
Name_y
id_y
0
Amy
2
sub2
Billy
1
1
Allen
3
sub4
Brian
2
2
Alice
4
sub6
Bryce
4
3
Ayoung
5
sub5
Betty
5
Name_x
id_x
subject_id
Name_y
id_y
0
Alex
1.0
sub1
NaN
NaN
1
Amy
2.0
sub2
Billy
1.0
2
Allen
3.0
sub4
Brian
2.0
3
Alice
4.0
sub6
Bryce
4.0
4
Ayoung
5.0
sub5
Betty
5.0
5
NaN
NaN
sub3
Bran
3.0
10. Pandas级联函数
函数
函数形式
参数解释
concat ( )
pd.concat ( objs,axis=0,join=‘outer’,join_axes=None,ignore_index=False )
bjs - 这是Series,DataFrame或Panel对象的序列或映射 axis - {0,1,…},默认为0,这是连接的轴 join - {‘inner’, ‘outer’},默认inner,如何处理其他轴上的索引,联合的外部和交叉的内部 ignore_index − 布尔值,默认为False。如果指定为True,结果轴将被标记为:0,…,n-1 join_axes - 这是Index对象的列表,用于其他(n-1)轴的特定索引,而不是执行内部/外部集逻辑
append ( )
append ( [object ] )
object - 数据框对象
df1 = pd. DataFrame( { 'Name' : [ 'Alex' , 'Amy' , 'Allen' , 'Alice' , 'Ayoung' ] ,
'subject_id' : [ 'sub1' , 'sub2' , 'sub4' , 'sub6' , 'sub5' ] ,
'Marks_scored' : [ 98 , 90 , 87 , 69 , 78 ] } , index= [ 1 , 2 , 3 , 4 , 5 ] )
df2 = pd. DataFrame( { 'Name' : [ 'Billy' , 'Brian' , 'Bran' , 'Bryce' , 'Betty' ] ,
'subject_id' : [ 'sub2' , 'sub4' , 'sub3' , 'sub6' , 'sub5' ] ,
'Marks_scored' : [ 89 , 80 , 79 , 97 , 88 ] } , index= [ 1 , 2 , 3 , 4 , 5 ] )
pd. concat( [ df1, df2] )
pd. concat( [ df1, df2] , keys= [ 'a' , 'b' ] )
pd. concat( [ df1, df2] , keys= [ 'a' , 'b' ] , ignore_index= True )
Marks_scored
Name
subject_id
1
98
Alex
sub1
2
90
Amy
sub2
3
87
Allen
sub4
4
69
Alice
sub6
5
78
Ayoung
sub5
1
89
Billy
sub2
2
80
Brian
sub4
3
79
Bran
sub3
4
97
Bryce
sub6
5
88
Betty
sub5
Marks_scored
Name
subject_id
a
1
98
Alex
sub1
2
90
Amy
sub2
3
87
Allen
sub4
4
69
Alice
sub6
5
78
Ayoung
sub5
b
1
89
Billy
sub2
2
80
Brian
sub4
3
79
Bran
sub3
4
97
Bryce
sub6
5
88
Betty
sub5
Marks_scored
Name
subject_id
0
98
Alex
sub1
1
90
Amy
sub2
2
87
Allen
sub4
3
69
Alice
sub6
4
78
Ayoung
sub5
5
89
Billy
sub2
6
80
Brian
sub4
7
79
Bran
sub3
8
97
Bryce
sub6
9
88
Betty
sub5
pd. concat( [ df1, df2] , axis= 1 )
df1. append( [ df2, df1, df1[ 1 : 3 ] ] )
Marks_scored
Name
subject_id
Marks_scored
Name
subject_id
1
98
Alex
sub1
89
Billy
sub2
2
90
Amy
sub2
80
Brian
sub4
3
87
Allen
sub4
79
Bran
sub3
4
69
Alice
sub6
97
Bryce
sub6
5
78
Ayoung
sub5
88
Betty
sub5
Marks_scored
Name
subject_id
1
98
Alex
sub1
2
90
Amy
sub2
3
87
Allen
sub4
4
69
Alice
sub6
5
78
Ayoung
sub5
1
89
Billy
sub2
2
80
Brian
sub4
3
79
Bran
sub3
4
97
Bryce
sub6
5
88
Betty
sub5
1
98
Alex
sub1
2
90
Amy
sub2
3
87
Allen
sub4
4
69
Alice
sub6
5
78
Ayoung
sub5
2
90
Amy
sub2
3
87
Allen
sub4
11. Pandas日期时间函数
函数
函数作用
pd.datetime.now( )
用于获取当前的日期和时间
pd.Timestamp( )
创建一个时间戳
pd.to_datetime( [date1,date2,…. ] )
转换为时间戳
pd.date_range( start_date, end_date ,freq=’ ’ ).time
创建一个时间范围
pd.date_range( date, periods=int_number )
创建日期序列
pd.date_range( date, periods=int_number, freq=‘M’ )
更改日期频率
pd.bdate_range( date, periods=int_number )
用来创建商业日期范围,不包括周六和周天
偏移别名
描述说明
偏移别名
描述说明
B
工作日频率
H
小时频率
BQS
商务季度开始频率
MS
月起始频率
D
日历/自然日频率
T, min
分钟的频率
A
年度(年)结束频率
SMS
SMS半开始频率
W
每周频率
S
秒频率
BA
商务年底结束
BMS
商务月开始频率
M
月结束频率
L, ms
毫秒
BAS
商务年度开始频率
Q
季度结束频率
SM
半月结束频率
U, us
微秒
BH
商务时间频率
BQ
商务季度结束频率
SM
半月结束频率
N
纳秒
BH
商务时间频率
BQ
商务季度结束频率
BM
商务月结束频率
QS
季度开始频率
pd. datetime. now( )
pd. Timestamp( '2018-12-20' )
pd. Timestamp( 1588612345 , unit= 's' )
pd. date_range( "12:00" , "23:59" , freq= "H" ) . time
pd. to_datetime( [ 'Jul 31, 2009' , '2019-10-10' , '2009/11/23' , '2019.12.31' , None ] )
datetime.datetime(2018, 12, 21, 14, 20, 25, 4339)
Timestamp(‘2018-12-20 00:00:00’)
Timestamp(‘2020-05-04 17:12:25’)
array([datetime.time(12, 0), datetime.time(13, 0), datetime.time(14, 0), datetime.time(15, 0), datetime.time(16, 0), datetime.time(17, 0), datetime.time(18, 0), datetime.time(19, 0), datetime.time(20, 0), datetime.time(21, 0), datetime.time(22, 0), datetime.time(23, 0)], dtype=object)
DatetimeIndex([‘2009-07-31’, ‘2019-10-10’, ‘2009-11-23’, ‘2019-12-31’, ‘NaT’], dtype=‘datetime64[ns]’, freq=None)
pd. date_range( '2018/11/21' , periods= 5 )
pd. date_range( '2018/11/21' , periods= 5 , freq= 'M' )
pd. bdate_range( '2018/12/21' , periods= 5 )
DatetimeIndex([‘2018-11-21’, ‘2018-11-22’, ‘2018-11-23’, ‘2018-11-24’, ‘2018-11-25’], dtype=‘datetime64[ns]’, freq=‘D’)
DatetimeIndex([‘2018-11-30’, ‘2018-12-31’, ‘2019-01-31’, ‘2019-02-28’, ‘2019-03-31’], dtype=‘datetime64[ns]’, freq=‘M’)
DatetimeIndex([‘2018-12-21’, ‘2018-12-24’, ‘2018-12-25’, ‘2018-12-26’, ‘2018-12-27’], dtype=‘datetime64[ns]’, freq=‘B’)
12. Pandas分类构造函数
函数
函数作用
pd.Categorical(values, categories, ordered)
创建一个类别对象
pd. Series( [ "a" , "b" , "c" , "a" ] , dtype= "category" )
pd. Categorical( [ 'a' , 'b' , 'c' , 'a' , 'b' , 'c' ] , ordered= True )
0 a 1 b 2 c 3 a dtype: category Categories (3, object): [a, b, c]
[a, b, c, a, b, c] Categories (3, object): [a < b < c]
cat = pd. Categorical( [ "a" , "c" , "c" , np. nan] , categories= [ "b" , "a" , "c" ] )
df = pd. DataFrame( { "cat" : cat, "s" : [ "a" , "c" , "c" , np. nan] } )
df. describe( )
df[ "cat" ] . describe( )
cat
s
count
3
3
unique
2
2
top
c
c
freq
2
2
count 3 unique 2 top c freq 2 Name: cat, dtype: object
cat. ordered
cat. categories = [ "Group %s" % g for g in cat. categories]
cat. add_categories( [ 4 ] )
cat. remove_categories( "Group a" )
False
[Group a, Group c, Group c, NaN] Categories (4, object): [Group b, Group a, Group c, 4]
[NaN, Group c, Group c, NaN] Categories (2, object): [Group b, Group c]
13. Pandas可视化
绘图函数
图形
df.plot( )
绘图
df.plot.bar()或plot.barh()
条形图
df.plot.hist()
直方图
df.plot.box()
盒型图
df.plot.area()
面积
df.plot.scatter(x=,y=)
散点图
df.plot.pie( )
饼状图
14. Pandas其他相关函数应用
函数
函数作用
形式
pipe()
可以通过将函数和适当数量的参数作为管道参数来执行自定义函数对DataFrame进行操作
df.piper ( function, perp )
apply()
apply()方法沿DataFrame或Panel的轴应用任意函数,它与描述性统计方法一样,采用可选的轴参数
df.apply(np.mean,axis=1)
reindex()
采用可选参数方法,它是一个填充方法
df.reindex( index= , columns= ) df2.reindex_like( df1, method=‘ffill’ ) pad/ffill - 向前填充值;bfill/backfill - 向后填充值;nearest - 从最近的索引值填充
rename()
允许基于一些映射(字典或者系列)或任意函数来重新标记一个轴
df.rename( index= , columns= )
isnull()
检查缺失值
notnull()
检查缺失值
fillna()
函数可以通过几种方法用非空数据“填充”NA值
df.fillna(number) 用标量值替换NaN df.fillna(method=‘pad’) 填写NA前进和后退method=pad/fill /bfill/backfill
dropna()
如果行内的任何值是NA,那么整个行被排除
df.dropna(axis=0/1 )
def adder ( ele1, ele2) :
return ele1+ ele2
df = pd. DataFrame( np. random. randn( 5 , 3 ) , columns= [ 'col1' , 'col2' , 'col3' ] )
df. pipe( adder, 2 )
col1
col2
col3
0
0.490731
0.926610
2.269177
1
1.154097
2.823656
2.141345
2
1.475614
2.054501
1.195006
3
1.984239
2.726985
2.644600
4
1.836258
2.737405
1.790489
df. apply ( np. mean)
df. apply ( np. mean, axis= 1 )
df. apply ( lambda x: x. max ( ) - x. min ( ) )
col1 -0.611812 col2 0.253831 col3 0.008124 dtype: float64
0 -0.771160 1 0.039699 2 -0.424960 3 0.451941 4 0.121384 dtype: float64
col1 1.493508 col2 1.897046 col3 1.449594 dtype: float64
df1 = pd. DataFrame( np. random. randn( 6 , 3 ) , columns= [ 'col1' , 'col2' , 'col3' ] )
df2 = pd. DataFrame( np. random. randn( 2 , 3 ) , columns= [ 'col1' , 'col2' , 'col3' ] )
df_reindexed = df1. reindex( index= [ 0 , 2 , 3 ] , columns= [ 'col1' , 'col2' ] )
df2. reindex_like( df1)
df2. reindex_like( df1, method= 'ffill' )
df2. reindex_like( df1, method= 'ffill' , limit= 1 )
col1
col2
col3
0
-2.311801
0.782469
0.696058
1
-1.351515
-0.019053
1.087809
2
NaN
NaN
NaN
3
NaN
NaN
NaN
4
NaN
NaN
NaN
5
NaN
NaN
NaN
col1
col2
col3
0
-2.311801
0.782469
0.696058
1
-1.351515
-0.019053
1.087809
2
-1.351515
-0.019053
1.087809
3
-1.351515
-0.019053
1.087809
4
-1.351515
-0.019053
1.087809
5
-1.351515
-0.019053
1.087809
col1
col2
col3
0
-2.311801
0.782469
0.696058
1
-1.351515
-0.019053
1.087809
2
-1.351515
-0.019053
1.087809
3
NaN
NaN
NaN
4
NaN
NaN
NaN
5
NaN
NaN
NaN
df. rename( columns= { 'col1' : 'c1' , 'col2' : 'c2' } , index = { 0 : 'apple' , 1 : 'banana' , 2 : 'durian' } )
c1
c2
col3
apple
-1.509269
-1.073390
0.269177
banana
-0.845903
0.823656
0.141345
durian
-0.524386
0.054501
-0.804994
3
-0.015761
0.726985
0.644600
4
-0.163742
0.737405
-0.209511
df3= df_reindexed. reindex_like( df1)
df3
df3[ 'col1' ] . isnull( )
df3[ 'col1' ] . notnull( )
df3. fillna( 0 )
col1
col2
col3
0
-0.321847
0.838906
NaN
1
NaN
NaN
NaN
2
1.113037
-1.161580
NaN
3
-0.256907
-1.525070
NaN
4
NaN
NaN
NaN
5
NaN
NaN
NaN
0 False 1 True 2 False 3 False 4 True 5 True Name: col1, dtype: bool
0 True 1 False 2 True 3 True 4 False 5 False Name: col1, dtype: bool
col1
col2
col3
0
-0.321847
0.838906
0.0
1
0.000000
0.000000
0.0
2
1.113037
-1.161580
0.0
3
-0.256907
-1.525070
0.0
4
0.000000
0.000000
0.0
5
0.000000
0.000000
0.0
df3. fillna( method= 'pad' )
df3. fillna( method= 'pad' ) . dropna( axis= 1 )
col1
col2
col3
0
-0.321847
0.838906
NaN
1
-0.321847
0.838906
NaN
2
1.113037
-1.161580
NaN
3
-0.256907
-1.525070
NaN
4
-0.256907
-1.525070
NaN
5
-0.256907
-1.525070
NaN
col1
col2
0
-0.321847
0.838906
1
-0.321847
0.838906
2
1.113037
-1.161580
3
-0.256907
-1.525070
4
-0.256907
-1.525070
5
-0.256907
-1.525070
15. Pandas读取外部数据
读取函数
函数作用
pd.read_csv(filename)
导入CSV文档
pd.read_table(filename)
导入分隔的文本文件 (如TSV)
pd.read_excel(filename)
导入Excel文档
pd.read_sql(query, connection_object)
读取SQL 表/数据库
pd.read_json(json_string)
读取JSON格式的字符串, URL或文件.
pd.read_html(url)
解析html URL,字符串或文件,并将表提取到数据框列表
pd.read_clipboard()
获取剪贴板的内容并将其传递给read_table()
写入函数
函数作用
df.to_csv(filename)
写入CSV文件
df.to_excel(filename)
写入Excel文件
df.to_sql(table_name, connection_object)
写入一个SQL表
df.to_json(filename)
写入JSON格式的文件
以上函数具体参数用法自行查阅