十九、数据整理(上)
作者:Chris Albon
译者:飞龙
协议:CC BY-NC-SA 4.0
在 Pandas 中通过分组应用函数
import pandas as pd
data = { 'Platoon' : [ 'A' , 'A' , 'A' , 'A' , 'A' , 'A' , 'B' , 'B' , 'B' , 'B' , 'B' , 'C' , 'C' , 'C' , 'C' , 'C' ] ,
'Casualties' : [ 1 , 4 , 5 , 7 , 5 , 5 , 6 , 1 , 4 , 5 , 6 , 7 , 4 , 6 , 4 , 6 ] }
df = pd. DataFrame( data)
df
Casualties
Platoon
0
1
A
1
4
A
2
5
A
3
7
A
4
5
A
5
5
A
6
6
B
7
1
B
8
4
B
9
5
B
10
6
B
11
7
C
12
4
C
13
6
C
14
4
C
15
6
C
df. groupby( 'Platoon' ) [ 'Casualties' ] . apply ( lambda x: x. rolling( center= False , window= 2 ) . mean( ) )
'''
0 NaN
1 2.5
2 4.5
3 6.0
4 6.0
5 5.0
6 NaN
7 3.5
8 2.5
9 4.5
10 5.5
11 NaN
12 5.5
13 5.0
14 5.0
15 5.0
dtype: float64
'''
在 Pandas 中向分组应用操作
import pandas as pd
raw_data = { 'regiment' : [ 'Nighthawks' , 'Nighthawks' , 'Nighthawks' , 'Nighthawks' , 'Dragoons' , 'Dragoons' , 'Dragoons' , 'Dragoons' , 'Scouts' , 'Scouts' , 'Scouts' , 'Scouts' ] ,
'company' : [ '1st' , '1st' , '2nd' , '2nd' , '1st' , '1st' , '2nd' , '2nd' , '1st' , '1st' , '2nd' , '2nd' ] ,
'name' : [ 'Miller' , 'Jacobson' , 'Ali' , 'Milner' , 'Cooze' , 'Jacon' , 'Ryaner' , 'Sone' , 'Sloan' , 'Piger' , 'Riani' , 'Ali' ] ,
'preTestScore' : [ 4 , 24 , 31 , 2 , 3 , 4 , 24 , 31 , 2 , 3 , 2 , 3 ] ,
'postTestScore' : [ 25 , 94 , 57 , 62 , 70 , 25 , 94 , 57 , 62 , 70 , 62 , 70 ] }
df = pd. DataFrame( raw_data, columns = [ 'regiment' , 'company' , 'name' , 'preTestScore' , 'postTestScore' ] )
df
regiment
company
name
preTestScore
postTestScore
0
Nighthawks
1st
Miller
4
25
1
Nighthawks
1st
Jacobson
24
94
2
Nighthawks
2nd
Ali
31
57
3
Nighthawks
2nd
Milner
2
62
4
Dragoons
1st
Cooze
3
70
5
Dragoons
1st
Jacon
4
25
6
Dragoons
2nd
Ryaner
24
94
7
Dragoons
2nd
Sone
31
57
8
Scouts
1st
Sloan
2
62
9
Scouts
1st
Piger
3
70
10
Scouts
2nd
Riani
2
62
11
Scouts
2nd
Ali
3
70
groupby_regiment = df[ 'preTestScore' ] . groupby( df[ 'regiment' ] )
groupby_regiment
“这个分组变量现在是GroupBy
对象。 除了分组的键df ['key1']
的一些中间数据之外,它实际上还没有计算任何东西。 我们的想法是,该对象具有将所有操作应用于每个分组所需的所有信息。” – PyDA
使用list()
显示分组的样子。
list ( df[ 'preTestScore' ] . groupby( df[ 'regiment' ] ) )
'''
[('Dragoons', 4 3
5 4
6 24
7 31
Name: preTestScore, dtype: int64), ('Nighthawks', 0 4
1 24
2 31
3 2
Name: preTestScore, dtype: int64), ('Scouts', 8 2
9 3
10 2
11 3
Name: preTestScore, dtype: int64)]
'''
df[ 'preTestScore' ] . groupby( df[ 'regiment' ] ) . describe( )
count
mean
std
min
25%
50%
75%
max
regiment
Dragoons
4.0
15.50
14.153916
3.0
3.75
14.0
25.75
31.0
Nighthawks
4.0
15.25
14.453950
2.0
3.50
14.0
25.75
31.0
Scouts
4.0
2.50
0.577350
2.0
2.00
2.5
3.00
3.0
groupby_regiment. mean( )
'''
regiment
Dragoons 15.50
Nighthawks 15.25
Scouts 2.50
Name: preTestScore, dtype: float64
'''
df[ 'preTestScore' ] . groupby( [ df[ 'regiment' ] , df[ 'company' ] ] ) . mean( )
'''
regiment company
Dragoons 1st 3.5
2nd 27.5
Nighthawks 1st 14.0
2nd 16.5
Scouts 1st 2.5
2nd 2.5
Name: preTestScore, dtype: float64
'''
df[ 'preTestScore' ] . groupby( [ df[ 'regiment' ] , df[ 'company' ] ] ) . mean( ) . unstack( )
company
1st
2nd
regiment
Dragoons
3.5
27.5
Nighthawks
14.0
16.5
Scouts
2.5
2.5
df. groupby( [ 'regiment' , 'company' ] ) . mean( )
preTestScore
postTestScore
regiment
company
Dragoons
1st
3.5
47.5
2nd
27.5
75.5
Nighthawks
1st
14.0
59.5
2nd
16.5
59.5
Scouts
1st
2.5
66.0
2nd
2.5
66.0
df. groupby( [ 'regiment' , 'company' ] ) . size( )
'''
regiment company
Dragoons 1st 2
2nd 2
Nighthawks 1st 2
2nd 2
Scouts 1st 2
2nd 2
dtype: int64
'''
for name, group in df. groupby( 'regiment' ) :
print ( name)
print ( group)
'''
Dragoons
regiment company name preTestScore postTestScore
4 Dragoons 1st Cooze 3 70
5 Dragoons 1st Jacon 4 25
6 Dragoons 2nd Ryaner 24 94
7 Dragoons 2nd Sone 31 57
Nighthawks
regiment company name preTestScore postTestScore
0 Nighthawks 1st Miller 4 25
1 Nighthawks 1st Jacobson 24 94
2 Nighthawks 2nd Ali 31 57
3 Nighthawks 2nd Milner 2 62
Scouts
regiment company name preTestScore postTestScore
8 Scouts 1st Sloan 2 62
9 Scouts 1st Piger 3 70
10 Scouts 2nd Riani 2 62
11 Scouts 2nd Ali 3 70
'''
按列分组:
特别是在这种情况下:按列对数据类型(即axis = 1
)分组,然后使用list()
查看该分组的外观。
list ( df. groupby( df. dtypes, axis= 1 ) )
'' '
[ ( dtype( 'int64' ) , preTestScore postTestScore
0 4 25
1 24 94
2 31 57
3 2 62
4 3 70
5 4 25
6 24 94
7 31 57
8 2 62
9 3 70
10 2 62
11 3 70 ) ,
( dtype( 'O' ) , regiment company name
0 Nighthawks 1st Miller
1 Nighthawks 1st Jacobson
2 Nighthawks 2nd Ali
3 Nighthawks 2nd Milner
4 Dragoons 1st Cooze
5 Dragoons 1st Jacon
6 Dragoons 2nd Ryaner
7 Dragoons 2nd Sone
8 Scouts 1st Sloan
9 Scouts 1st Piger
10 Scouts 2nd Riani
11 Scouts 2nd Ali) ]
df. groupby( 'regiment' ) . mean( ) . add_prefix( 'mean_' )
mean_preTestScore
mean_postTestScore
regiment
Dragoons
15.50
61.5
Nighthawks
15.25
59.5
Scouts
2.50
66.0
def get_stats ( group) :
return { 'min' : group. min ( ) , 'max' : group. max ( ) , 'count' : group. count( ) , 'mean' : group. mean( ) }
bins = [ 0 , 25 , 50 , 75 , 100 ]
group_names = [ 'Low' , 'Okay' , 'Good' , 'Great' ]
df[ 'categories' ] = pd. cut( df[ 'postTestScore' ] , bins, labels= group_names)
df[ 'postTestScore' ] . groupby( df[ 'categories' ] ) . apply ( get_stats) . unstack( )
count
max
mean
min
categories
Good
8.0
70.0
63.75
57.0
Great
2.0
94.0
94.00
94.0
Low
2.0
25.0
25.00
25.0
Okay
0.0
NaN
NaN
NaN
在 Pandas 数据帧上应用操作
import pandas as pd
import numpy as np
data = { 'name' : [ 'Jason' , 'Molly' , 'Tina' , 'Jake' , 'Amy' ] ,
'year' : [ 2012 , 2012 , 2013 , 2014 , 2014 ] ,
'reports' : [ 4 , 24 , 31 , 2 , 3 ] ,
'coverage' : [ 25 , 94 , 57 , 62 , 70 ] }
df = pd. DataFrame( data, index = [ 'Cochice' , 'Pima' , 'Santa Cruz' , 'Maricopa' , 'Yuma' ] )
df
coverage
name
reports
year
Cochice
25
Jason
4
2012
Pima
94
Molly
24
2012
Santa Cruz
57
Tina
31
2013
Maricopa
62
Jake
2
2014
Yuma
70
Amy
3
2014
capitalizer = lambda x: x. upper( )
将capitalizer
函数应用于name
列。
apply()
可以沿数据帧的任意轴应用函数。
df[ 'name' ] . apply ( capitalizer)
'''
Cochice JASON
Pima MOLLY
Santa Cruz TINA
Maricopa JAKE
Yuma AMY
Name: name, dtype: object
'''
将capitalizer
lambda 函数映射到序列name
中的每个元素。
map()
对序列的每个元素应用操作。
df[ 'name' ] . map ( capitalizer)
'''
Cochice JASON
Pima MOLLY
Santa Cruz TINA
Maricopa JAKE
Yuma AMY
Name: name, dtype: object
'''
将平方根函数应用于整个数据帧中的每个单元格。
applymap()
将函数应用于整个数据帧中的每个元素。
df = df. drop( 'name' , axis= 1 )
df. applymap( np. sqrt)
coverage
reports
year
Cochice
5.000000
2.000000
44.855323
Pima
9.695360
4.898979
44.855323
Santa Cruz
7.549834
5.567764
44.866469
Maricopa
7.874008
1.414214
44.877611
Yuma
8.366600
1.732051
44.877611
在数据帧上应用函数。
def times100 ( x) :
if type ( x) is str :
return x
elif x:
return 100 * x
else :
return
df. applymap( times100)
coverage
reports
year
Cochice
2500
400
201200
Pima
9400
2400
201200
Santa Cruz
5700
3100
201300
Maricopa
6200
200
201400
Yuma
7000
300
201400
向 Pandas 数据帧赋予新列
import pandas as pd
df = pd. DataFrame( )
df[ 'name' ] = [ 'John' , 'Steve' , 'Sarah' ]
df
name
0
John
1
Steve
2
Sarah
df. assign( age = [ 31 , 32 , 19 ] )
name
age
0
John
31
1
Steve
32
2
Sarah
19
将列表拆分为大小为 N 的分块
在这个片段中,我们接受一个列表并将其分解为大小为 n 的块。 在处理具有最大请求大小的 API 时,这是一种非常常见的做法。
这个漂亮的函数由 Ned Batchelder 贡献,发布于 StackOverflow 。
first_names = [ 'Steve' , 'Jane' , 'Sara' , 'Mary' , 'Jack' , 'Bob' , 'Bily' , 'Boni' , 'Chris' , 'Sori' , 'Will' , 'Won' , 'Li' ]
def chunks ( l, n) :
for i in range ( 0 , len ( l) , n) :
yield l[ i: i+ n]
list ( chunks( first_names, 5 ) )
'''
[['Steve', 'Jane', 'Sara', 'Mary', 'Jack'],
['Bob', 'Bily', 'Boni', 'Chris', 'Sori'],
['Will', 'Won', 'Li']]
'''
在 Pandas 中使用正则表达式将字符串分解为列
import re
import pandas as pd
data = { 'raw' : [ 'Arizona 1 2014-12-23 3242.0' ,
'Iowa 1 2010-02-23 3453.7' ,
'Oregon 0 2014-06-20 2123.0' ,
'Maryland 0 2014-03-14 1123.6' ,
'Florida 1 2013-01-15 2134.0' ,
'Georgia 0 2012-07-14 2345.6' ] }
df = pd. DataFrame( data, columns = [ 'raw' ] )
df
raw
0
Arizona 1 2014-12-23 3242.0
1
Iowa 1 2010-02-23 3453.7
2
Oregon 0 2014-06-20 2123.0
3
Maryland 0 2014-03-14 1123.6
4
Florida 1 2013-01-15 2134.0
5
Georgia 0 2012-07-14 2345.6
df[ 'raw' ] . str . contains( '....-..-..' , regex= True )
'''
0 True
1 True
2 True
3 True
4 True
5 True
Name: raw, dtype: bool
'''
df[ 'female' ] = df[ 'raw' ] . str . extract( '(\d)' , expand= True )
df[ 'female' ]
'''
0 1
1 1
2 0
3 0
4 1
5 0
Name: female, dtype: object
'''
df[ 'date' ] = df[ 'raw' ] . str . extract( '(....-..-..)' , expand= True )
df[ 'date' ]
'''
0 2014-12-23
1 2010-02-23
2 2014-06-20
3 2014-03-14
4 2013-01-15
5 2012-07-14
Name: date, dtype: object
'''
df[ 'score' ] = df[ 'raw' ] . str . extract( '(\d\d\d\d\.\d)' , expand= True )
df[ 'score' ]
'''
0 3242.0
1 3453.7
2 2123.0
3 1123.6
4 2134.0
5 2345.6
Name: score, dtype: object
'''
df[ 'state' ] = df[ 'raw' ] . str . extract( '([A-Z]\w{0,})' , expand= True )
df[ 'state' ]
'''
0 Arizona
1 Iowa
2 Oregon
3 Maryland
4 Florida
5 Georgia
Name: state, dtype: object
'''
df
raw
female
date
score
state
0
Arizona 1 2014-12-23 3242.0
1
2014-12-23
3242.0
Arizona
1
Iowa 1 2010-02-23 3453.7
1
2010-02-23
3453.7
Iowa
2
Oregon 0 2014-06-20 2123.0
0
2014-06-20
2123.0
Oregon
3
Maryland 0 2014-03-14 1123.6
0
2014-03-14
1123.6
Maryland
4
Florida 1 2013-01-15 2134.0
1
2013-01-15
2134.0
Florida
5
Georgia 0 2012-07-14 2345.6
0
2012-07-14
2345.6
Georgia
由两个数据帧贡献列
import pandas as pd
dataframe_one = pd. DataFrame( )
dataframe_one[ '1' ] = [ '1' , '1' , '1' ]
dataframe_one[ 'B' ] = [ 'b' , 'b' , 'b' ]
dataframe_two = pd. DataFrame( )
dataframe_two[ '2' ] = [ '2' , '2' , '2' ]
dataframe_two[ 'B' ] = [ 'b' , 'b' , 'b' ]
set . intersection( set ( dataframe_one) , set ( dataframe_two) )
从多个列表构建字典
officer_names = [ 'Sodoni Dogla' , 'Chris Jefferson' , 'Jessica Billars' , 'Michael Mulligan' , 'Steven Johnson' ]
officer_armies = [ 'Purple Army' , 'Orange Army' , 'Green Army' , 'Red Army' , 'Blue Army' ]
dict ( zip ( officer_names, officer_armies) )
'''
{'Chris Jefferson': 'Orange Army',
'Jessica Billars': 'Green Army',
'Michael Mulligan': 'Red Army',
'Sodoni Dogla': 'Purple Army',
'Steven Johnson': 'Blue Army'}
'''
将 CSV 转换为 Python 代码来重建它
import pandas as pd
df_original = pd. read_csv( 'http://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv' )
df = pd. read_csv( 'http://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv' )
print ( '==============================' )
print ( 'RUN THE CODE BELOW THIS LINE' )
print ( '==============================' )
print ( 'raw_data =' , df. to_dict( orient= 'list' ) )
print ( 'df = pd.DataFrame(raw_data, columns = ' + str ( list ( df_original) ) + ')' )
'''
==============================
RUN THE CODE BELOW THIS LINE
==============================
raw_data = {'Sepal.Length': [5.0999999999999996, 4.9000000000000004, 4.7000000000000002, 4.5999999999999996, 5.0, 5.4000000000000004, 4.5999999999999996, 5.0, 4.4000000000000004, 4.9000000000000004, 5.4000000000000004, 4.7999999999999998, 4.7999999999999998, 4.2999999999999998, 5.7999999999999998, 5.7000000000000002, 5.4000000000000004, 5.0999999999999996, 5.7000000000000002, 5.0999999999999996, 5.4000000000000004, 5.0999999999999996, 4.5999999999999996, 5.0999999999999996, 4.7999999999999998, 5.0, 5.0, 5.2000000000000002, 5.2000000000000002, 4.7000000000000002, 4.7999999999999998, 5.4000000000000004, 5.2000000000000002, 5.5, 4.9000000000000004, 5.0, 5.5, 4.9000000000000004, 4.4000000000000004, 5.0999999999999996, 5.0, 4.5, 4.4000000000000004, 5.0, 5.0999999999999996, 4.7999999999999998, 5.0999999999999996, 4.5999999999999996, 5.2999999999999998, 5.0, 7.0, 6.4000000000000004, 6.9000000000000004, 5.5, 6.5, 5.7000000000000002, 6.2999999999999998, 4.9000000000000004, 6.5999999999999996, 5.2000000000000002, 5.0, 5.9000000000000004, 6.0, 6.0999999999999996, 5.5999999999999996, 6.7000000000000002, 5.5999999999999996, 5.7999999999999998, 6.2000000000000002, 5.5999999999999996, 5.9000000000000004, 6.0999999999999996, 6.2999999999999998, 6.0999999999999996, 6.4000000000000004, 6.5999999999999996, 6.7999999999999998, 6.7000000000000002, 6.0, 5.7000000000000002, 5.5, 5.5, 5.7999999999999998, 6.0, 5.4000000000000004, 6.0, 6.7000000000000002, 6.2999999999999998, 5.5999999999999996, 5.5, 5.5, 6.0999999999999996, 5.7999999999999998, 5.0, 5.5999999999999996, 5.7000000000000002, 5.7000000000000002, 6.2000000000000002, 5.0999999999999996, 5.7000000000000002, 6.2999999999999998, 5.7999999999999998, 7.0999999999999996, 6.2999999999999998, 6.5, 7.5999999999999996, 4.9000000000000004, 7.2999999999999998, 6.7000000000000002, 7.2000000000000002, 6.5, 6.4000000000000004, 6.7999999999999998, 5.7000000000000002, 5.7999999999999998, 6.4000000000000004, 6.5, 7.7000000000000002, 7.7000000000000002, 6.0, 6.9000000000000004, 5.5999999999999996, 7.7000000000000002, 6.2999999999999998, 6.7000000000000002, 7.2000000000000002, 6.2000000000000002, 6.0999999999999996, 6.4000000000000004, 7.2000000000000002, 7.4000000000000004, 7.9000000000000004, 6.4000000000000004, 6.2999999999999998, 6.0999999999999996, 7.7000000000000002, 6.2999999999999998, 6.4000000000000004, 6.0, 6.9000000000000004, 6.7000000000000002, 6.9000000000000004, 5.7999999999999998, 6.7999999999999998, 6.7000000000000002, 6.7000000000000002, 6.2999999999999998, 6.5, 6.2000000000000002, 5.9000000000000004], 'Petal.Width': [0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.40000000000000002, 0.29999999999999999, 0.20000000000000001, 0.20000000000000001, 0.10000000000000001, 0.20000000000000001, 0.20000000000000001, 0.10000000000000001, 0.10000000000000001, 0.20000000000000001, 0.40000000000000002, 0.40000000000000002, 0.29999999999999999, 0.29999999999999999, 0.29999999999999999, 0.20000000000000001, 0.40000000000000002, 0.20000000000000001, 0.5, 0.20000000000000001, 0.20000000000000001, 0.40000000000000002, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.40000000000000002, 0.10000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.10000000000000001, 0.20000000000000001, 0.20000000000000001, 0.29999999999999999, 0.29999999999999999, 0.20000000000000001, 0.59999999999999998, 0.40000000000000002, 0.29999999999999999, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 1.3999999999999999, 1.5, 1.5, 1.3, 1.5, 1.3, 1.6000000000000001, 1.0, 1.3, 1.3999999999999999, 1.0, 1.5, 1.0, 1.3999999999999999, 1.3, 1.3999999999999999, 1.5, 1.0, 1.5, 1.1000000000000001, 1.8, 1.3, 1.5, 1.2, 1.3, 1.3999999999999999, 1.3999999999999999, 1.7, 1.5, 1.0, 1.1000000000000001, 1.0, 1.2, 1.6000000000000001, 1.5, 1.6000000000000001, 1.5, 1.3, 1.3, 1.3, 1.2, 1.3999999999999999, 1.2, 1.0, 1.3, 1.2, 1.3, 1.3, 1.1000000000000001, 1.3, 2.5, 1.8999999999999999, 2.1000000000000001, 1.8, 2.2000000000000002, 2.1000000000000001, 1.7, 1.8, 1.8, 2.5, 2.0, 1.8999999999999999, 2.1000000000000001, 2.0, 2.3999999999999999, 2.2999999999999998, 1.8, 2.2000000000000002, 2.2999999999999998, 1.5, 2.2999999999999998, 2.0, 2.0, 1.8, 2.1000000000000001, 1.8, 1.8, 1.8, 2.1000000000000001, 1.6000000000000001, 1.8999999999999999, 2.0, 2.2000000000000002, 1.5, 1.3999999999999999, 2.2999999999999998, 2.3999999999999999, 1.8, 1.8, 2.1000000000000001, 2.3999999999999999, 2.2999999999999998, 1.8999999999999999, 2.2999999999999998, 2.5, 2.2999999999999998, 1.8999999999999999, 2.0, 2.2999999999999998, 1.8], 'Petal.Length': [1.3999999999999999, 1.3999999999999999, 1.3, 1.5, 1.3999999999999999, 1.7, 1.3999999999999999, 1.5, 1.3999999999999999, 1.5, 1.5, 1.6000000000000001, 1.3999999999999999, 1.1000000000000001, 1.2, 1.5, 1.3, 1.3999999999999999, 1.7, 1.5, 1.7, 1.5, 1.0, 1.7, 1.8999999999999999, 1.6000000000000001, 1.6000000000000001, 1.5, 1.3999999999999999, 1.6000000000000001, 1.6000000000000001, 1.5, 1.5, 1.3999999999999999, 1.5, 1.2, 1.3, 1.3999999999999999, 1.3, 1.5, 1.3, 1.3, 1.3, 1.6000000000000001, 1.8999999999999999, 1.3999999999999999, 1.6000000000000001, 1.3999999999999999, 1.5, 1.3999999999999999, 4.7000000000000002, 4.5, 4.9000000000000004, 4.0, 4.5999999999999996, 4.5, 4.7000000000000002, 3.2999999999999998, 4.5999999999999996, 3.8999999999999999, 3.5, 4.2000000000000002, 4.0, 4.7000000000000002, 3.6000000000000001, 4.4000000000000004, 4.5, 4.0999999999999996, 4.5, 3.8999999999999999, 4.7999999999999998, 4.0, 4.9000000000000004, 4.7000000000000002, 4.2999999999999998, 4.4000000000000004, 4.7999999999999998, 5.0, 4.5, 3.5, 3.7999999999999998, 3.7000000000000002, 3.8999999999999999, 5.0999999999999996, 4.5, 4.5, 4.7000000000000002, 4.4000000000000004, 4.0999999999999996, 4.0, 4.4000000000000004, 4.5999999999999996, 4.0, 3.2999999999999998, 4.2000000000000002, 4.2000000000000002, 4.2000000000000002, 4.2999999999999998, 3.0, 4.0999999999999996, 6.0, 5.0999999999999996, 5.9000000000000004, 5.5999999999999996, 5.7999999999999998, 6.5999999999999996, 4.5, 6.2999999999999998, 5.7999999999999998, 6.0999999999999996, 5.0999999999999996, 5.2999999999999998, 5.5, 5.0, 5.0999999999999996, 5.2999999999999998, 5.5, 6.7000000000000002, 6.9000000000000004, 5.0, 5.7000000000000002, 4.9000000000000004, 6.7000000000000002, 4.9000000000000004, 5.7000000000000002, 6.0, 4.7999999999999998, 4.9000000000000004, 5.5999999999999996, 5.7999999999999998, 6.0999999999999996, 6.4000000000000004, 5.5999999999999996, 5.0999999999999996, 5.5999999999999996, 6.0999999999999996, 5.5999999999999996, 5.5, 4.7999999999999998, 5.4000000000000004, 5.5999999999999996, 5.0999999999999996, 5.0999999999999996, 5.9000000000000004, 5.7000000000000002, 5.2000000000000002, 5.0, 5.2000000000000002, 5.4000000000000004, 5.0999999999999996], 'Species': ['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica'], 'Sepal.Width': [3.5, 3.0, 3.2000000000000002, 3.1000000000000001, 3.6000000000000001, 3.8999999999999999, 3.3999999999999999, 3.3999999999999999, 2.8999999999999999, 3.1000000000000001, 3.7000000000000002, 3.3999999999999999, 3.0, 3.0, 4.0, 4.4000000000000004, 3.8999999999999999, 3.5, 3.7999999999999998, 3.7999999999999998, 3.3999999999999999, 3.7000000000000002, 3.6000000000000001, 3.2999999999999998, 3.3999999999999999, 3.0, 3.3999999999999999, 3.5, 3.3999999999999999, 3.2000000000000002, 3.1000000000000001, 3.3999999999999999, 4.0999999999999996, 4.2000000000000002, 3.1000000000000001, 3.2000000000000002, 3.5, 3.6000000000000001, 3.0, 3.3999999999999999, 3.5, 2.2999999999999998, 3.2000000000000002, 3.5, 3.7999999999999998, 3.0, 3.7999999999999998, 3.2000000000000002, 3.7000000000000002, 3.2999999999999998, 3.2000000000000002, 3.2000000000000002, 3.1000000000000001, 2.2999999999999998, 2.7999999999999998, 2.7999999999999998, 3.2999999999999998, 2.3999999999999999, 2.8999999999999999, 2.7000000000000002, 2.0, 3.0, 2.2000000000000002, 2.8999999999999999, 2.8999999999999999, 3.1000000000000001, 3.0, 2.7000000000000002, 2.2000000000000002, 2.5, 3.2000000000000002, 2.7999999999999998, 2.5, 2.7999999999999998, 2.8999999999999999, 3.0, 2.7999999999999998, 3.0, 2.8999999999999999, 2.6000000000000001, 2.3999999999999999, 2.3999999999999999, 2.7000000000000002, 2.7000000000000002, 3.0, 3.3999999999999999, 3.1000000000000001, 2.2999999999999998, 3.0, 2.5, 2.6000000000000001, 3.0, 2.6000000000000001, 2.2999999999999998, 2.7000000000000002, 3.0, 2.8999999999999999, 2.8999999999999999, 2.5, 2.7999999999999998, 3.2999999999999998, 2.7000000000000002, 3.0, 2.8999999999999999, 3.0, 3.0, 2.5, 2.8999999999999999, 2.5, 3.6000000000000001, 3.2000000000000002, 2.7000000000000002, 3.0, 2.5, 2.7999999999999998, 3.2000000000000002, 3.0, 3.7999999999999998, 2.6000000000000001, 2.2000000000000002, 3.2000000000000002, 2.7999999999999998, 2.7999999999999998, 2.7000000000000002, 3.2999999999999998, 3.2000000000000002, 2.7999999999999998, 3.0, 2.7999999999999998, 3.0, 2.7999999999999998, 3.7999999999999998, 2.7999999999999998, 2.7999999999999998, 2.6000000000000001, 3.0, 3.3999999999999999, 3.1000000000000001, 3.0, 3.1000000000000001, 3.1000000000000001, 3.1000000000000001, 2.7000000000000002, 3.2000000000000002, 3.2999999999999998, 3.0, 2.5, 3.0, 3.3999999999999999, 3.0], 'Unnamed: 0': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150]}
'''
df = pd. DataFrame( raw_data, columns = [ 'Unnamed: 0' , 'Sepal.Length' , 'Sepal.Width' , 'Petal.Length' , 'Petal.Width' , 'Species' ] )
raw_data = { 'Petal.Width' : [ 0.20000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.40000000000000002 , 0.29999999999999999 , 0.20000000000000001 , 0.20000000000000001 , 0.10000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.10000000000000001 , 0.10000000000000001 , 0.20000000000000001 , 0.40000000000000002 , 0.40000000000000002 , 0.29999999999999999 , 0.29999999999999999 , 0.29999999999999999 , 0.20000000000000001 , 0.40000000000000002 , 0.20000000000000001 , 0.5 , 0.20000000000000001 , 0.20000000000000001 , 0.40000000000000002 , 0.20000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.40000000000000002 , 0.10000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.10000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.29999999999999999 , 0.29999999999999999 , 0.20000000000000001 , 0.59999999999999998 , 0.40000000000000002 , 0.29999999999999999 , 0.20000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 0.20000000000000001 , 1.3999999999999999 , 1.5 , 1.5 , 1.3 , 1.5 , 1.3 , 1.6000000000000001 , 1.0 , 1.3 , 1.3999999999999999 , 1.0 , 1.5 , 1.0 , 1.3999999999999999 , 1.3 , 1.3999999999999999 , 1.5 , 1.0 , 1.5 , 1.1000000000000001 , 1.8 , 1.3 , 1.5 , 1.2 , 1.3 , 1.3999999999999999 , 1.3999999999999999 , 1.7 , 1.5 , 1.0 , 1.1000000000000001 , 1.0 , 1.2 , 1.6000000000000001 , 1.5 , 1.6000000000000001 , 1.5 , 1.3 , 1.3 , 1.3 , 1.2 , 1.3999999999999999 , 1.2 , 1.0 , 1.3 , 1.2 , 1.3 , 1.3 , 1.1000000000000001 , 1.3 , 2.5 , 1.8999999999999999 , 2.1000000000000001 , 1.8 , 2.2000000000000002 , 2.1000000000000001 , 1.7 , 1.8 , 1.8 , 2.5 , 2.0 , 1.8999999999999999 , 2.1000000000000001 , 2.0 , 2.3999999999999999 , 2.2999999999999998 , 1.8 , 2.2000000000000002 , 2.2999999999999998 , 1.5 , 2.2999999999999998 , 2.0 , 2.0 , 1.8 , 2.1000000000000001 , 1.8 , 1.8 , 1.8 , 2.1000000000000001 , 1.6000000000000001 , 1.8999999999999999 , 2.0 , 2.2000000000000002 , 1.5 , 1.3999999999999999 , 2.2999999999999998 , 2.3999999999999999 , 1.8 , 1.8 , 2.1000000000000001 , 2.3999999999999999 , 2.2999999999999998 , 1.8999999999999999 , 2.2999999999999998 , 2.5 , 2.2999999999999998 , 1.8999999999999999 , 2.0 , 2.2999999999999998 , 1.8 ] , 'Sepal.Width' : [ 3.5 , 3.0 , 3.2000000000000002 , 3.1000000000000001 , 3.6000000000000001 , 3.8999999999999999 , 3.3999999999999999 , 3.3999999999999999 , 2.8999999999999999 , 3.1000000000000001 , 3.7000000000000002 , 3.3999999999999999 , 3.0 , 3.0 , 4.0 , 4.4000000000000004 , 3.8999999999999999 , 3.5 , 3.7999999999999998 , 3.7999999999999998 , 3.3999999999999999 , 3.7000000000000002 , 3.6000000000000001 , 3.2999999999999998 , 3.3999999999999999 , 3.0 , 3.3999999999999999 , 3.5 , 3.3999999999999999 , 3.2000000000000002 , 3.1000000000000001 , 3.3999999999999999 , 4.0999999999999996 , 4.2000000000000002 , 3.1000000000000001 , 3.2000000000000002 , 3.5 , 3.6000000000000001 , 3.0 , 3.3999999999999999 , 3.5 , 2.2999999999999998 , 3.2000000000000002 , 3.5 , 3.7999999999999998 , 3.0 , 3.7999999999999998 , 3.2000000000000002 , 3.7000000000000002 , 3.2999999999999998 , 3.2000000000000002 , 3.2000000000000002 , 3.1000000000000001 , 2.2999999999999998 , 2.7999999999999998 , 2.7999999999999998 , 3.2999999999999998 , 2.3999999999999999 , 2.8999999999999999 , 2.7000000000000002 , 2.0 , 3.0 , 2.2000000000000002 , 2.8999999999999999 , 2.8999999999999999 , 3.1000000000000001 , 3.0 , 2.7000000000000002 , 2.2000000000000002 , 2.5 , 3.2000000000000002 , 2.7999999999999998 , 2.5 , 2.7999999999999998 , 2.8999999999999999 , 3.0 , 2.7999999999999998 , 3.0 , 2.8999999999999999 , 2.6000000000000001 , 2.3999999999999999 , 2.3999999999999999 , 2.7000000000000002 , 2.7000000000000002 , 3.0 , 3.3999999999999999 , 3.1000000000000001 , 2.2999999999999998 , 3.0 , 2.5 , 2.6000000000000001 , 3.0 , 2.6000000000000001 , 2.2999999999999998 , 2.7000000000000002 , 3.0 , 2.8999999999999999 , 2.8999999999999999 , 2.5 , 2.7999999999999998 , 3.2999999999999998 , 2.7000000000000002 , 3.0 , 2.8999999999999999 , 3.0 , 3.0 , 2.5 , 2.8999999999999999 , 2.5 , 3.6000000000000001 , 3.2000000000000002 , 2.7000000000000002 , 3.0 , 2.5 , 2.7999999999999998 , 3.2000000000000002 , 3.0 , 3.7999999999999998 , 2.6000000000000001 , 2.2000000000000002 , 3.2000000000000002 , 2.7999999999999998 , 2.7999999999999998 , 2.7000000000000002 , 3.2999999999999998 , 3.2000000000000002 , 2.7999999999999998 , 3.0 , 2.7999999999999998 , 3.0 , 2.7999999999999998 , 3.7999999999999998 , 2.7999999999999998 , 2.7999999999999998 , 2.6000000000000001 , 3.0 , 3.3999999999999999 , 3.1000000000000001 , 3.0 , 3.1000000000000001 , 3.1000000000000001 , 3.1000000000000001 , 2.7000000000000002 , 3.2000000000000002 , 3.2999999999999998 , 3.0 , 2.5 , 3.0 , 3.3999999999999999 , 3.0 ] , 'Species' : [ 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'setosa' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'versicolor' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' , 'virginica' ] , 'Unnamed: 0' : [ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , 96 , 97 , 98 , 99 , 100 , 101 , 102 , 103 , 104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 , 112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 , 120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 , 128 , 129 , 130 , 131 , 132 , 133 , 134 , 135 , 136 , 137 , 138 , 139 , 140 , 141 , 142 , 143 , 144 , 145 , 146 , 147 , 148 , 149 , 150 ] , 'Sepal.Length' : [ 5.0999999999999996 , 4.9000000000000004 , 4.7000000000000002 , 4.5999999999999996 , 5.0 , 5.4000000000000004 , 4.5999999999999996 , 5.0 , 4.4000000000000004 , 4.9000000000000004 , 5.4000000000000004 , 4.7999999999999998 , 4.7999999999999998 , 4.2999999999999998 , 5.7999999999999998 , 5.7000000000000002 , 5.4000000000000004 , 5.0999999999999996 , 5.7000000000000002 , 5.0999999999999996 , 5.4000000000000004 , 5.0999999999999996 , 4.5999999999999996 , 5.0999999999999996 , 4.7999999999999998 , 5.0 , 5.0 , 5.2000000000000002 , 5.2000000000000002 , 4.7000000000000002 , 4.7999999999999998 , 5.4000000000000004 , 5.2000000000000002 , 5.5 , 4.9000000000000004 , 5.0 , 5.5 , 4.9000000000000004 , 4.4000000000000004 , 5.0999999999999996 , 5.0 , 4.5 , 4.4000000000000004 , 5.0 , 5.0999999999999996 , 4.7999999999999998 , 5.0999999999999996 , 4.5999999999999996 , 5.2999999999999998 , 5.0 , 7.0 , 6.4000000000000004 , 6.9000000000000004 , 5.5 , 6.5 , 5.7000000000000002 , 6.2999999999999998 , 4.9000000000000004 , 6.5999999999999996 , 5.2000000000000002 , 5.0 , 5.9000000000000004 , 6.0 , 6.0999999999999996 , 5.5999999999999996 , 6.7000000000000002 , 5.5999999999999996 , 5.7999999999999998 , 6.2000000000000002 , 5.5999999999999996 , 5.9000000000000004 , 6.0999999999999996 , 6.2999999999999998 , 6.0999999999999996 , 6.4000000000000004 , 6.5999999999999996 , 6.7999999999999998 , 6.7000000000000002 , 6.0 , 5.7000000000000002 , 5.5 , 5.5 , 5.7999999999999998 , 6.0 , 5.4000000000000004 , 6.0 , 6.7000000000000002 , 6.2999999999999998 , 5.5999999999999996 , 5.5 , 5.5 , 6.0999999999999996 , 5.7999999999999998 , 5.0 , 5.5999999999999996 , 5.7000000000000002 , 5.7000000000000002 , 6.2000000000000002 , 5.0999999999999996 , 5.7000000000000002 , 6.2999999999999998 , 5.7999999999999998 , 7.0999999999999996 , 6.2999999999999998 , 6.5 , 7.5999999999999996 , 4.9000000000000004 , 7.2999999999999998 , 6.7000000000000002 , 7.2000000000000002 , 6.5 , 6.4000000000000004 , 6.7999999999999998 , 5.7000000000000002 , 5.7999999999999998 , 6.4000000000000004 , 6.5 , 7.7000000000000002 , 7.7000000000000002 , 6.0 , 6.9000000000000004 , 5.5999999999999996 , 7.7000000000000002 , 6.2999999999999998 , 6.7000000000000002 , 7.2000000000000002 , 6.2000000000000002 , 6.0999999999999996 , 6.4000000000000004 , 7.2000000000000002 , 7.4000000000000004 , 7.9000000000000004 , 6.4000000000000004 , 6.2999999999999998 , 6.0999999999999996 , 7.7000000000000002 , 6.2999999999999998 , 6.4000000000000004 , 6.0 , 6.9000000000000004 , 6.7000000000000002 , 6.9000000000000004 , 5.7999999999999998 , 6.7999999999999998 , 6.7000000000000002 , 6.7000000000000002 , 6.2999999999999998 , 6.5 , 6.2000000000000002 , 5.9000000000000004 ] , 'Petal.Length' : [ 1.3999999999999999 , 1.3999999999999999 , 1.3 , 1.5 , 1.3999999999999999 , 1.7 , 1.3999999999999999 , 1.5 , 1.3999999999999999 , 1.5 , 1.5 , 1.6000000000000001 , 1.3999999999999999 , 1.1000000000000001 , 1.2 , 1.5 , 1.3 , 1.3999999999999999 , 1.7 , 1.5 , 1.7 , 1.5 , 1.0 , 1.7 , 1.8999999999999999 , 1.6000000000000001 , 1.6000000000000001 , 1.5 , 1.3999999999999999 , 1.6000000000000001 , 1.6000000000000001 , 1.5 , 1.5 , 1.3999999999999999 , 1.5 , 1.2 , 1.3 , 1.3999999999999999 , 1.3 , 1.5 , 1.3 , 1.3 , 1.3 , 1.6000000000000001 , 1.8999999999999999 , 1.3999999999999999 , 1.6000000000000001 , 1.3999999999999999 , 1.5 , 1.3999999999999999 , 4.7000000000000002 , 4.5 , 4.9000000000000004 , 4.0 , 4.5999999999999996 , 4.5 , 4.7000000000000002 , 3.2999999999999998 , 4.5999999999999996 , 3.8999999999999999 , 3.5 , 4.2000000000000002 , 4.0 , 4.7000000000000002 , 3.6000000000000001 , 4.4000000000000004 , 4.5 , 4.0999999999999996 , 4.5 , 3.8999999999999999 , 4.7999999999999998 , 4.0 , 4.9000000000000004 , 4.7000000000000002 , 4.2999999999999998 , 4.4000000000000004 , 4.7999999999999998 , 5.0 , 4.5 , 3.5 , 3.7999999999999998 , 3.7000000000000002 , 3.8999999999999999 , 5.0999999999999996 , 4.5 , 4.5 , 4.7000000000000002 , 4.4000000000000004 , 4.0999999999999996 , 4.0 , 4.4000000000000004 , 4.5999999999999996 , 4.0 , 3.2999999999999998 , 4.2000000000000002 , 4.2000000000000002 , 4.2000000000000002 , 4.2999999999999998 , 3.0 , 4.0999999999999996 , 6.0 , 5.0999999999999996 , 5.9000000000000004 , 5.5999999999999996 , 5.7999999999999998 , 6.5999999999999996 , 4.5 , 6.2999999999999998 , 5.7999999999999998 , 6.0999999999999996 , 5.0999999999999996 , 5.2999999999999998 , 5.5 , 5.0 , 5.0999999999999996 , 5.2999999999999998 , 5.5 , 6.7000000000000002 , 6.9000000000000004 , 5.0 , 5.7000000000000002 , 4.9000000000000004 , 6.7000000000000002 , 4.9000000000000004 , 5.7000000000000002 , 6.0 , 4.7999999999999998 , 4.9000000000000004 , 5.5999999999999996 , 5.7999999999999998 , 6.0999999999999996 , 6.4000000000000004 , 5.5999999999999996 , 5.0999999999999996 , 5.5999999999999996 , 6.0999999999999996 , 5.5999999999999996 , 5.5 , 4.7999999999999998 , 5.4000000000000004 , 5.5999999999999996 , 5.0999999999999996 , 5.0999999999999996 , 5.9000000000000004 , 5.7000000000000002 , 5.2000000000000002 , 5.0 , 5.2000000000000002 , 5.4000000000000004 , 5.0999999999999996 ] }
df = pd. DataFrame( raw_data, columns = [ 'Unnamed: 0' , 'Sepal.Length' , 'Sepal.Width' , 'Petal.Length' , 'Petal.Width' , 'Species' ] )
df. head( )
Unnamed: 0
Sepal.Length
Sepal.Width
Petal.Length
Petal.Width
Species
0
1
5.1
3.5
1.4
0.2
setosa
1
2
4.9
3.0
1.4
0.2
setosa
2
3
4.7
3.2
1.3
0.2
setosa
3
4
4.6
3.1
1.5
0.2
setosa
4
5
5.0
3.6
1.4
0.2
setosa
df_original. head( )
Unnamed: 0
Sepal.Length
Sepal.Width
Petal.Length
Petal.Width
Species
0
1
5.1
3.5
1.4
0.2
setosa
1
2
4.9
3.0
1.4
0.2
setosa
2
3
4.7
3.2
1.3
0.2
setosa
3
4
4.6
3.1
1.5
0.2
setosa
4
5
5.0
3.6
1.4
0.2
setosa
将分类变量转换为虚拟变量
import pandas as pd
raw_data = { 'first_name' : [ 'Jason' , 'Molly' , 'Tina' , 'Jake' , 'Amy' ] ,
'last_name' : [ 'Miller' , 'Jacobson' , 'Ali' , 'Milner' , 'Cooze' ] ,
'sex' : [ 'male' , 'female' , 'male' , 'female' , 'female' ] }
df = pd. DataFrame( raw_data, columns = [ 'first_name' , 'last_name' , 'sex' ] )
df
first_name
last_name
sex
0
Jason
Miller
male
1
Molly
Jacobson
female
2
Tina
Ali
male
3
Jake
Milner
female
4
Amy
Cooze
female
df_sex = pd. get_dummies( df[ 'sex' ] )
df_new = pd. concat( [ df, df_sex] , axis= 1 )
df_new
first_name
last_name
sex
female
male
0
Jason
Miller
male
0.0
1.0
1
Molly
Jacobson
female
1.0
0.0
2
Tina
Ali
male
0.0
1.0
3
Jake
Milner
female
1.0
0.0
4
Amy
Cooze
female
1.0
0.0
df_new = df. join( df_sex)
df_new
first_name
last_name
sex
female
male
0
Jason
Miller
male
0.0
1.0
1
Molly
Jacobson
female
1.0
0.0
2
Tina
Ali
male
0.0
1.0
3
Jake
Milner
female
1.0
0.0
4
Amy
Cooze
female
1.0
0.0
将分类变量转换为虚拟变量
import pandas as pd
import patsy
raw_data = { 'countrycode' : [ 1 , 2 , 3 , 2 , 1 ] }
df = pd. DataFrame( raw_data, columns = [ 'countrycode' ] )
df
countrycode
0
1
1
2
2
3
3
2
4
1
patsy. dmatrix( 'C(countrycode)-1' , df, return_type= 'dataframe' )
C(countrycode)[1]
C(countrycode)[2]
C(countrycode)[3]
0
1.0
0.0
0.0
1
0.0
1.0
0.0
2
0.0
0.0
1.0
3
0.0
1.0
0.0
4
1.0
0.0
0.0
将字符串分类变量转换为数字变量
import pandas as pd
raw_data = { 'patient' : [ 1 , 1 , 1 , 2 , 2 ] ,
'obs' : [ 1 , 2 , 3 , 1 , 2 ] ,
'treatment' : [ 0 , 1 , 0 , 1 , 0 ] ,
'score' : [ 'strong' , 'weak' , 'normal' , 'weak' , 'strong' ] }
df = pd. DataFrame( raw_data, columns = [ 'patient' , 'obs' , 'treatment' , 'score' ] )
df
patient
obs
treatment
score
0
1
1
0
strong
1
1
2
1
weak
2
1
3
0
normal
3
2
1
1
weak
4
2
2
0
strong
def score_to_numeric ( x) :
if x== 'strong' :
return 3
if x== 'normal' :
return 2
if x== 'weak' :
return 1
df[ 'score_num' ] = df[ 'score' ] . apply ( score_to_numeric)
df
patient
obs
treatment
score
score_num
0
1
1
0
strong
3
1
1
2
1
weak
1
2
1
3
0
normal
2
3
2
1
1
weak
1
4
2
2
0
strong
3
将变量转换为时间序列
import pandas as pd
raw_data = { 'date' : [ '2014-06-01T01:21:38.004053' , '2014-06-02T01:21:38.004053' , '2014-06-03T01:21:38.004053' ] ,
'score' : [ 25 , 94 , 57 ] }
df = pd. DataFrame( raw_data, columns = [ 'date' , 'score' ] )
df
date
score
0
2014-06-01T01:21:38.004053
25
1
2014-06-02T01:21:38.004053
94
2
2014-06-03T01:21:38.004053
57
df[ "date" ] = pd. to_datetime( df[ "date" ] )
df = df. set_index( df[ "date" ] )
df
date
score
date
—
—
—
2014-06-01 01:21:38.004053
2014-06-01 01:21:38.004053
25
2014-06-02 01:21:38.004053
2014-06-02 01:21:38.004053
94
2014-06-03 01:21:38.004053
2014-06-03 01:21:38.004053
57
在 Pandas 数据帧中计数
import pandas as pd
year = pd. Series( [ 1875 , 1876 , 1877 , 1878 , 1879 , 1880 , 1881 , 1882 , 1883 , 1884 ,
1885 , 1886 , 1887 , 1888 , 1889 , 1890 , 1891 , 1892 , 1893 , 1894 ] )
guardCorps = pd. Series( [ 0 , 2 , 2 , 1 , 0 , 0 , 1 , 1 , 0 , 3 , 0 , 2 , 1 , 0 , 0 , 1 , 0 , 1 , 0 , 1 ] )
corps1 = pd. Series( [ 0 , 0 , 0 , 2 , 0 , 3 , 0 , 2 , 0 , 0 , 0 , 1 , 1 , 1 , 0 , 2 , 0 , 3 , 1 , 0 ] )
corps2 = pd. Series( [ 0 , 0 , 0 , 2 , 0 , 2 , 0 , 0 , 1 , 1 , 0 , 0 , 2 , 1 , 1 , 0 , 0 , 2 , 0 , 0 ] )
corps3 = pd. Series( [ 0 , 0 , 0 , 1 , 1 , 1 , 2 , 0 , 2 , 0 , 0 , 0 , 1 , 0 , 1 , 2 , 1 , 0 , 0 , 0 ] )
corps4 = pd. Series( [ 0 , 1 , 0 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ] )
corps5 = pd. Series( [ 0 , 0 , 0 , 0 , 2 , 1 , 0 , 0 , 1 , 0 , 0 , 1 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 0 ] )
corps6 = pd. Series( [ 0 , 0 , 1 , 0 , 2 , 0 , 0 , 1 , 2 , 0 , 1 , 1 , 3 , 1 , 1 , 1 , 0 , 3 , 0 , 0 ] )
corps7 = pd. Series( [ 1 , 0 , 1 , 0 , 0 , 0 , 1 , 0 , 1 , 1 , 0 , 0 , 2 , 0 , 0 , 2 , 1 , 0 , 2 , 0 ] )
corps8 = pd. Series( [ 1 , 0 , 0 , 0 , 1 , 0 , 0 , 1 , 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 1 , 1 , 0 , 1 ] )
corps9 = pd. Series( [ 0 , 0 , 0 , 0 , 0 , 2 , 1 , 1 , 1 , 0 , 2 , 1 , 1 , 0 , 1 , 2 , 0 , 1 , 0 , 0 ] )
corps10 = pd. Series( [ 0 , 0 , 1 , 1 , 0 , 1 , 0 , 2 , 0 , 2 , 0 , 0 , 0 , 0 , 2 , 1 , 3 , 0 , 1 , 1 ] )
corps11 = pd. Series( [ 0 , 0 , 0 , 0 , 2 , 4 , 0 , 1 , 3 , 0 , 1 , 1 , 1 , 1 , 2 , 1 , 3 , 1 , 3 , 1 ] )
corps14 = pd. Series( [ 1 , 1 , 2 , 1 , 1 , 3 , 0 , 4 , 0 , 1 , 0 , 3 , 2 , 1 , 0 , 2 , 1 , 1 , 0 , 0 ] )
corps15 = pd. Series( [ 0 , 1 , 0 , 0 , 0 , 0 , 0 , 1 , 0 , 1 , 1 , 0 , 0 , 0 , 2 , 2 , 0 , 0 , 0 , 0 ] )
variables = dict ( guardCorps = guardCorps, corps1 = corps1,
corps2 = corps2, corps3 = corps3, corps4 = corps4,
corps5 = corps5, corps6 = corps6, corps7 = corps7,
corps8 = corps8, corps9 = corps9, corps10 = corps10,
corps11 = corps11 , corps14 = corps14, corps15 = corps15)
horsekick = pd. DataFrame( variables, columns = [ 'guardCorps' ,
'corps1' , 'corps2' ,
'corps3' , 'corps4' ,
'corps5' , 'corps6' ,
'corps7' , 'corps8' ,
'corps9' , 'corps10' ,
'corps11' , 'corps14' ,
'corps15' ] )
horsekick. index = [ 1875 , 1876 , 1877 , 1878 , 1879 , 1880 , 1881 , 1882 , 1883 , 1884 ,
1885 , 1886 , 1887 , 1888 , 1889 , 1890 , 1891 , 1892 , 1893 , 1894 ]
horsekick
guardCorps
corps1
corps2
corps3
corps4
corps5
corps6
corps7
corps8
corps9
corps10
corps11
corps14
corps15
1875
0
0
0
0
0
0
0
1
1
0
0
0
1
0
1876
2
0
0
0
1
0
0
0
0
0
0
0
1
1
1877
2
0
0
0
0
0
1
1
0
0
1
0
2
0
1878
1
2
2
1
1
0
0
0
0
0
1
0
1
0
1879
0
0
0
1
1
2
2
0
1
0
0
2
1
0
1880
0
3
2
1
1
1
0
0
0
2
1
4
3
0
1881
1
0
0
2
1
0
0
1
0
1
0
0
0
0
1882
1
2
0
0
0
0
1
0
1
1
2
1
4
1
1883
0
0
1
2
0
1
2
1
0
1
0
3
0
0
1884
3
0
1
0
0
0
0
1
0
0
2
0
1
1
1885
0
0
0
0
0
0
1
0
0
2
0
1
0
1
1886
2
1
0
0
1
1
1
0
0
1
0
1
3
0
1887
1
1
2
1
0
0
3
2
1
1
0
1
2
0
1888
0
1
1
0
0
1
1
0
0
0
0
1
1
0
1889
0
0
1
1
0
1
1
0
0
1
2
2
0
2
1890
1
2
0
2
0
1
1
2
0
2
1
1
2
2
1891
0
0
0
1
1
1
0
1
1
0
3
3
1
0
1892
1
3
2
0
1
1
3
0
1
1
0
1
1
0
1893
0
1
0
0
0
1
0
2
0
0
1
3
0
0
1894
1
0
0
0
0
0
0
0
1
0
1
1
0
0
result = horsekick. apply ( pd. value_counts) . fillna( 0 ) ; result
| | guardCorps | corps1 | corps2 | corps3 | corps4 | corps5 | corps6 | corps7 | corps8 | corps9 | corps10 | corps11 | corps14 | corps15 | | 0 | 9.0 | 11.0 | 12.0 | 11.0 | 12.0 | 10.0 | 9.0 | 11.0 | 13.0 | 10.0 | 10.0 | 6 | 6 | 14.0 | | 1 | 7.0 | 4.0 | 4.0 | 6.0 | 8.0 | 9.0 | 7.0 | 6.0 | 7.0 | 7.0 | 6.0 | 8 | 8 | 4.0 | | 2 | 3.0 | 3.0 | 4.0 | 3.0 | 0.0 | 1.0 | 2.0 | 3.0 | 0.0 | 3.0 | 3.0 | 2 | 3 | 2.0 | | 3 | 1.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3 | 2 | 0.0 | | 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 0.0 |
pd. value_counts( horsekick[ 'guardCorps' ] . values, sort= False )
'''
0 9
1 7
2 3
3 1
dtype: int64
'''
horsekick[ 'guardCorps' ] . unique( )
在 Pandas 中创建流水线
Pandas 的流水线功能允许你将 Python 函数串联在一起,来构建数据处理流水线。
import pandas as pd
df = pd. DataFrame( )
df[ 'name' ] = [ 'John' , 'Steve' , 'Sarah' ]
df[ 'gender' ] = [ 'Male' , 'Male' , 'Female' ]
df[ 'age' ] = [ 31 , 32 , 19 ]
df
name
gender
age
0
John
Male
31
1
Steve
Male
32
2
Sarah
Female
19
def mean_age_by_group ( dataframe, col) :
return dataframe. groupby( col) . mean( )
def uppercase_column_name ( dataframe) :
dataframe. columns = dataframe. columns. str . upper( )
return dataframe
( df. pipe( mean_age_by_group, col= 'gender' )
. pipe( uppercase_column_name)
)
AGE
gender
Female
19.0
Male
31.5
使用for
循环创建 Pandas 列
import pandas as pd
import numpy as np
raw_data = { 'student_name' : [ 'Miller' , 'Jacobson' , 'Ali' , 'Milner' , 'Cooze' , 'Jacon' , 'Ryaner' , 'Sone' , 'Sloan' , 'Piger' , 'Riani' , 'Ali' ] ,
'test_score' : [ 76 , 88 , 84 , 67 , 53 , 96 , 64 , 91 , 77 , 73 , 52 , np. NaN] }
df = pd. DataFrame( raw_data, columns = [ 'student_name' , 'test_score' ] )
grades = [ ]
for row in df[ 'test_score' ] :
if row > 95 :
grades. append( 'A' )
elif row > 90 :
grades. append( 'A-' )
elif row > 85 :
grades. append( 'B' )
elif row > 80 :
grades. append( 'B-' )
elif row > 75 :
grades. append( 'C' )
elif row > 70 :
grades. append( 'C-' )
elif row > 65 :
grades. append( 'D' )
elif row > 60 :
grades. append( 'D-' )
else :
grades. append( 'Failed' )
df[ 'grades' ] = grades
df
student_name
test_score
grades
0
Miller
76.0
C
1
Jacobson
88.0
B
2
Ali
84.0
B-
3
Milner
67.0
D
4
Cooze
53.0
Failed
5
Jacon
96.0
A
6
Ryaner
64.0
D-
7
Sone
91.0
A-
8
Sloan
77.0
C
9
Piger
73.0
C-
10
Riani
52.0
Failed
11
Ali
NaN
Failed
创建项目计数
from collections import Counter
fruit_eaten = Counter( [ 'Apple' , 'Apple' , 'Apple' , 'Banana' , 'Pear' , 'Pineapple' ] )
fruit_eaten
fruit_eaten. update( [ 'Pineapple' ] )
fruit_eaten
fruit_eaten. most_common( 3 )
基于条件创建一列
import pandas as pd
import numpy as np
data = { 'name' : [ 'Jason' , 'Molly' , 'Tina' , 'Jake' , 'Amy' ] ,
'age' : [ 42 , 52 , 36 , 24 , 73 ] ,
'preTestScore' : [ 4 , 24 , 31 , 2 , 3 ] ,
'postTestScore' : [ 25 , 94 , 57 , 62 , 70 ] }
df = pd. DataFrame( data, columns = [ 'name' , 'age' , 'preTestScore' , 'postTestScore' ] )
df
name
age
preTestScore
postTestScore
0
Jason
42
4
25
1
Molly
52
24
94
2
Tina
36
31
57
3
Jake
24
2
62
4
Amy
73
3
70
df[ 'elderly' ] = np. where( df[ 'age' ] >= 50 , 'yes' , 'no' )
df
name
age
preTestScore
postTestScore
elderly
0
Jason
42
4
25
no
1
Molly
52
24
94
yes
2
Tina
36
31
57
no
3
Jake
24
2
62
no
4
Amy
73
3
70
yes
从词典键和值创建列表
dict = { 'county' : [ 'Cochice' , 'Pima' , 'Santa Cruz' , 'Maricopa' , 'Yuma' ] ,
'year' : [ 2012 , 2012 , 2013 , 2014 , 2014 ] ,
'fireReports' : [ 4 , 24 , 31 , 2 , 3 ] }
list ( dict . keys( ) )
list ( dict . values( ) )
'''
[[4, 24, 31, 2, 3],
[2012, 2012, 2013, 2014, 2014],
['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma']]
'''
Pandas 中的交叉表
import pandas as pd
raw_data = { 'regiment' : [ 'Nighthawks' , 'Nighthawks' , 'Nighthawks' , 'Nighthawks' , 'Dragoons' , 'Dragoons' , 'Dragoons' , 'Dragoons' , 'Scouts' , 'Scouts' , 'Scouts' , 'Scouts' ] ,
'company' : [ 'infantry' , 'infantry' , 'cavalry' , 'cavalry' , 'infantry' , 'infantry' , 'cavalry' , 'cavalry' , 'infantry' , 'infantry' , 'cavalry' , 'cavalry' ] ,
'experience' : [ 'veteran' , 'rookie' , 'veteran' , 'rookie' , 'veteran' , 'rookie' , 'veteran' , 'rookie' , 'veteran' , 'rookie' , 'veteran' , 'rookie' ] ,
'name' : [ 'Miller' , 'Jacobson' , 'Ali' , 'Milner' , 'Cooze' , 'Jacon' , 'Ryaner' , 'Sone' , 'Sloan' , 'Piger' , 'Riani' , 'Ali' ] ,
'preTestScore' : [ 4 , 24 , 31 , 2 , 3 , 4 , 24 , 31 , 2 , 3 , 2 , 3 ] ,
'postTestScore' : [ 25 , 94 , 57 , 62 , 70 , 25 , 94 , 57 , 62 , 70 , 62 , 70 ] }
df = pd. DataFrame( raw_data, columns = [ 'regiment' , 'company' , 'experience' , 'name' , 'preTestScore' , 'postTestScore' ] )
df
regiment
company
experience
name
preTestScore
postTestScore
0
Nighthawks
infantry
veteran
Miller
4
25
1
Nighthawks
infantry
rookie
Jacobson
24
94
2
Nighthawks
cavalry
veteran
Ali
31
57
3
Nighthawks
cavalry
rookie
Milner
2
62
4
Dragoons
infantry
veteran
Cooze
3
70
5
Dragoons
infantry
rookie
Jacon
4
25
6
Dragoons
cavalry
veteran
Ryaner
24
94
7
Dragoons
cavalry
rookie
Sone
31
57
8
Scouts
infantry
veteran
Sloan
2
62
9
Scouts
infantry
rookie
Piger
3
70
10
Scouts
cavalry
veteran
Riani
2
62
11
Scouts
cavalry
rookie
Ali
3
70
按公司和团队创建交叉表。按公司和团队计算观测数量。
pd. crosstab( df. regiment, df. company, margins= True )
company
cavalry
infantry
All
regiment
Dragoons
2
2
4
Nighthawks
2
2
4
Scouts
2
2
4
All
6
6
12
pd. crosstab( [ df. company, df. experience] , df. regiment, margins= True )
regiment
Dragoons
Nighthawks
Scouts
All
company
experience
cavalry
rookie
1
1
1
3
veteran
1
1
1
3
infantry
rookie
1
1
1
3
veteran
1
1
1
3
All
4
4
4
12
删除重复
import pandas as pd
raw_data = { 'first_name' : [ 'Jason' , 'Jason' , 'Jason' , 'Tina' , 'Jake' , 'Amy' ] ,
'last_name' : [ 'Miller' , 'Miller' , 'Miller' , 'Ali' , 'Milner' , 'Cooze' ] ,
'age' : [ 42 , 42 , 1111111 , 36 , 24 , 73 ] ,
'preTestScore' : [ 4 , 4 , 4 , 31 , 2 , 3 ] ,
'postTestScore' : [ 25 , 25 , 25 , 57 , 62 , 70 ] }
df = pd. DataFrame( raw_data, columns = [ 'first_name' , 'last_name' , 'age' , 'preTestScore' , 'postTestScore' ] )
df
first_name
last_name
age
preTestScore
postTestScore
0
Jason
Miller
42
4
25
1
Jason
Miller
42
4
25
2
Jason
Miller
1111111
4
25
3
Tina
Ali
36
31
57
4
Jake
Milner
24
2
62
5
Amy
Cooze
73
3
70
df. duplicated( )
'''
0 False
1 True
2 False
3 False
4 False
5 False
dtype: bool
'''
df. drop_duplicates( )
first_name
last_name
age
preTestScore
postTestScore
0
Jason
Miller
42
4
25
2
Jason
Miller
1111111
4
25
3
Tina
Ali
36
31
57
4
Jake
Milner
24
2
62
5
Amy
Cooze
73
3
70
df. drop_duplicates( [ 'first_name' ] , keep= 'last' )
first_name
last_name
age
preTestScore
postTestScore
2
Jason
Miller
1111111
4
25
3
Tina
Ali
36
31
57
4
Jake
Milner
24
2
62
5
Amy
Cooze
73
3
70
Pandas 数据帧的描述性统计
import pandas as pd
data = { 'name' : [ 'Jason' , 'Molly' , 'Tina' , 'Jake' , 'Amy' ] ,
'age' : [ 42 , 52 , 36 , 24 , 73 ] ,
'preTestScore' : [ 4 , 24 , 31 , 2 , 3 ] ,
'postTestScore' : [ 25 , 94 , 57 , 62 , 70 ] }
df = pd. DataFrame( data, columns = [ 'name' , 'age' , 'preTestScore' , 'postTestScore' ] )
df
name
age
preTestScore
postTestScore
0
Jason
42
4
25
1
Molly
52
24
94
2
Tina
36
31
57
3
Jake
24
2
62
4
Amy
73
3
70
5 rows × 4 columns
df[ 'age' ] . sum ( )
df[ 'preTestScore' ] . mean( )
df[ 'preTestScore' ] . cumsum( )
'''
0 4
1 28
2 59
3 61
4 64
Name: preTestScore, dtype: int64
'''
df[ 'preTestScore' ] . describe( )
'''
count 5.000000
mean 12.800000
std 13.663821
min 2.000000
25% 3.000000
50% 4.000000
75% 24.000000
max 31.000000
Name: preTestScore, dtype: float64
'''
df[ 'preTestScore' ] . count( )
df[ 'preTestScore' ] . min ( )
df[ 'preTestScore' ] . max ( )
df[ 'preTestScore' ] . median( )
df[ 'preTestScore' ] . var( )
df[ 'preTestScore' ] . std( )
df[ 'preTestScore' ] . skew( )
df[ 'preTestScore' ] . kurt( )
df. corr( )
age
preTestScore
postTestScore
age
1.000000
-0.105651
0.328852
preTestScore
-0.105651
1.000000
0.378039
postTestScore
0.328852
0.378039
1.000000
3 rows × 3 columns
df. cov( )
age
preTestScore
postTestScore
age
340.80
-26.65
151.20
preTestScore
-26.65
186.70
128.65
postTestScore
151.20
128.65
620.30
3 rows × 3 columns
丢弃行或者列
import pandas as pd
data = { 'name' : [ 'Jason' , 'Molly' , 'Tina' , 'Jake' , 'Amy' ] ,
'year' : [ 2012 , 2012 , 2013 , 2014 , 2014 ] ,
'reports' : [ 4 , 24 , 31 , 2 , 3 ] }
df = pd. DataFrame( data, index = [ 'Cochice' , 'Pima' , 'Santa Cruz' , 'Maricopa' , 'Yuma' ] )
df
name
reports
year
Cochice
Jason
4
2012
Pima
Molly
24
2012
Santa Cruz
Tina
31
2013
Maricopa
Jake
2
2014
Yuma
Amy
3
2014
df. drop( [ 'Cochice' , 'Pima' ] )
name
reports
year
Santa Cruz
Tina
31
2013
Maricopa
Jake
2
2014
Yuma
Amy
3
2014
df. drop( 'reports' , axis= 1 )
name
year
Cochice
Jason
2012
Pima
Molly
2012
Santa Cruz
Tina
2013
Maricopa
Jake
2014
Yuma
Amy
2014
如果它包含某个值(这里是Tina
),丢弃一行。
具体来说:创建一个名为df
的新数据框,名称列中的单元格的值不等于Tina
。
df[ df. name != 'Tina' ]
name
reports
year
Cochice
Jason
4
2012
Pima
Molly
24
2012
Maricopa
Jake
2
2014
Yuma
Amy
3
2014
按照行号丢弃一行(在本例中为第 3 行)。
请注意,Pandas使用从零开始的编号,因此 0 是第一行,1 是第二行,等等。
df. drop( df. index[ 2 ] )
name
reports
year
Cochice
Jason
4
2012
Pima
Molly
24
2012
Maricopa
Jake
2
2014
Yuma
Amy
3
2014
可以扩展到范围。
df. drop( df. index[ [ 2 , 3 ] ] )
name
reports
year
Cochice
Jason
4
2012
Pima
Molly
24
2012
Yuma
Amy
3
2014
或相对于 DF 的末尾来丢弃。
df. drop( df. index[ - 2 ] )
name
reports
year
Cochice
Jason
4
2012
Pima
Molly
24
2012
Santa Cruz
Tina
31
2013
Yuma
Amy
3
2014
你也可以选择相对于起始或末尾的范围。
df[ : 3 ]
name
reports
year
Cochice
Jason
4
2012
Pima
Molly
24
2012
Santa Cruz
Tina
31
2013
df[ : - 3 ]
name
reports
year
Cochice
Jason
4
2012
Pima
Molly
24
2012
枚举列表
data = [ 'One' , 'Two' , 'Three' , 'Four' , 'Five' ]
for item in enumerate ( data) :
print ( item)
print ( item[ 1 ] )
'''
(0, 'One')
One
(1, 'Two')
Two
(2, 'Three')
Three
(3, 'Four')
Four
(4, 'Five')
Five
'''
在 Pandas 中将包含列表的单元扩展为自己的变量
import pandas as pd
raw_data = { 'score' : [ 1 , 2 , 3 ] ,
'tags' : [ [ 'apple' , 'pear' , 'guava' ] , [ 'truck' , 'car' , 'plane' ] , [ 'cat' , 'dog' , 'mouse' ] ] }
df = pd. DataFrame( raw_data, columns = [ 'score' , 'tags' ] )
df
score
tags
0
1
[apple, pear, guava]
1
2
[truck, car, plane]
2
3
[cat, dog, mouse]
tags = df[ 'tags' ] . apply ( pd. Series)
tags = tags. rename( columns = lambda x : 'tag_' + str ( x) )
tags
tag_0
tag_1
tag_2
0
apple
pear
guava
1
truck
car
plane
2
cat
dog
mouse
pd. concat( [ df[ : ] , tags[ : ] ] , axis= 1 )
score
tags
tag_0
tag_1
tag_2
0
1
[apple, pear, guava]
apple
pear
guava
1
2
[truck, car, plane]
truck
car
plane
2
3
[cat, dog, mouse]
cat
dog
mouse
过滤 pandas 数据帧
import pandas as pd
data = { 'name' : [ 'Jason' , 'Molly' , 'Tina' , 'Jake' , 'Amy' ] ,
'year' : [ 2012 , 2012 , 2013 , 2014 , 2014 ] ,
'reports' : [ 4 , 24 , 31 , 2 , 3 ] ,
'coverage' : [ 25 , 94 , 57 , 62 , 70 ] }
df = pd. DataFrame( data, index = [ 'Cochice' , 'Pima' , 'Santa Cruz' , 'Maricopa' , 'Yuma' ] )
df
coverage
name
reports
year
Cochice
25
Jason
4
2012
Pima
94
Molly
24
2012
Santa Cruz
57
Tina
31
2013
Maricopa
62
Jake
2
2014
Yuma
70
Amy
3
2014
df[ 'name' ]
'''
Cochice Jason
Pima Molly
Santa Cruz Tina
Maricopa Jake
Yuma Amy
Name: name, dtype: object
'''
df[ [ 'name' , 'reports' ] ]
name
reports
Cochice
Jason
4
Pima
Molly
24
Santa Cruz
Tina
31
Maricopa
Jake
2
Yuma
Amy
3
df[ : 2 ]
coverage
name
reports
year
Cochice
25
Jason
4
2012
Pima
94
Molly
24
2012
df[ df[ 'coverage' ] > 50 ]
coverage
name
reports
year
Pima
94
Molly
24
2012
Santa Cruz
57
Tina
31
2013
Maricopa
62
Jake
2
2014
Yuma
70
Amy
3
2014
df[ ( df[ 'coverage' ] > 50 ) & ( df[ 'reports' ] < 4 ) ]
coverage
name
reports
year
Maricopa
62
Jake
2
2014
Yuma
70
Amy
3
2014
寻找数据帧的列中的最大值
% matplotlib inline
import pandas as pd
import matplotlib. pyplot as plt
import numpy as np
raw_data = { 'first_name' : [ 'Jason' , 'Molly' , 'Tina' , 'Jake' , 'Amy' ] ,
'last_name' : [ 'Miller' , 'Jacobson' , 'Ali' , 'Milner' , 'Cooze' ] ,
'age' : [ 42 , 52 , 36 , 24 , 73 ] ,
'preTestScore' : [ 4 , 24 , 31 , 2 , 3 ] ,
'postTestScore' : [ 25 , 94 , 57 , 62 , 70 ] }
df = pd. DataFrame( raw_data, columns = [ 'first_name' , 'last_name' , 'age' , 'preTestScore' , 'postTestScore' ] )
df
first_name
last_name
age
preTestScore
postTestScore
0
Jason
Miller
42
4
25
1
Molly
Jacobson
52
24
94
2
Tina
Ali
36
31
57
3
Jake
Milner
24
2
62
4
Amy
Cooze
73
3
70
df[ 'preTestScore' ] . idxmax( )
寻找数据帧中的唯一值
import pandas as pd
import numpy as np
raw_data = { 'regiment' : [ '51st' , '29th' , '2nd' , '19th' , '12th' , '101st' , '90th' , '30th' , '193th' , '1st' , '94th' , '91th' ] ,
'trucks' : [ 'MAZ-7310' , np. nan, 'MAZ-7310' , 'MAZ-7310' , 'Tatra 810' , 'Tatra 810' , 'Tatra 810' , 'Tatra 810' , 'ZIS-150' , 'Tatra 810' , 'ZIS-150' , 'ZIS-150' ] ,
'tanks' : [ 'Merkava Mark 4' , 'Merkava Mark 4' , 'Merkava Mark 4' , 'Leopard 2A6M' , 'Leopard 2A6M' , 'Leopard 2A6M' , 'Arjun MBT' , 'Leopard 2A6M' , 'Arjun MBT' , 'Arjun MBT' , 'Arjun MBT' , 'Arjun MBT' ] ,
'aircraft' : [ 'none' , 'none' , 'none' , 'Harbin Z-9' , 'Harbin Z-9' , 'none' , 'Harbin Z-9' , 'SH-60B Seahawk' , 'SH-60B Seahawk' , 'SH-60B Seahawk' , 'SH-60B Seahawk' , 'SH-60B Seahawk' ] }
df = pd. DataFrame( raw_data, columns = [ 'regiment' , 'trucks' , 'tanks' , 'aircraft' ] )
df. head( )
regiment
trucks
tanks
aircraft
0
51st
MAZ-7310
Merkava Mark 4
none
1
29th
NaN
Merkava Mark 4
none
2
2nd
MAZ-7310
Merkava Mark 4
none
3
19th
MAZ-7310
Leopard 2A6M
Harbin Z-9
4
12th
Tatra 810
Leopard 2A6M
Harbin Z-9
list ( set ( df. trucks) )
list ( df[ 'trucks' ] . unique( ) )
地理编码和反向地理编码
在使用地理数据时,地理编码(将物理地址或位置转换为经纬度)和反向地理编码(将经纬度转换为物理地址或位置)是常见任务。
Python 提供了许多软件包,使任务变得异常简单。 在下面的教程中,我使用 pygeocoder(Google 的 geo-API 的包装器)来进行地理编码和反向地理编码。
首先,我们要加载我们想要在脚本中使用的包。 具体来说,我正在为地理函数加载 pygeocoder,为数据帧结构加载 pandas,为缺失值(np.nan
)函数加载 numpy。
from pygeocoder import Geocoder
import pandas as pd
import numpy as np
地理数据有多种形式,在这种情况下,我们有一个 Python 字典,包含五个经纬度的字符串,每个坐标在逗号分隔的坐标对中。
data = { 'Site 1' : '31.336968, -109.560959' ,
'Site 2' : '31.347745, -108.229963' ,
'Site 3' : '32.277621, -107.734724' ,
'Site 4' : '31.655494, -106.420484' ,
'Site 5' : '30.295053, -104.014528' }
虽然技术上没必要,因为我最初使用 R,我是数据帧的忠实粉丝,所以让我们把模拟的数据字典变成数据帧。
df = pd. DataFrame. from_dict( data, orient= 'index' )
df
0
Site 1
31.336968, -109.560959
Site 2
31.347745, -108.229963
Site 3
32.277621, -107.734724
Site 4
31.655494, -106.420484
Site 5
30.295053, -104.014528
你现在可以看到,我们有了包含五行的数据帧,每行包含一个经纬度字符串。 在我们处理数据之前,我们需要1)将字符串分成纬度和经度,然后将它们转换为浮点数。以下代码就是这样。
lat = [ ]
lon = [ ]
for row in df[ 0 ] :
try :
lat. append( float ( row. split( ',' ) [ 0 ] ) )
lon. append( float ( row. split( ',' ) [ 1 ] ) )
except :
lat. append( np. NaN)
lon. append( np. NaN)
df[ 'latitude' ] = lat
df[ 'longitude' ] = lon
让我们看看现在有了什么。
df
0
latitude
longitude
Site 1
31.336968, -109.560959
31.336968
-109.560959
Site 2
31.347745, -108.229963
31.347745
-108.229963
Site 3
32.277621, -107.734724
32.277621
-107.734724
Site 4
31.655494, -106.420484
31.655494
-106.420484
Site 5
30.295053, -104.014528
30.295053
-104.014528
真棒。这正是我们想要看到的,一列用于纬度的浮点和一列用于经度的浮点。
为了反转地理编码,我们将特定的经纬度对(这里为第一行,索引为0
)提供给 pygeocoder 的reverse_geocoder
函数。
results = Geocoder. reverse_geocode( df[ 'latitude' ] [ 0 ] , df[ 'longitude' ] [ 0 ] )
现在我们可以开始提取我们想要的数据了。
results. coordinates
results. city
results. country
results. street_address
results. administrative_area_level_1
对于地理编码,我们需要将包含地址或位置(例如城市)的字符串,传入地理编码函数中。 但是,并非所有字符串的格式都是 Google 的 geo-API 可以理解的。 如果由.geocode().valid_address
函数验证有效,我们可以转换。
Geocoder. geocode( "4207 N Washington Ave, Douglas, AZ 85607" ) . valid_address
因为输出是True,我们现在知道这是一个有效的地址,因此可以打印纬度和经度坐标。
results. coordinates
但更有趣的是,一旦地址由 Google 地理 API 处理,我们就可以解析它并轻松地分隔街道号码,街道名称等。
result = Geocoder. geocode( "7250 South Tucson Boulevard, Tucson, AZ 85756" )
result. street_number
result. route
你就实现了它。Python 使整个过程变得简单,只需几分钟即可完成分析。祝好运!
地理定位城市和国家
本教程创建一个函数,尝试获取城市和国家并返回其经纬度。 但是当城市不可用时(通常是这种情况),则返回该国中心的经纬度。
from geopy. geocoders import Nominatim
geolocator = Nominatim( )
import numpy as np
def geolocate ( city= None , country= None ) :
'''
输入城市和国家,或仅输入国家。 如果可以的话,返回城市的经纬度坐标,否则返回该国家中心的经纬度。
'''
if city != None :
try :
loc = geolocator. geocode( str ( city + ',' + country) )
return ( loc. latitude, loc. longitude)
except :
return np. nan
else :
try :
loc = geolocator. geocode( country)
return ( loc. latitude, loc. longitude)
except :
return np. nan
geolocate( city= 'Austin' , country= 'USA' )
geolocate( country= 'USA' )
使用 pandas 分组时间序列
import pandas as pd
import numpy as np
df = pd. DataFrame( )
df[ 'german_army' ] = np. random. randint( low= 20000 , high= 30000 , size= 100 )
df[ 'allied_army' ] = np. random. randint( low= 20000 , high= 40000 , size= 100 )
df. index = pd. date_range( '1/1/2014' , periods= 100 , freq= 'H' )
df. head( )
german_army
allied_army
2014-01-01 00:00:00
28755
33938
2014-01-01 01:00:00
25176
28631
—
—
—
2014-01-01 02:00:00
23261
39685
—
—
—
2014-01-01 03:00:00
28686
27756
—
—
—
2014-01-01 04:00:00
24588
25681
—
—
—
Truncate the dataframe
df. truncate( before= '1/2/2014' , after= '1/3/2014' )
german_army
allied_army
2014-01-02 00:00:00
26401
20189
2014-01-02 01:00:00
29958
23934
2014-01-02 02:00:00
24492
39075
2014-01-02 03:00:00
25707
39262
2014-01-02 04:00:00
27129
35961
2014-01-02 05:00:00
27903
25418
2014-01-02 06:00:00
20409
25163
2014-01-02 07:00:00
25736
34794
2014-01-02 08:00:00
24057
27209
2014-01-02 09:00:00
26875
33402
2014-01-02 10:00:00
23963
38575
2014-01-02 11:00:00
27506
31859
2014-01-02 12:00:00
23564
25750
2014-01-02 13:00:00
27958
24365
2014-01-02 14:00:00
24915
38866
2014-01-02 15:00:00
23538
33820
2014-01-02 16:00:00
23361
30080
2014-01-02 17:00:00
27284
22922
2014-01-02 18:00:00
24176
32155
2014-01-02 19:00:00
23924
27763
2014-01-02 20:00:00
23111
32343
2014-01-02 21:00:00
20348
28907
2014-01-02 22:00:00
27136
38634
2014-01-02 23:00:00
28649
29950
2014-01-03 00:00:00
21292
26395
df. index = df. index + pd. DateOffset( months= 4 , days= 5 )
df. head( )
german_army
allied_army
2014-05-06 00:00:00
28755
33938
2014-05-06 01:00:00
25176
28631
2014-05-06 02:00:00
23261
39685
2014-05-06 03:00:00
28686
27756
2014-05-06 04:00:00
24588
25681
df. shift( 1 ) . head( )
german_army
allied_army
2014-05-06 00:00:00
NaN
NaN
2014-05-06 01:00:00
28755.0
33938.0
2014-05-06 02:00:00
25176.0
28631.0
2014-05-06 03:00:00
23261.0
39685.0
2014-05-06 04:00:00
28686.0
27756.0
df. shift( - 1 ) . tail( )
german_army
allied_army
2014-05-09 23:00:00
26903.0
39144.0
2014-05-10 00:00:00
27576.0
39759.0
2014-05-10 01:00:00
25232.0
35246.0
2014-05-10 02:00:00
23391.0
21044.0
2014-05-10 03:00:00
NaN
NaN
df. resample( 'D' ) . sum ( )
german_army
allied_army
2014-05-06
605161
755962
2014-05-07
608100
740396
2014-05-08
589744
700297
2014-05-09
607092
719283
2014-05-10
103102
135193
df. resample( 'D' ) . mean( )
german_army
allied_army
2014-05-06
25215.041667
31498.416667
2014-05-07
25337.500000
30849.833333
2014-05-08
24572.666667
29179.041667
2014-05-09
25295.500000
29970.125000
2014-05-10
25775.500000
33798.250000
df. resample( 'D' ) . min ( )
german_army
allied_army
2014-05-06
24882.0
31310.0
2014-05-07
25311.0
30969.5
2014-05-08
24422.5
28318.0
2014-05-09
24941.5
32082.5
2014-05-10
26067.5
37195.0
df. resample( 'D' ) . median( )
german_army
allied_army
2014-05-06
24882.0
31310.0
2014-05-07
25311.0
30969.5
2014-05-08
24422.5
28318.0
2014-05-09
24941.5
32082.5
2014-05-10
26067.5
37195.0
df. resample( 'D' ) . first( )
german_army
allied_army
2014-05-06
28755
33938
2014-05-07
26401
20189
2014-05-08
21292
26395
2014-05-09
25764
22613
2014-05-10
26903
39144
df. resample( 'D' ) . last( )
german_army
allied_army
2014-05-06
28214
32110
2014-05-07
28649
29950
2014-05-08
28379
32600
2014-05-09
26752
22379
2014-05-10
23391
21044
df. resample( 'D' ) . ohlc( )
german_army
allied_army
open
high
2014-05-06
28755
29206
2014-05-07
26401
29958
2014-05-08
21292
29786
2014-05-09
25764
29952
2014-05-10
26903
27576
按时间分组数据
2016 年 3 月 13 日,Pandas 版本 0.18.0 发布,重采样功能的运行方式发生了重大变化。 本教程遵循 v0.18.0,不适用于以前版本的 pandas。
首先让我们加载我们关心的模块。
import pandas as pd
import datetime
import numpy as np
接下来,让我们创建一些样例数据,我们可以将它们按时间分组作为样本。 在这个例子中,我创建了一个包含两列 365 行的数据帧。一列是日期,第二列是数值。
base = datetime. datetime. today( )
date_list = [ base - datetime. timedelta( days= x) for x in range ( 0 , 365 ) ]
score_list = list ( np. random. randint( low= 1 , high= 1000 , size= 365 ) )
df = pd. DataFrame( )
df[ 'datetime' ] = date_list
df[ 'datetime' ] = pd. to_datetime( df[ 'datetime' ] )
df. index = df[ 'datetime' ]
df[ 'score' ] = score_list
df. head( )
datetime
score
datetime
2016-06-02 09:57:54.793972
2016-06-02 09:57:54.793972
900
2016-06-01 09:57:54.793972
2016-06-01 09:57:54.793972
121
2016-05-31 09:57:54.793972
2016-05-31 09:57:54.793972
547
2016-05-30 09:57:54.793972
2016-05-30 09:57:54.793972
504
2016-05-29 09:57:54.793972
2016-05-29 09:57:54.793972
304
在 pandas 中,按时间分组的最常用方法是使用.resample()
函数。 在 v0.18.0 中,此函数是两阶段的。 这意味着df.resample('M')
创建了一个对象,我们可以对其应用其他函数(mean
,count
,sum
等)
df. resample( 'M' ) . mean( )
score
datetime
2015-06-30
513.629630
2015-07-31
561.516129
2015-08-31
448.032258
2015-09-30
548.000000
2015-10-31
480.419355
2015-11-30
487.033333
2015-12-31
499.935484
2016-01-31
429.193548
2016-02-29
520.413793
2016-03-31
349.806452
2016-04-30
395.500000
2016-05-31
503.451613
2016-06-30
510.500000
df. resample( 'M' ) . sum ( )
score
datetime
2015-06-30
13868
2015-07-31
17407
2015-08-31
13889
2015-09-30
16440
2015-10-31
14893
2015-11-30
14611
2015-12-31
15498
2016-01-31
13305
2016-02-29
15092
2016-03-31
10844
2016-04-30
11865
2016-05-31
15607
2016-06-30
1021
分组有很多选项。 你可以在 Pandas 的时间序列文档 中了解它们的更多信息,但是,为了你的方便,我也在下面列出了它们。
值
描述
B
business day frequency
C
custom business day frequency (experimental)
D
calendar day frequency
W
weekly frequency
M
month end frequency
BM
business month end frequency
CBM
custom business month end frequency
MS
month start frequency
BMS
business month start frequency
Q
quarter end frequency
BQ
business quarter endfrequency
QS
quarter start frequency
BQS
business quarter start frequency
A
year end frequency
BA
business year end frequency
AS
year start frequency
BAS
business year start frequency
BH
business hour frequency
H
hourly frequency
T
minutely frequency
S
secondly frequency
L
milliseonds
U
microseconds
N
nanoseconds
按小时分组数据
import pandas as pd
import numpy as np
time = pd. date_range( '1/1/2000' , periods= 2000 , freq= '5min' )
series = pd. Series( np. random. randint( 100 , size= 2000 ) , index= time)
series[ 0 : 10 ]
'''
2000-01-01 00:00:00 40
2000-01-01 00:05:00 13
2000-01-01 00:10:00 99
2000-01-01 00:15:00 72
2000-01-01 00:20:00 4
2000-01-01 00:25:00 36
2000-01-01 00:30:00 24
2000-01-01 00:35:00 20
2000-01-01 00:40:00 83
2000-01-01 00:45:00 44
Freq: 5T, dtype: int64
'''
series. groupby( series. index. hour) . mean( )
'''
0 50.380952
1 49.380952
2 49.904762
3 53.273810
4 47.178571
5 46.095238
6 49.047619
7 44.297619
8 53.119048
9 48.261905
10 45.166667
11 54.214286
12 50.714286
13 56.130952
14 50.916667
15 42.428571
16 46.880952
17 56.892857
18 54.071429
19 47.607143
20 50.940476
21 50.511905
22 44.550000
23 50.250000
dtype: float64
'''
对行分组
import pandas as pd
raw_data = { 'regiment' : [ 'Nighthawks' , 'Nighthawks' , 'Nighthawks' , 'Nighthawks' , 'Dragoons' , 'Dragoons' , 'Dragoons' , 'Dragoons' , 'Scouts' , 'Scouts' , 'Scouts' , 'Scouts' ] ,
'company' : [ '1st' , '1st' , '2nd' , '2nd' , '1st' , '1st' , '2nd' , '2nd' , '1st' , '1st' , '2nd' , '2nd' ] ,
'name' : [ 'Miller' , 'Jacobson' , 'Ali' , 'Milner' , 'Cooze' , 'Jacon' , 'Ryaner' , 'Sone' , 'Sloan' , 'Piger' , 'Riani' , 'Ali' ] ,
'preTestScore' : [ 4 , 24 , 31 , 2 , 3 , 4 , 24 , 31 , 2 , 3 , 2 , 3 ] ,
'postTestScore' : [ 25 , 94 , 57 , 62 , 70 , 25 , 94 , 57 , 62 , 70 , 62 , 70 ] }
df = pd. DataFrame( raw_data, columns = [ 'regiment' , 'company' , 'name' , 'preTestScore' , 'postTestScore' ] )
df
regiment
company
name
preTestScore
postTestScore
0
Nighthawks
1st
Miller
4
25
1
Nighthawks
1st
Jacobson
24
94
2
Nighthawks
2nd
Ali
31
57
3
Nighthawks
2nd
Milner
2
62
4
Dragoons
1st
Cooze
3
70
5
Dragoons
1st
Jacon
4
25
6
Dragoons
2nd
Ryaner
24
94
7
Dragoons
2nd
Sone
31
57
8
Scouts
1st
Sloan
2
62
9
Scouts
1st
Piger
3
70
10
Scouts
2nd
Riani
2
62
11
Scouts
2nd
Ali
3
70
regiment_preScore = df[ 'preTestScore' ] . groupby( df[ 'regiment' ] )
regiment_preScore. mean( )
'''
regiment
Dragoons 15.50
Nighthawks 15.25
Scouts 2.50
Name: preTestScore, dtype: float64
'''
Pandas 中的分层数据
import pandas as pd
raw_data = { 'regiment' : [ 'Nighthawks' , 'Nighthawks' , 'Nighthawks' , 'Nighthawks' , 'Dragoons' , 'Dragoons' , 'Dragoons' , 'Dragoons' , 'Scouts' , 'Scouts' , 'Scouts' , 'Scouts' ] ,
'company' : [ '1st' , '1st' , '2nd' , '2nd' , '1st' , '1st' , '2nd' , '2nd' , '1st' , '1st' , '2nd' , '2nd' ] ,
'name' : [ 'Miller' , 'Jacobson' , 'Ali' , 'Milner' , 'Cooze' , 'Jacon' , 'Ryaner' , 'Sone' , 'Sloan' , 'Piger' , 'Riani' , 'Ali' ] ,
'preTestScore' : [ 4 , 24 , 31 , 2 , 3 , 4 , 24 , 31 , 2 , 3 , 2 , 3 ] ,
'postTestScore' : [ 25 , 94 , 57 , 62 , 70 , 25 , 94 , 57 , 62 , 70 , 62 , 70 ] }
df = pd. DataFrame( raw_data, columns = [ 'regiment' , 'company' , 'name' , 'preTestScore' , 'postTestScore' ] )
df
regiment
company
name
preTestScore
postTestScore
0
Nighthawks
1st
Miller
4
25
1
Nighthawks
1st
Jacobson
24
94
2
Nighthawks
2nd
Ali
31
57
3
Nighthawks
2nd
Milner
2
62
4
Dragoons
1st
Cooze
3
70
5
Dragoons
1st
Jacon
4
25
6
Dragoons
2nd
Ryaner
24
94
7
Dragoons
2nd
Sone
31
57
8
Scouts
1st
Sloan
2
62
9
Scouts
1st
Piger
3
70
10
Scouts
2nd
Riani
2
62
11
Scouts
2nd
Ali
3
70
df = df. set_index( [ 'regiment' , 'company' ] , drop= False )
df
regiment
company
name
preTestScore
postTestScore
regiment
company
Nighthawks
1st
Nighthawks
1st
Miller
4
1st
Nighthawks
1st
Jacobson
24
94
2nd
Nighthawks
2nd
Ali
31
57
2nd
Nighthawks
2nd
Milner
2
62
Dragoons
1st
Dragoons
1st
Cooze
3
1st
Dragoons
1st
Jacon
4
25
2nd
Dragoons
2nd
Ryaner
24
94
2nd
Dragoons
2nd
Sone
31
57
Scouts
1st
Scouts
1st
Sloan
2
1st
Scouts
1st
Piger
3
70
2nd
Scouts
2nd
Riani
2
62
2nd
Scouts
2nd
Ali
3
70
df = df. set_index( [ 'regiment' , 'company' ] )
df
name
preTestScore
postTestScore
regiment
company
Nighthawks
1st
Miller
4
1st
Jacobson
24
94
2nd
Ali
31
57
2nd
Milner
2
62
Dragoons
1st
Cooze
3
1st
Jacon
4
25
2nd
Ryaner
24
94
2nd
Sone
31
57
Scouts
1st
Sloan
2
1st
Piger
3
70
2nd
Riani
2
62
2nd
Ali
3
70
df. index
MultiIndex( levels= [ [ 'Dragoons' , 'Nighthawks' , 'Scouts' ] , [ '1st' , '2nd' ] ] ,
labels= [ [ 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 2 , 2 , 2 , 2 ] , [ 0 , 0 , 1 , 1 , 0 , 0 , 1 , 1 , 0 , 0 , 1 , 1 ] ] ,
names= [ 'regiment' , 'company' ] )
df. swaplevel( 'regiment' , 'company' )
name
preTestScore
postTestScore
company
regiment
1st
Nighthawks
Miller
4
25
Nighthawks
Jacobson
24
94
2nd
Nighthawks
Ali
31
57
Nighthawks
Milner
2
62
1st
Dragoons
Cooze
3
70
Dragoons
Jacon
4
25
2nd
Dragoons
Ryaner
24
94
Dragoons
Sone
31
57
1st
Scouts
Sloan
2
62
Scouts
Piger
3
70
2nd
Scouts
Riani
2
62
Scouts
Ali
3
70
df. sum ( level= 'regiment' )
preTestScore
postTestScore
regiment
Nighthawks
61
238
Dragoons
62
246
Scouts
10
264