pandas_映射
start
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
# Demo score table: 40 rows x 3 subjects drawn from N(100, 30).
# The float samples are clipped to the uint8 range and cast explicitly.
# Passing float data together with dtype=np.uint8 to the constructor is a
# lossy cast whose handling differs between pandas versions, and samples
# outside 0..255 would wrap around instead of saturating.
df = DataFrame(
    np.clip(np.random.normal(100, scale=30, size=(40, 3)), 0, 255).astype(np.uint8),
    columns=['yu', 'shu', 'ying'],
)
df
out:yu shu ying 0 109 164 90 1 118 94 158 2 105 70 115 …
# Show a copy of the frame indexed by the 'yu' column; the result is not
# assigned and inplace is not requested, so `df` itself is left untouched.
df.set_index(keys='yu')
out: 可以用set_index 来设置index
# First in-place pass over the whole frame: 70 and 115 both become 60.
df.replace(to_replace={70: 60, 115: 60}, inplace=True)
# Second in-place pass: 115 -> 60 again, and NaN -> 1024.
df.replace(to_replace={115: 60, np.nan: 1024}, inplace=True)
# Derive the 'Java' column from 'ying': add 10, divide by 3, double,
# then truncate to an int.
df['Java'] = df['ying'].map(lambda score: int((score + 10) / 3 * 2))
根据 'ying' 列创建 'Java' 列
out:yu shu ying Java 0 109 164 90 66 1 118 94 158 112 …
def cover(x):
    """Map a numeric score to its grade label.

    Bands (upper bound exclusive):
        x < 60   -> '不及格'
        x < 80   -> '及格'
        x < 100  -> '中等'
        x < 120  -> '良好'
        otherwise -> '优秀'

    A NaN score is returned unchanged. Every ``<`` comparison against NaN
    is False, so without the guard a missing score would fall through to
    the top grade.
    """
    if x != x:  # NaN is the only value that is not equal to itself
        return x
    if x < 60:
        return '不及格'
    if x < 80:
        return '及格'
    if x < 100:
        return '中等'
    if x < 120:
        return '良好'
    return '优秀'
# Grade every 'ying' score with cover() and store the labels in 'Lever'.
# NOTE(review): 'Lever' is presumably a typo for 'Level'; the column name is
# kept as-is because the recorded output below uses it.
df['Lever'] = df.ying.map(cover)
df
out:yu shu ying Java Lever 0 109 164 90 66 中等 1 118 94 158 112 优秀 2 105 70 115 83 良好
更改索引名字
# Relabel the first three row labels (0, 1, 2) as 'A', 'B', 'C' in place;
# labels not listed in the mapping keep their current name.
# NOTE(review): df2 is not created anywhere in the visible part of this
# file — confirm where it comes from.
df2.rename(index={0: 'A', 1: 'B', 2: 'C'}, inplace=True)
df2
out:yu shu ying Java Lever A 109 优秀 中等 66 中等 B 118 中等 优秀 112 优秀 C 105 及格 良好 83 良好 3 78 中等 良好 86 良好 4 109 中等 中等 62 中等 5 73 及格 良好 73 良好 6 65 良好 及格 50 及格
# Rename the three subject columns to their Chinese names, in place.
df2.rename(columns={'yu': '语文', 'shu': '数学', 'ying': '英语'}, inplace=True)
异常值检测与过滤
# Fresh 40 x 3 score table for the outlier section, drawn from N(100, 30).
# The samples are clipped to 0..255 and cast explicitly. Passing float data
# with dtype=np.uint8 to the constructor is a lossy cast whose handling
# differs between pandas versions, and out-of-range samples would wrap.
df = DataFrame(
    np.clip(np.random.normal(100, scale=30, size=(40, 3)), 0, 255).astype(np.uint8),
    columns=['yu', 'shu', 'ying'],
)
df
out: yu shu ying 0 85 64 53 1 70 90 126 2 110 93 128 3 94 108 73 4 67 132 154 5 81 158 69 6 95 157 90 7 123 112 105 8 108 78 157 …
# Per-column mean and standard deviation, computed before the artificial
# outlier below is written into the frame.
m = df.mean()
std = df.std()
# Plant a known extreme value at row position 8, column position 2.
df.iloc[8, 2] = 200
# 3-sigma rule: flag cells more than three standard deviations away from
# the column mean. The absolute deviation is used so that unusually LOW
# values are flagged as well as unusually high ones.
cond = (df - m).abs() > 3 * std
# Show every row that contains at least one flagged cell.
df[cond.any(axis=1)]
out:yu shu ying (8, 2) 8 104 98 200 299
# Collect the labels of every row flagged by `cond` and drop them in place.
index = df[cond.any(axis=1)].index
df.drop(labels=index, axis=0, inplace=True)
# Draw 10 row positions with replacement. take() is positional, so the
# upper bound is the current row count, not the original 40. After the
# drop above, a fixed bound of 40 could pick a position that no longer
# exists and raise IndexError.
index = np.random.randint(0, len(df), size=10)
df1 = df.take(index)
使用take和randint可以完成随机抽样的效果
out:yu shu ying 18 81 132 84 2 85 74 84 4 118 112 103 5 106 49 66 30 130 91 66 6 81 114 152 34 104 119 61 19 110 72 39 22 159 147 81 7 132 106 111
数据聚合
# Small demo table: two categorical keys and two numeric value columns.
df = DataFrame({
    'color': ['white', 'black', 'white', 'white', 'black', 'black'],
    'status': ['up', 'up', 'down', 'down', 'down', 'up'],
    'value1': [12.33, 14.55, 22.34, 27.84, 23.40, 18.33],
    'value2': [11.23, 31.80, 29.99, 31.18, 18.25, 22.44],
})
# Mean of the numeric columns per colour. numeric_only=True keeps the string
# column 'status' out of the aggregation; recent pandas versions no longer
# drop non-numeric columns silently and raise instead.
ret = df.groupby(by=['color']).mean(numeric_only=True)
ret
out: value1 value2 color black 18.760000 24.163333 white 20.836667 24.133333
# Mean per (color, status) pair; the result is indexed by both keys.
ret = df.groupby(['color', 'status']).mean()
ret
out: value1 value2 color status black down 23.40 18.250 up 16.44 27.120 white down 25.09 30.585 up 12.33 11.230
# Keep the grouped object itself (no aggregation yet) so that a custom
# aggregation function can be applied to it afterwards.
ret = df.groupby(['color', 'status'])
def covert(x):
    """Summarise one group of values as a ``(mean, min, max)`` tuple.

    Args:
        x: The values of a single group; must provide ``mean()``, ``min()``
            and ``max()`` (for example a pandas Series).

    Returns:
        A 3-tuple: the mean rounded to one decimal place, then the minimum
        and the maximum, both unrounded.
    """
    return (np.round(x.mean(), 1), x.min(), x.max())
# Apply covert() to every value column of every group; each resulting cell
# holds the (rounded mean, min, max) tuple for that group.
ret.aggregate(covert)
out: value1 value2 color status black down (23.4, 23.4, 23.4) (18.2, 18.25, 18.25) up (16.4, 14.55, 18.33) (27.1, 22.44, 31.8) white down (25.1, 22.34, 27.84) (30.6, 29.99, 31.18) up (12.3, 12.33, 12.33) (11.2, 11.23, 11.23)