python_数据_pandas_3

pandas_映射

start

import pandas as pd
import numpy as np
from pandas import Series,DataFrame
# Simulated exam scores: 40 rows x 3 subjects drawn from N(100, 30).
# The floats are clipped to 0..255 and cast explicitly: handing non-integral
# floats to the constructor together with dtype=np.uint8 is rejected by recent
# pandas releases, and an unclipped cast would wrap around outside 0..255.
df = DataFrame(
    np.random.normal(100, scale=30, size=(40, 3)).clip(0, 255).astype(np.uint8),
    columns=['yu', 'shu', 'ying'],
)
df
  • out:yu shu ying
    0 109 164 90
    1 118 94 158
    2 105 70 115
# set_index returns a new DataFrame indexed by the 'yu' column; the result is
# not assigned here, so df itself keeps its original index.
df.set_index('yu')
  • out: 可以用set_index 来设置index
# Replace every 70 and every 115 in the frame with 60, modifying df in place.
df.replace({70:60,115:60},inplace=True)
  • 将dataframe中的70和115替换为60
# Dict form of replace: 115 -> 60 and NaN -> 1024, applied to df in place.
df.replace({115:60,np.nan:1024},inplace=True)
  • inplace = True 表示直接在原数据上修改(不返回新的对象)
# Derive a 'Java' column from 'ying' element by element:
# (score + 10) / 3 * 2, truncated to an int.
df['Java'] = df['ying'].map(lambda x :int(((x + 10) / 3) * 2))
  • 根据’ying’项创建’Java’项
  • out:yu shu ying Java
    0 109 164 90 66
    1 118 94 158 112
def cover(x):
    """Translate a numeric score into its grade label.

    Bands are half-open on the right: below 60, 60-79, 80-99, 100-119,
    and everything else (120 and above, or a value that compares False
    against every bound, such as NaN) gets the top label.
    """
    bands = (
        (60, '不及格'),
        (80, '及格'),
        (100, '中等'),
        (120, '良好'),
    )
    # Bounds are ascending, so the first one the score falls under decides.
    for upper_bound, label in bands:
        if x < upper_bound:
            return label
    return '优秀'
# Label each 'ying' score with its grade band via cover() and store the
# result in a new 'Lever' column.
# NOTE(review): 'Lever' is presumably a typo for 'Level' - confirm before renaming.
df['Lever'] = df['ying'].map(cover)
df
  • out:yu shu ying Java Lever
    0 109 164 90 66 中等
    1 118 94 158 112 优秀
    2 105 70 115 83 良好

更改索引名字

# Relabel row labels 0, 1 and 2 as 'A', 'B' and 'C', modifying df2 in place;
# labels not listed in the mapping are left as they are.
df2.rename(index={0: 'A', 1: 'B', 2: 'C'}, inplace=True)
df2
  • out:yu shu ying Java Lever
    A 109 优秀 中等 66 中等
    B 118 中等 优秀 112 优秀
    C 105 及格 良好 83 良好
    3 78 中等 良好 86 良好
    4 109 中等 中等 62 中等
    5 73 及格 良好 73 良好
    6 65 良好 及格 50 及格
# Rename the three subject columns to their Chinese names, modifying df2 in place.
df2.rename(columns={'yu': '语文', 'shu': '数学', 'ying': '英语'}, inplace=True)

异常值检测与过滤

# Fresh simulated scores for the outlier demo: 40 rows x 3 subjects from N(100, 30).
# The floats are clipped to 0..255 and cast explicitly: handing non-integral
# floats to the constructor together with dtype=np.uint8 is rejected by recent
# pandas releases, and an unclipped cast would wrap around outside 0..255.
df = DataFrame(
    np.random.normal(100, scale=30, size=(40, 3)).clip(0, 255).astype(np.uint8),
    columns=['yu', 'shu', 'ying'],
)
df
  • out: yu shu ying
    0 85 64 53
    1 70 90 126
    2 110 93 128
    3 94 108 73
    4 67 132 154
    5 81 158 69
    6 95 157 90
    7 123 112 105
    8 108 78 157
# Per-column mean and standard deviation, computed before the outlier is
# planted so that the planted value does not distort them.
m = df.mean()
std = df.std()
# Plant an obvious outlier at row position 8, column position 2.
df.iloc[8,2] = 200
# Three-sigma rule, applied on both sides: a value is an outlier when its
# absolute distance from the column mean exceeds three standard deviations.
# Without abs() only unusually high values would be caught.
cond = (df - m).abs() > 3 * std
# Show every row that has an outlier in at least one column.
df[cond.any(axis=1)]
  • out:yu shu ying (8, 2)
    8 104 98 200 299
# Collect the labels of rows flagged in any column, then remove those rows
# from df in place.
index = df.loc[cond.any(axis=1)].index
df.drop(index=index, inplace=True)
  • 去除异常数据
# Draw 10 random row positions (with replacement). The upper bound is the
# current row count rather than a fixed 40: rows may have been dropped above,
# and take() raises IndexError for a position past the end of the frame.
index = np.random.randint(0, len(df), size=10)
df1 = df.take(index)      # random sampling by position
  • 使用take和np.random.randint可以完成随机抽样的效果
  • out:yu shu ying
    18 81 132 84
    2 85 74 84
    4 118 112 103
    5 106 49 66
    30 130 91 66
    6 81 114 152
    34 104 119 61
    19 110 72 39
    22 159 147 81
    7 132 106 111

数据聚合

# Small demo table: two categorical columns and two numeric columns.
df = DataFrame({'color':['white','black','white','white','black','black'],
               'status':['up','up','down','down','down','up'],
               'value1':[12.33,14.55,22.34,27.84,23.40,18.33],
               'value2':[11.23,31.80,29.99,31.18,18.25,22.44]})
# Mean of the numeric columns per colour. numeric_only=True keeps the string
# column 'status' out of the aggregation; recent pandas releases raise on it
# instead of silently dropping it.
ret = df.groupby(by=['color']).mean(numeric_only=True)
ret
  • out: value1 value2
    color
    black 18.760000 24.163333
    white 20.836667 24.133333
# Mean of the remaining columns for every (color, status) combination; the
# result is indexed by both grouping keys.
ret = df.groupby(['color', 'status']).mean()
ret
  • out: value1 value2
    color status
    black down 23.40 18.250
    up 16.44 27.120
    white down 25.09 30.585
    up 12.33 11.230
# Keep the grouped object itself (no aggregation yet) so that a custom
# aggregation can be applied to it afterwards.
ret = df.groupby(by = ['color','status'])

def covert(x):
    """Summarise a group of values as a (mean, min, max) tuple.

    The mean is rounded to one decimal place; the minimum and maximum are
    returned unchanged. ``x`` must provide ``mean()``, ``min()`` and ``max()``.
    """
    smallest = x.min()
    largest = x.max()
    rounded_mean = np.round(x.mean(), 1)
    return rounded_mean, smallest, largest

# Apply covert to each column of each group; every cell of the result holds
# the tuple that covert returns.
ret.agg(covert)
  • out: value1 value2
    color status
    black down (23.4, 23.4, 23.4) (18.2, 18.25, 18.25)
    up (16.4, 14.55, 18.33) (27.1, 22.44, 31.8)
    white down (25.1, 22.34, 27.84) (30.6, 29.99, 31.18)
    up (12.3, 12.33, 12.33) (11.2, 11.23, 11.23)

猜你喜欢

转载自blog.csdn.net/sinat_39045958/article/details/86524968
今日推荐