Pandas技巧总结

Pandas 常用的数据结构为: Series(一维数组),DataFrame(二维数组)

1. Series(一维数组)

import pandas as pd
import numpy as np
n=np.random.randn(5) # draw 5 random values as an ndarray
index=['a','b','c','d','e']
# Build a Series from the values, labelled with the letters above
s2=pd.Series(n,index=index)
s2
"""
a   -1.255541
b   -0.342908
c   -1.023813
d    1.332438
e    0.332997
dtype: float64
"""

2. DataFrame(二维数组)

1.通过numpy生成DataFrame

dates=pd.date_range('today',periods=6) # 6 timestamps starting at the current time, used as the index
num_arr=np.random.randn(6,4) # 6x4 numpy array of random values
columns=['A','B','C','D'] # list used as the column names
df1=pd.DataFrame(num_arr,index=dates,columns=columns)
df1

"""
                                    A           B           C           D
2018-05-01 13:40:38.447459  -2.584642   -0.057754    1.513665    0.502229
2018-05-02 13:40:38.447459   0.238058   -1.201849    0.607927   -0.489668
2018-05-03 13:40:38.447459   0.337550   -1.714525    0.864874   -0.363594
2018-05-04 13:40:38.447459   1.382850   -1.214867   -0.076234   -1.823509
2018-05-05 13:40:38.447459   0.130625   -1.065560   -0.219137    0.143160
2018-05-06 13:40:38.447459   0.831728    1.230709    0.109697    0.192017
"""

2.通过字典数组创建DataFrame

# Column-oriented data: each key is a column name and each list holds that
# column's 10 values (np.nan marks a missing age).
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

# Row labels, one per record
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df2 = pd.DataFrame(data, index=labels)
df2
"""
    age   animal    priority    visits
a   2.5    cat        yes         1
b   3.0    cat        yes         3
c   0.5    snake       no         2
d   NaN    dog        yes         3
e   5.0    dog         no         2
f   2.0    cat         no         3
g   4.5    snake       no         1
h   NaN    cat        yes         1
i   7.0    dog         no         2
j   3.0    dog         no         1
"""

3.DataFrame的基本操作:

df2.index  # row labels
# Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
df2.columns  # column labels
# Index(['age', 'animal', 'priority', 'visits'], dtype='object')
df2.values  # the data as a NumPy array
"""
array([[2.5, 'cat', 'yes', 1],
       [3.0, 'cat', 'yes', 3],
       [0.5, 'snake', 'no', 2],
       [nan, 'dog', 'yes', 3],
       [5.0, 'dog', 'no', 2],
       [2.0, 'cat', 'no', 3],
       [4.5, 'snake', 'no', 1],
       [nan, 'cat', 'yes', 1],
       [7.0, 'dog', 'no', 2],
       [3.0, 'dog', 'no', 1]], dtype=object)
"""

# Sorting:
df2.sort_values(by='age', ascending=False)
# Sort by age in descending order (the default, ascending=True, sorts ascending).
df2.sort_values(by=['age', 'visits'], ascending=[False, True])
# Sort by age descending, then by visits ascending.

# Indexing:
df2['age']
df2.age  # equivalent to df2['age']
df2[['age', 'animal']]  # pass a list of column names to select several columns
df2.iloc[1:3]  # rows 2 and 3 by position; equivalent to df2[1:3]
df2.iloc[2:4, 1:3]  # rows 3-4 and columns 2-3, both by position

# Work on a copy so the edits below leave df2 untouched and the same data
# can be reused by other steps. The copy must exist before it is modified.
df3 = df2.copy()
df3.loc['f', 'age'] = 1.5  # modify a value addressed by row label and column name
# Overwrite the cell at row 2, column 1 by position; which column that is
# depends on the column order of df2.
df3.iat[1, 0] = 222
df3.isnull()  # element-wise mask that is True where a value is missing

# Adding a column:
num = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], index=df3.index)
df3['No.'] = num  # add a new column named 'No.'

# Missing-value handling (both calls return a new frame; df3 is unchanged):
df3.fillna(value=33333333)  # replace missing values with the given value
df3.dropna(how='any')  # drop every row that contains at least one NaN

4.时间序列索引

# Build a Series indexed by every day of 2018 and filled with random values
dti = pd.date_range(start='2018-01-01', end='2018-12-31', freq='D') 
s = pd.Series(np.random.rand(len(dti)), index=dti)
# Sum of the values of s that fall on a Wednesday:
s[s.index.weekday == 2].sum() # weekday numbering starts at 0 for Monday
# Mean of the values of s for each month:
# NOTE(review): newer pandas releases deprecate the 'M' alias in favour of 'ME' — confirm against the installed version
s.resample('M').mean()
# Time-zone handling:
s = pd.date_range('today', periods=1, freq='D') # a single timestamp for the current time (this rebinds s)
ts = pd.Series(np.random.randn(len(s)), s) # one random value indexed by that timestamp
ts_utc = ts.tz_localize('UTC') # attach the UTC time zone to the index
ts_utc.tz_convert('Asia/Shanghai') # convert to the Asia/Shanghai time zone

5. DataFrame 条件查找

# Sample data

data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 
                   'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 
                     'yes', 'no', 'no']}

# Row labels, one per record
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

df = pd.DataFrame(data, index=labels)

df

"""
    age   animal    priority    visits
a   2.5    cat        yes         1
b   3.0    cat        yes         3
c   0.5    snake       no         2
d   NaN    dog        yes         3
e   5.0    dog         no         2
f   2.0    cat         no         3
g   4.5    snake       no         1
h   NaN    cat        yes         1
i   7.0    dog         no         2
j   3.0    dog         no         1
"""

df[df['age'] > 3] # all rows whose age is greater than 3
df[(df['animal'] == 'cat') & (df['age'] < 3)] # all rows that are cats with age below 3
df[df['animal'].isin(['cat', 'dog'])] # rows whose animal is one of the listed values
df.loc[df.index[[3, 4, 8]], ['animal', 'age']] # select by row labels (taken from positions 3, 4, 8) and column names
df['priority'].map({'yes': True, 'no': False}) 
# Map the priority column: 'yes' becomes True and 'no' becomes False
#---------------------------------------------------------------------------
df = pd.DataFrame(np.random.random(size=(5, 10)), columns=list('abcdefghij'))
df.sum().idxmin()  # label of the column with the smallest sum; idxmax()/idxmin() are Series methods returning the label of the max/min value

6. 数据清洗

# Deliberately messy sample data: inconsistent casing in From_To, missing
# flight numbers, list-valued delays and stray characters in Airline.
df = pd.DataFrame({'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm', 
                               'Budapest_PaRis', 'Brussels_londOn'],
              'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
              'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
                   'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )', 
                               '12. Air France', '"Swiss Air"']})
df
"""
    Airline             FlightNumber    From_To       RecentDelays
0   KLM(!)              10045.0       LoNDon_paris      [23, 47]
1   <Air France> (12)   NaN           MAdrid_miLAN      []
2   (British Airways. ) 10065.0     londON_StockhOlm    [24, 43, 87]
3   12. Air France      NaN           Budapest_PaRis    [13]
4   "Swiss Air"         10085.0      Brussels_londOn    [67, 32]
"""
# FlightNumber has missing values, and the known values increase in steps of 10.
# Fill the gaps by interpolation and convert the column to int.
df['FlightNumber'] = df['FlightNumber'].interpolate().astype(int)
df
"""
    Airline          FlightNumber       From_To         RecentDelays
0   KLM(!)              10045         LoNDon_paris        [23, 47]
1   <Air France> (12)   10055         MAdrid_miLAN        []
2   (British Airways. ) 10065       londON_StockhOlm      [24, 43, 87]
3   12. Air France      10075         Budapest_PaRis      [13]
4   "Swiss Air"         10085        Brussels_londOn      [67, 32]
"""
# From_To should really be two separate columns, From and To:
# split it on '_' into a new two-column frame.
temp = df.From_To.str.split('_', expand=True)
temp.columns = ['From', 'To']
temp
"""
     From        To
0   LoNDon      paris
1   MAdrid      miLAN
2   londON      StockhOlm
3   Budapest    PaRis
4   Brussels    londOn
"""
# The place names are inconsistently cased (e.g. londON should be London),
# so the data needs to be normalised.
temp['From'] = temp['From'].str.capitalize()
temp['To'] = temp['To'].str.capitalize()
temp
"""

    From        To
0   London      Paris
1   Madrid      Milan
2   London      Stockholm
3   Budapest    Paris
4   Brussels    London
"""
# Replace the bad data with the cleaned data:
# drop the original From_To column, then join the tidy From and To columns.
df = df.drop('From_To', axis=1)
df = df.join(temp)

# Strip stray characters: many Airline values carry extra symbols and digits
# that would distort later analysis. Keep only the first run of letters and
# whitespace matched by the regular expression, then trim surrounding spaces.
# The pattern is a raw string so that \s reaches the regex engine unchanged
# instead of being read as an (invalid) string escape sequence.
df['Airline'] = df['Airline'].str.extract(r'([a-zA-Z\s]+)', expand=False).str.strip()
df
"""
    Airline          FlightNumber   RecentDelays    From        To
0   KLM              10045          [23, 47]        London      Paris
1   Air France       10055          []              Madrid      Milan
2   British Airways  10065          [24, 43, 87]    London      Stockholm
3   Air France       10075          [13]            Budapest    Paris
4   Swiss Air        10085          [67, 32]        Brussels    London
"""

# Normalise the format: RecentDelays stores a list per row, and the lists differ
# in length, which makes later analysis awkward. Expand the lists so that the
# elements at the same position form one column, with NaN where a value is absent.

delays = df['RecentDelays'].apply(pd.Series)

# Name the new columns delay_1, delay_2, ... (numbering starts at 1)
delays.columns = ['delay_{}'.format(n) for n in range(1, len(delays.columns)+1)] 
#['delay_1', 'delay_2', 'delay_3']

df = df.drop('RecentDelays', axis=1).join(delays) # axis=1 refers to columns, axis=0 to rows
df
"""
   Airline     FlightNumber   From      To       delay_1  delay_2  delay_3
0  KLM              10045     London    Paris      23.0    47.0    NaN
1  Air France       10055     Madrid    Milan      NaN     NaN     NaN
2  British Airways  10065     London    Stockholm  24.0    43.0    87.0
3  Air France       10075     Budapest  Paris      13.0    NaN     NaN
4  Swiss Air        10085     Brussels  London     67.0    32.0    NaN
"""

7. 数据预处理

# Sample data: student names and their numeric grades.
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Candy', 'Dany', 'Ella', 'Frank', 'Grace', 'Jenny'],
    'grades': [58, 83, 79, 65, 93, 45, 61, 88],
})


def choice(x):
    """Binarise a grade: return 1 when x is strictly greater than 60, else 0."""
    return 1 if x > 60 else 0


# Map every grade through choice(). Series.map keeps the original index, so the
# result lines up with df row by row whatever the index is; a fresh pd.Series
# built from map() gets a 0..n-1 index and is re-aligned by label on assignment.
df['grades'] = df['grades'].map(choice)
df

"""
   grades  name
0    0     Alice
1    1     Bob
2    1     Candy
3    1     Dany
4    1     Ella
5    0     Frank
6    1     Grace
7    1     Jenny
"""

# Min-max normalisation:
def normalization(df):
    """Rescale df linearly so each minimum maps to 0 and each maximum to 1.

    For a DataFrame the minimum and range are taken per column. The input is
    not modified; a new object is returned. A column whose values are all
    equal has a zero range and therefore comes back as NaN.
    """
    col_min = df.min()
    col_range = df.max().sub(col_min)
    return df.sub(col_min).div(col_range)
df = pd.DataFrame(np.random.random(size=(5, 3)))  # 5x3 frame of random values
print(df)  # show the raw values for comparison
normalization(df)  # returns the rescaled frame; df itself is not modified

8. 数据读取

#csv file raw data:
"""
,c1,c2,c3
a,0,5,10
b,1,6,11
c,2,7,12
d,3,8,13
e,4,9,14
"""
import pandas as pd
# Read the file with default settings: the first line supplies the column names
df=pd.read_csv('ceshi.csv')
df
"""
 Unnamed: 0  c1  c2  c3
0          a   0   5  10
1          b   1   6  11
2          c   2   7  12
3          d   3   8  13
4          e   4   9  14
"""

df2=pd.read_csv('ceshi.csv',header=None,names=range(2,5))
# header=None declares that the file has no header row, so read_csv would
# number the columns automatically unless explicit names are supplied,
# as done here with names=range(2,5).
# NOTE(review): each line of the file has four fields but only three names are
# given — confirm how the installed pandas treats the leading field.
df2
"""
    2   3   4
0  c1  c2  c3
1   0   5  10
2   1   6  11
3   2   7  12
4   3   8  13
5   4   9  14
"""

df3=pd.read_csv('ceshi.csv',header=0,names=range(2,5))
# header=0 marks line 0 of the file (the first line; numbering starts at 0)
# as the header row; the supplied names then replace the labels read from it.
df3
"""
   2  3   4
0  0  5  10
1  1  6  11
2  2  7  12
3  3  8  13
4  4  9  14
"""

# Set the data type of individual columns (dtype=), read only the selected
# columns (usecols=), and use one column as the index (index_col=).
# NOTE(review): `path` is not defined in this file; presumably a directory prefix string — confirm.
df= pd.read_csv(path+'xx.csv',dtype={"col1": str, "col2": str},usecols=[0,1,2,3],index_col=0)


根据某一列的取值筛选行:
https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas

9. 缺测处理

查看DataFrame的缺测情况,显示缺失的行

>>> df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3), range(3)])
>>> df.isnull()   #df.isnull().sum()
       0      1      2
0  False  False  False
1  False   True  False
2  False  False   True
3  False  False  False
4  False  False  False
>>> df.isnull().any(axis=1)  
0    False
1     True
2     True
3    False
4    False
dtype: bool
>>> df[df.isnull().any(axis=1)]
   0   1   2
1  0 NaN   0
2  0   0 NaN
## Equivalent to:
null_bool=df.isnull().any(axis=1)  # True for each row that contains at least one missing value
df[null_bool]
#######################################

df.dropna(axis=1,how='all') 
# Drop the columns whose values are all NaN.
# The default is axis=0, which drops rows instead
# (and the default how='any' drops a row as soon as it contains one NaN).

df.fillna(999) # fill every missing value with 999
df.fillna({0:777,1:888,2:999}) # per-column fill values: 777 for column 0, 888 for column 1, 999 for column 2

# 统计df每一行出现0的次数
In:
df = pd.DataFrame({'a':[1,0,0,1,3], 'b':[0,0,1,0,1], 'c':[0,0,0,0,0]})
df
Out:
   a  b  c
0  1  0  0
1  0  0  0
2  0  1  0
3  1  0  0
4  3  1  0
In:

(df == 0).astype(int).sum(axis=1)  # or: (df == 0).sum(axis=1)
Out:
0    2
1    3
2    2
3    2
4    1
dtype: int64
# Can be combined with groupby:
def count0(df):
    """Count the entries equal to 0 along axis 0 of df (per column for a DataFrame)."""
    zero_mask = df == 0
    return zero_mask.sum(axis=0)
# NOTE(review): the frame built above only has columns a, b and c, so grouping
# by 'id' presumably assumes a different frame that carries an 'id' key — confirm.
df.groupby('id').agg(count0).astype('int')

Reference

100 pandas puzzles

猜你喜欢

转载自blog.csdn.net/maverick_7/article/details/80160104