# Basic data-processing methods with pandas

import pandas as pd
from pandas import DataFrame, Series
# Sample frame used throughout this section: one row per state, one column
# per beverage type.
alco2009 = DataFrame(
    [
        (1.20, 0.22, 0.58),
        (1.31, 0.54, 1.16),
        (1.19, 0.38, 0.74),
    ],
    columns=("Beer", "Wine", "Spirits"),
    index=("Alabama", "Alaska", "Arizona"),
)
print(alco2009)
# Output:
#          Beer  Wine  Spirits
# Alabama  1.20  0.22     0.58
# Alaska   1.31  0.54     1.16
# Arizona  1.19  0.38     0.74


# --- Reindexing ---
# reindex() returns a new frame conformed to the requested labels. Labels that
# do not exist in the original (row "Samoa", column "Water") are filled with NaN.
s_state = list(alco2009.index) + ["Samoa"]
drinks = list(alco2009.columns) + ["Water"]
nan_alco = alco2009.reindex(index=s_state, columns=drinks)
print(nan_alco)
# Output:
#          Beer  Wine  Spirits  Water
# Alabama  1.20  0.22     0.58    NaN
# Alaska   1.31  0.54     1.16    NaN
# Arizona  1.19  0.38     0.74    NaN
# Samoa     NaN   NaN      NaN    NaN


# --- Dropping missing data ---
# dropna() returns a new frame by default; pass inplace=True to modify the
# original frame instead of creating a copy.
print(nan_alco.dropna(how="all"))  # default axis=0: drop rows that are all NaN
# Output:
#          Beer  Wine  Spirits  Water
# Alabama  1.20  0.22     0.58    NaN
# Alaska   1.31  0.54     1.16    NaN
# Arizona  1.19  0.38     0.74    NaN

# The original frame is unchanged because inplace=True was not passed.
print(nan_alco)
# Output:
#          Beer  Wine  Spirits  Water
# Alabama  1.20  0.22     0.58    NaN
# Alaska   1.31  0.54     1.16    NaN
# Arizona  1.19  0.38     0.74    NaN
# Samoa     NaN   NaN      NaN    NaN

print(nan_alco.dropna(how="all", axis=1))  # drop columns that are all NaN
# Output:
#          Beer  Wine  Spirits
# Alabama  1.20  0.22     0.58
# Alaska   1.31  0.54     1.16
# Arizona  1.19  0.38     0.74
# Samoa     NaN   NaN      NaN

# With the defaults (axis=0, how="any") every row containing at least one NaN
# is dropped. Every row has NaN in "Water", so no rows survive; the columns
# are all kept.
print(nan_alco.dropna())
# Output:
# Empty DataFrame
# Columns: [Beer, Wine, Spirits, Water]
# Index: []


# --- Imputing missing data ---
# The two most common imputation techniques replace missing values with a
# constant (0, 1, ...) or with the mean of the "clean" (non-missing) values.
print(nan_alco.isnull())
# Output:
#           Beer   Wine  Spirits  Water
# Alabama  False  False    False   True
# Alaska   False  False    False   True
# Arizona  False  False    False   True
# Samoa     True   True     True   True

print(nan_alco.notnull())
# Output:
#           Beer   Wine  Spirits  Water
# Alabama   True   True     True  False
# Alaska    True   True     True  False
# Arizona   True   True     True  False
# Samoa    False  False    False  False

# Repair the "Spirits" column by mean imputation. Mean imputation has to be
# done column by column (or row by row), whereas constant imputation can be
# applied to the whole frame at once.
#
# `~` is the element-wise NOT for a boolean Series; unary `-` on a boolean
# mask raises TypeError in current NumPy/pandas. The assignment goes through
# .loc on the frame itself rather than through a previously extracted column,
# so the write reliably lands in nan_alco (no chained assignment).
clean = nan_alco["Spirits"].notnull()
nan_alco.loc[~clean, "Spirits"] = nan_alco.loc[clean, "Spirits"].mean()
sp = nan_alco["Spirits"]  # the column after imputation
print(nan_alco)
# Output:
#          Beer  Wine   Spirits  Water
# Alabama  1.20  0.22  0.580000    NaN
# Alaska   1.31  0.54  1.160000    NaN
# Arizona  1.19  0.38  0.740000    NaN
# Samoa     NaN   NaN  0.826667    NaN

# fillna(val) is the simplest way to put a constant val into every gap.
# ffill() propagates the last valid observation forward; bfill() fills each
# gap from the next valid observation. For both, axis=0 (the default) fills
# down each column and axis=1 fills across each row.
# Unless inplace=True is given, these return a new frame or series.
print(nan_alco.fillna(0))
# Output:
#          Beer  Wine   Spirits  Water
# Alabama  1.20  0.22  0.580000    0.0
# Alaska   1.31  0.54  1.160000    0.0
# Arizona  1.19  0.38  0.740000    0.0
# Samoa    0.00  0.00  0.826667    0.0

# fillna(method="ffill") is deprecated in recent pandas; ffill() is the
# direct equivalent. "Water" stays NaN because it has no valid value to copy.
print(nan_alco.ffill())
# Output:
#          Beer  Wine   Spirits  Water
# Alabama  1.20  0.22  0.580000    NaN
# Alaska   1.31  0.54  1.160000    NaN
# Arizona  1.19  0.38  0.740000    NaN
# Samoa    1.19  0.38  0.826667    NaN


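# Replacing data: replace(val_or_list, new_val) substitutes new_val for every
# occurrence of the given value (or of any value in the given list).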


# --- Combining data ---
# Merging
df1 = pd.DataFrame(
    data={
        'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        'data1': range(7),
    }
)
df2 = pd.DataFrame(
    data={
        'key': ['a', 'b', 'd'],
        'data2': range(3),
    }
)

# NOTE: the outputs below are the ones recorded by the original author; the
# exact column and row ordering may differ depending on the pandas version.
print(df1)
# Recorded output:
#    data1 key
# 0      0   b
# 1      1   b
# 2      2   a
# 3      3   c
# 4      4   a
# 5      5   a
# 6      6   b

print(df2)
# Recorded output:
#    data2 key
# 0      0   a
# 1      1   b
# 2      2   d

# No join column is named here. When none is given, merge uses the column
# names the two frames have in common as the key. It is still better to
# specify the key explicitly, as in the next call.
print(df1.merge(df2))
# Recorded output:
#    data1 key  data2
# 0      0   b      1
# 1      1   b      1
# 2      6   b      1
# 3      2   a      0
# 4      4   a      0
# 5      5   a      0

print(df1.merge(df2, on='key'))
# Recorded output:
#    data1 key  data2
# 0      0   b      1
# 1      1   b      1
# 2      6   b      1
# 3      2   a      0
# 4      4   a      0
# 5      5   a      0

# When the key columns are named differently in the two frames, name each
# side separately with left_on / right_on.
df3 = pd.DataFrame(
    data={
        'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        'data1': range(7),
    }
)
df4 = pd.DataFrame(
    data={
        'rkey': ['a', 'b', 'd'],
        'data2': range(3),
    }
)
print(df3.merge(df4, left_on='lkey', right_on='rkey'))
# Recorded output:
#    data1 lkey  data2 rkey
# 0      0    b      1    b
# 1      1    b      1    b
# 2      6    b      1    b
# 3      2    a      0    a
# 4      4    a      0    a
# 5      5    a      0    a

# merge performs an "inner" join by default, so the result keys are the
# intersection of both sides. The alternatives are "left", "right" and
# "outer". An outer join takes the union of the keys, combining the effect
# of a left join and a right join.
print(df1.merge(df2, how='outer'))
# Recorded output:
#    data1 key  data2
# 0    0.0   b    1.0
# 1    1.0   b    1.0
# 2    6.0   b    1.0
# 3    2.0   a    0.0
# 4    4.0   a    0.0
# 5    5.0   a    0.0
# 6    3.0   c    NaN
# 7    NaN   d    2.0

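# Other merge variants (many-to-many joins, multiple keys, overlapping column
# names). `left` and `right` are placeholder frames, not defined in this file:
#pd.merge(df1, df2, on='key', how='left')
#pd.merge(left, right, on=['key1', 'key2'], how='outer')
#pd.merge(left, right, on='key1')
#pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

# Merging on the index
# Sometimes the join key of a DataFrame lives in its index. In that case, pass
# left_index=True or right_index=True (or both) to use the index as the merge key.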

# --- Concatenating along an axis ---
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
print(pd.concat([s1, s2, s3]))
# Output:
# a    0
# b    1
# c    2
# d    3
# e    4
# f    5
# g    6
# dtype: int64

# concat works along axis=0 by default and produces a new Series. With axis=1
# the inputs become columns and the result is a DataFrame.
print(pd.concat([s1, s2, s3], axis=1))
# Output:
#      0    1    2
# a  0.0  NaN  NaN
# b  1.0  NaN  NaN
# c  NaN  2.0  NaN
# d  NaN  3.0  NaN
# e  NaN  4.0  NaN
# f  NaN  NaN  5.0
# g  NaN  NaN  6.0

# s4 shares the labels 'f' and 'g' with s3, which makes the difference between
# the default outer join and join='inner' visible.
s4 = pd.concat([s1 * 5, s3])
print(s4)
# Output:
# a    0
# b    5
# f    5
# g    6
# dtype: int64

print(pd.concat([s4, s3], axis=1))
# Output:
#    0    1
# a  0  NaN
# b  5  NaN
# f  5  5.0
# g  6  6.0

# Passing join='inner' keeps only the labels present in every input.
print(pd.concat([s4, s3], axis=1, join='inner'))
# Output:
#    0  1
# f  5  5
# g  6  6

# To choose the labels used on the other axis, reindex the concatenated
# result. (The join_axes argument that used to do this was removed from
# pd.concat in pandas 1.0 and now raises TypeError.)
print(pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e']))
# Output:
#      0    1
# a  0.0  0.0
# c  NaN  NaN
# b  1.0  5.0
# e  NaN  NaN

# The keys argument builds a hierarchical index along the concatenation axis,
# labelling each input piece.
result = pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])
print(result)
# Output:
# one    a    0
#        b    1
# two    c    2
#        d    3
#        e    4
# three  f    5
#        g    6
# dtype: int64

# unstack() pivots the inner index level into columns.
print(result.unstack())
# Output:
#          a    b    c    d    e    f    g
# one    0.0  1.0  NaN  NaN  NaN  NaN  NaN
# two    NaN  NaN  2.0  3.0  4.0  NaN  NaN
# three  NaN  NaN  NaN  NaN  NaN  5.0  6.0


# (Footer from the source web page: "You may also like")

# Reposted from blog.csdn.net/qq_38195197/article/details/81138061