import pandas as pd
from pandas import DataFrame, Series
# --- Sample data -------------------------------------------------------------
# Small demo frame: one row per state, one column per beverage.
alco2009 = DataFrame(
    [(1.20, 0.22, 0.58),
     (1.31, 0.54, 1.16),
     (1.19, 0.38, 0.74)],
    columns=("Beer", "Wine", "Spirits"),
    index=("Alabama", "Alaska", "Arizona"),
)
print(alco2009)
# out[]:
#          Beer  Wine  Spirits
# Alabama  1.20  0.22     0.58
# Alaska   1.31  0.54     1.16
# Arizona  1.19  0.38     0.74

# --- Reindexing --------------------------------------------------------------
# Labels that do not exist in the source frame ("Samoa", "Water") are created
# and filled with NaN.
s_state = list(alco2009.index) + ["Samoa"]
drinks = list(alco2009.columns) + ["Water"]
# Both axes are named explicitly rather than mixing a positional label list
# with ``columns=``.
nan_alco = alco2009.reindex(index=s_state, columns=drinks)
print(nan_alco)
# out[]:
#          Beer  Wine  Spirits  Water
# Alabama  1.20  0.22     0.58    NaN
# Alaska   1.31  0.54     1.16    NaN
# Arizona  1.19  0.38     0.74    NaN
# Samoa     NaN   NaN      NaN    NaN

# --- Dropping missing data ---------------------------------------------------
# dropna() returns a new frame; pass inplace=True to modify the original
# frame instead of creating a copy.
print(nan_alco.dropna(how="all"))  # rows are dropped by default
# out[]:
#          Beer  Wine  Spirits  Water
# Alabama  1.20  0.22     0.58    NaN
# Alaska   1.31  0.54     1.16    NaN
# Arizona  1.19  0.38     0.74    NaN
print(nan_alco)  # the original frame is untouched
# out[]:
#          Beer  Wine  Spirits  Water
# Alabama  1.20  0.22     0.58    NaN
# Alaska   1.31  0.54     1.16    NaN
# Arizona  1.19  0.38     0.74    NaN
# Samoa     NaN   NaN      NaN    NaN
print(nan_alco.dropna(how="all", axis=1))  # drop columns that are all NaN
# out[]:
#          Beer  Wine  Spirits
# Alabama  1.20  0.22     0.58
# Alaska   1.31  0.54     1.16
# Arizona  1.19  0.38     0.74
# Samoa     NaN   NaN      NaN
# Without arguments dropna() removes every row containing at least one NaN.
# Every row has NaN in "Water", so no rows survive (the columns are kept).
print(nan_alco.dropna())
# out[]:
# Empty DataFrame
# Columns: [Beer, Wine, Spirits, Water]
# Index: []

# --- Imputing missing data ---------------------------------------------------
# The two most common imputation techniques replace missing values with a
# constant (0, 1, ...) or with the mean of the "clean" values.
print(nan_alco.isnull())
# out[]:
#           Beer   Wine  Spirits  Water
# Alabama  False  False    False   True
# Alaska   False  False    False   True
# Arizona  False  False    False   True
# Samoa     True   True     True   True
print(nan_alco.notnull())
# out[]:
#           Beer   Wine  Spirits  Water
# Alabama   True   True     True  False
# Alaska    True   True     True  False
# Arizona   True   True     True  False
# Samoa    False  False    False  False

# Repair the "Spirits" column by imputing its mean.  Mean imputation has to be
# done column by column (or row by row); constant imputation can be applied to
# the whole frame at once.
# A boolean mask is inverted with ``~`` (NumPy rejects unary ``-`` on booleans).
clean = nan_alco["Spirits"].notnull()
# Assign through a single .loc call on the frame: writing into an intermediate
# Series (``sp[mask] = ...``) is chained assignment and does not reliably
# update ``nan_alco`` (it never does under pandas Copy-on-Write).
nan_alco.loc[~clean, "Spirits"] = nan_alco.loc[clean, "Spirits"].mean()
sp = nan_alco["Spirits"]  # the imputed column
print(nan_alco)
# out[]:
#          Beer  Wine   Spirits  Water
# Alabama  1.20  0.22  0.580000    NaN
# Alaska   1.31  0.54  1.160000    NaN
# Arizona  1.19  0.38  0.740000    NaN
# Samoa     NaN   NaN  0.826667    NaN

# fillna(val) is the simplest way to put a constant into the gaps.
#   ffill()  propagates the last valid observation forward
#   bfill()  fills backwards from the next valid observation
#   axis=0   (default) fill along columns; axis=1 fill along rows
# Unless inplace=True is given, these return a new frame or series.
print(nan_alco.fillna(0))
# out[]:
#          Beer  Wine   Spirits  Water
# Alabama  1.20  0.22  0.580000    0.0
# Alaska   1.31  0.54  1.160000    0.0
# Arizona  1.19  0.38  0.740000    0.0
# Samoa    0.00  0.00  0.826667    0.0
# ffill() replaces the deprecated fillna(method="ffill") spelling.
print(nan_alco.ffill())
# out[]:
#          Beer  Wine   Spirits  Water
# Alabama  1.20  0.22  0.580000    NaN
# Alaska   1.31  0.54  1.160000    NaN
# Arizona  1.19  0.38  0.740000    NaN
# Samoa    1.19  0.38  0.826667    NaN

# Replacing data: replace(val_or_list, new_val)
# --- Combining data ----------------------------------------------------------
# Merging
# NOTE: the sample outputs in this section are illustrative; the row order of
# a merge result depends on the pandas version.
df1 = DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df2 = DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': range(3),
})
print(df1)
# out[]:
#   key  data1
# 0   b      0
# 1   b      1
# 2   a      2
# 3   c      3
# 4   a      4
# 5   a      5
# 6   b      6
print(df2)
# out[]:
#   key  data2
# 0   a      0
# 1   b      1
# 2   d      2

# No join column is given here: merge falls back to the overlapping column
# names ("key") as the join key.
print(df1.merge(df2))
# out[]:
#   key  data1  data2
# 0   b      0      1
# 1   b      1      1
# 2   b      6      1
# 3   a      2      0
# 4   a      4      0
# 5   a      5      0

# It is better practice to spell the key out explicitly.
print(df1.merge(df2, on='key'))
# out[]:
#   key  data1  data2
# 0   b      0      1
# 1   b      1      1
# 2   b      6      1
# 3   a      2      0
# 4   a      4      0
# 5   a      5      0

# When the key columns are named differently in the two frames, name each
# side separately.
df3 = DataFrame({
    'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df4 = DataFrame({
    'rkey': ['a', 'b', 'd'],
    'data2': range(3),
})
print(df3.merge(df4, left_on='lkey', right_on='rkey'))
# out[]:
#   lkey  data1 rkey  data2
# 0    b      0    b      1
# 1    b      1    b      1
# 2    b      6    b      1
# 3    a      2    a      0
# 4    a      4    a      0
# 5    a      5    a      0

# merge performs an "inner" join by default, so the result keys are the
# intersection of both sides.  The other options are "left", "right" and
# "outer"; an outer join takes the union of the keys, combining the effect of
# a left and a right join.
print(df1.merge(df2, how='outer'))
# out[]:
#   key  data1  data2
# 0   b    0.0    1.0
# 1   b    1.0    1.0
# 2   b    6.0    1.0
# 3   a    2.0    0.0
# 4   a    4.0    0.0
# 5   a    5.0    0.0
# 6   c    3.0    NaN
# 7   d    NaN    2.0

# Many-to-many joins and multi-key merges (call patterns only; `left` and
# `right` are placeholder frames that are not defined in this script):
#   pd.merge(df1, df2, on='key', how='left')
#   pd.merge(left, right, on=['key1', 'key2'], how='outer')
#   pd.merge(left, right, on='key1')
#   pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

# Merging on the index
# Sometimes the join key of a DataFrame lives in its index.  In that case pass
# left_index=True or right_index=True.

# Concatenating along an axis
# Three Series with pairwise disjoint indexes.
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
print(pd.concat([s1, s2, s3]))
# out[]:
# a    0
# b    1
# c    2
# d    3
# e    4
# f    5
# g    6
# dtype: int64

# concat works along axis=0 by default and produces a new Series.  With
# axis=1 the result becomes a DataFrame (one column per input).
print(pd.concat([s1, s2, s3], axis=1))
# out[]:
#      0    1    2
# a  0.0  NaN  NaN
# b  1.0  NaN  NaN
# c  NaN  2.0  NaN
# d  NaN  3.0  NaN
# e  NaN  4.0  NaN
# f  NaN  NaN  5.0
# g  NaN  NaN  6.0

# s4 shares the labels 'f' and 'g' with s3, which makes the join type visible.
s4 = pd.concat([s1 * 5, s3])
print(s4)
# out[]:
# a    0
# b    5
# f    5
# g    6
# dtype: int64
print(pd.concat([s4, s3], axis=1))
# out[]:
#    0    1
# a  0  NaN
# b  5  NaN
# f  5  5.0
# g  6  6.0

# Pass join='inner' to keep only the intersection of the indexes.
print(pd.concat([s4, s3], axis=1, join='inner'))
# out[]:
#    0  1
# f  5  5
# g  6  6

# To choose the labels used on the other axis, reindex the concatenated
# result.  (The former ``join_axes=`` argument of concat was removed in
# pandas 1.0 and now raises TypeError.)
print(pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e']))
# out[]:
#      0    1
# a  0.0  0.0
# c  NaN  NaN
# b  1.0  5.0
# e  NaN  NaN

# The keys argument builds a hierarchical index on the concatenation axis.
result = pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])
print(result)
# out[]:
# one    a    0
#        b    1
# two    c    2
#        d    3
#        e    4
# three  f    5
#        g    6
# dtype: int64
print(result.unstack())
# out[]:
#          a    b    c    d    e    f    g
# one    0.0  1.0  NaN  NaN  NaN  NaN  NaN
# two    NaN  NaN  2.0  3.0  4.0  NaN  NaN
# three  NaN  NaN  NaN  NaN  NaN  5.0  6.0
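# Title: basic pandas data-processing methods.
# Reposted from blog.csdn.net/qq_38195197/article/details/81138061
# (Web-page navigation residue from the source article: "You may also like",
# "Today's recommendations", "Weekly ranking".)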