本篇介绍以下几个知识点 :
(1)选中某一批特定的数据,如在DataFrame中
(2)为选中的区域赋上另一批新值
(3)处理nan这种非数值量:删除、改写,或检查是否存在
1.选中某一批特定的数据,如在DataFrame中
1.1:select by label loc
## select by label:loc
#print(df.loc['20130102'])
#print(df.loc[:,['A','B']])#所有行,AB列
#print(df.loc['20130102',['A','B']])#20130102行,AB列
1.2:select by position iloc
#select by position:iloc
#print(df.iloc[3,1])#取第四行第二列的数
#print(df.iloc[1:3,1:2])#取第二行到第三行、第二列的数(切片不包含末端位置)
#print(df.iloc[[1,3,4],1:2])#选择特定的行的数
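1.3:mixed selection,loc与iloc综合筛选 ix(注意:ix 自 pandas 0.20 起被弃用,并在 pandas 1.0 中移除;新版本中可改写为 df.iloc[:3][['A','C']] 或 df.loc[df.index[:3],['A','C']])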
#mixed selection:ix,loc与iloc综合筛选
print(df.ix[:3,['A','C']])
运行结果:
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
A C
2013-01-01 0 2
2013-01-02 4 6
2013-01-03 8 10
1.4:Boolean indexing
#Boolean indexing
#print(df[df.A>8])
运行结果:
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
A B C D
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
2.为选中的区域赋上另一批新值
2.1: iloc
df.iloc[2,2] = 111 #iloc只接受整数位置;按标签的等价写法是 df.loc['20130103','C'] = 111
print(df)
运行结果:
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 111 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
2.2: 布尔
### df[df.A>4] = 0 #A列大于4的那些行,整行所有列都置为0
### df.B[df.A>4] = 0 #只把这些行的B列置为0;这是链式赋值,推荐写成 df.loc[df.A>4,'B'] = 0
2.3:加新的一列赋值
df['F'] = np.nan
df['E'] = pd.Series([1,2,3,4,5,6],index = pd.date_range('20130101',periods = 6))
print(df)
3. 处理nan这种非数值量,删除、改写、检查有没有nan这种非数值量
3.1:处理nan
#print(df.dropna(axis = 1,how = 'any')) #axis=1按列丢弃(axis=0按行);how='any'只要含有nan就丢弃,how='all'全部为nan才丢弃
# print(df.fillna(value = 0)) #使用其他数值来填充nan
3.2:检查nan是否存在
#print(df.isnull()) #逐个元素检查是否为nan;数据量大时不容易从输出中看出结果
print(np.any(df.isnull())==True) #整体检查是否存在
附上本次学习原始代码:
## 8.8
# Selecting regions of a DataFrame.

# Push any earlier console output out of view before the demo starts.
print("\n" * 55)
import pandas as pd
import numpy as np

# Six consecutive days starting 2013-01-01 serve as the row labels.
dates = pd.date_range('20130101', periods=6)

# 6 x 4 grid holding the integers 0..23, one row per date.
_grid = np.arange(24).reshape(6, 4)
df = pd.DataFrame(data=_grid, index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
print("\n" * 2)

# Key access and attribute access return the same column.
print(df['A'], df.A)

# Row slicing, by position or by date label:
# print(df[0:3], df['20130102':'20130104'])

## Select by label: loc
# print(df.loc['20130102'])
# print(df.loc[:, ['A', 'B']])            # all rows, columns A and B
# print(df.loc['20130102', ['A', 'B']])   # row 2013-01-02, columns A and B

## Select by position: iloc
# print(df.iloc[3, 1])             # 4th row, 2nd column
# print(df.iloc[1:3, 1:2])         # rows 2-3, 2nd column only (slice end is exclusive)
# print(df.iloc[[1, 3, 4], 1:2])   # rows 2, 4 and 5, 2nd column

## Mixed selection: ix (combined label- and position-based lookup)
# NOTE: .ix was removed in pandas 1.0; on current pandas the equivalent
# is df.iloc[:3][['A', 'C']].
# print(df.ix[:3, ['A', 'C']])

## Boolean indexing
# print(df[df.A > 8])

### Four selection methods in total.
# Assign new values to a selected region.
# NOTE: everything between the triple quotes below is a bare string literal,
# so this section is disabled and does not execute when the script runs.
# NOTE(review): the trailing comment inside it passes string labels to .iloc;
# label-based access would need .loc instead -- fix before re-enabling.
'''
print("\n"*55)
import pandas as pd
import numpy as np
dates = pd.date_range('20130101',periods = 6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index = dates,columns = ['A','B','C','D'])
print(df)
print('\n'*2)
df.iloc[2,2] = 111 #df.iloc['20130101','B'] = 111
# print(df)
# df[df.A>4] = 0
# df.B[df.A>4] = 0
df['F'] = np.nan
df['E'] = pd.Series([1,2,3,4,5,6],index = pd.date_range('20130101',periods = 6))
print(df)
'''
# Handle NaN data: drop it, fill it, or check whether any is present.
# NOTE: everything between the triple quotes below is a bare string literal,
# so this section is disabled and does not execute when the script runs.
# It rebuilds the same 6x4 frame, sets two cells to NaN, and prints whether
# any value in the frame is null.
'''
print("\n"*55)
import pandas as pd
import numpy as np
dates = pd.date_range('20130101',periods = 6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index = dates,columns = ['A','B','C','D'])
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
print("\n"*2)
#print(df.dropna(axis = 1,how = 'any')) #how = 'all',丢弃
# print(df.fillna(value = 0)) #填充
#print(df.isnull()) #是否有丢失
print(np.any(df.isnull())==True) #是否有丢失
'''