一.Pandas
1.Python Data Analysis Library 或 pandas 是基于NumPy 的一种工具
2.http://pandas.pydata.org/
3.pandas中的数据结构:
Series一维数组,只允许存储相同的数据类型;
Time- Series:以时间为索引的Series;
DataFrame:二维的表格型数据结构;
Panel :三维的数组,可以理解为DataFrame的容器
二创建
1 import pandas as pd 2 import numpy as np 3 #创建Series, 4 s=pd.Series([1,3,4,np.nan,44,1])#有序号,有类型 5 print(s) 6 7 #创建dataframe:二维表格 8 #__init__(self, data=None, index=None, columns=None, dtype=None,copy=False) 9 #横行index;竖行columns 10 #通过一个字典来创建DataFrame 11 d={'one':pd.Series([1,3,4],index={'a','b','c'}), 12 'two':pd.Series([1,3,6,8],index={'a','b','c','d'})} 13 #字典的键为行名,值为对应的竖向的值,索引为列名 14 #通过字典创建会产生列的顺序会是随机的 15 df=pd.DataFrame(d) 16 print(df) 17 18 #从字典的列表中创建,其中每个字典代表的是每条记录(一行),而且顺序确定 19 d2=[{'one':1,'two':1},{'one':2,'two':2},{'one':3,'two':3},{'two':4}] 20 df2=pd.DataFrame(d2,index={'a','b','c','d'},columns={'one','two'})#指定索引和列名 21 print(df2) 22 23 d3=df2.to_dict()#把dataframe转为dict,变成一个嵌套的字典 24 print(d3) 25 26 d4={'two': {'b': 1, 'c': 2, 'd': 3, 'a': 4}, 27 'one': {'b': 1.0, 'c': 2.0, 'd': 3.0, 'a':np.nan}} 28 #这样指定字典,创建出来也是唯一的 29 df4=pd.DataFrame(d4) 30 print(df4) 31 ------------------------------------------------------------ 32 0 1.0 33 1 3.0 34 2 4.0 35 3 NaN 36 4 44.0 37 5 1.0 38 dtype: float64 39 one two 40 a 3.0 6 41 b 1.0 1 42 c 4.0 8 43 d NaN 3 44 two one 45 b 1 1.0 46 d 2 2.0 47 a 3 3.0 48 c 4 NaN 49 {'two': {'b': 1, 'd': 2, 'a': 3, 'c': 4}, 'one': {'b': 1.0, 'd': 2.0, 'a': 3.0, 'c': nan}} 50 one two 51 a NaN 4 52 b 1.0 1 53 c 2.0 2 54 d 3.0 3
三.查看数据
1 import numpy as np 2 import pandas as pd 3 dates = pd.date_range('20180507', periods=6)#利用pandas产生日期数据,(开始日期,长度) 4 df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) 5 #生成二维表格,索引是日期,纵列是abcd的列表,数据为随机产生的数 6 #看dataframe:从左到右,从上到下,最左边一列是索引列表,每一条索引表示一条记录 7 print(df) 8 print(df.dtypes)#显示每条纵列的数据类型 9 print(df.head(1))#从表格顶部开始显示表格,默认全部显示 10 print(df.tail(4))#从表格底部开始显示表格,默认全部显示 11 ------------------------------------------------------ 12 A B C D 13 2018-05-07 -1.224660 0.642997 0.513311 0.867978 14 2018-05-08 -0.190702 -0.186858 -1.187651 -0.243407 15 2018-05-09 2.036292 1.720265 0.633207 -0.480980 16 2018-05-10 -2.141022 1.062058 1.118255 0.677325 17 2018-05-11 0.084533 -1.357477 1.135133 0.163912 18 2018-05-12 -1.111821 -1.859636 -1.018877 1.500960 19 A float64 20 B float64 21 C float64 22 D float64 23 dtype: object 24 A B C D 25 2018-05-07 -1.22466 0.642997 0.513311 0.867978 26 A B C D 27 2018-05-09 2.036292 1.720265 0.633207 -0.480980 28 2018-05-10 -2.141022 1.062058 1.118255 0.677325 29 2018-05-11 0.084533 -1.357477 1.135133 0.163912 30 2018-05-12 -1.111821 -1.859636 -1.018877 1.500960
1 import numpy as np 2 import pandas as pd 3 dates = pd.date_range('20180507', periods=6)#利用pandas产生日期数据,(开始日期,长度) 4 df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) 5 print(df.index)#显示索引相关信息 6 print(df.columns)#显示纵列相关信息 7 print(df.values)#显示表格数据 8 ----------------------------------------------------------- 9 DatetimeIndex(['2018-05-07', '2018-05-08', '2018-05-09', '2018-05-10', 10 '2018-05-11', '2018-05-12'], 11 dtype='datetime64[ns]', freq='D') 12 Index(['A', 'B', 'C', 'D'], dtype='object') 13 [[ 0.7783973 -0.85917061 -0.80297609 0.19146651] 14 [ 0.17786108 0.05542154 0.26696242 0.85977364] 15 [-0.90406197 0.86919366 -0.19532951 -1.51333716] 16 [ 1.17939517 -0.71215508 0.6994515 0.14105961] 17 [-1.40057553 -0.19872321 1.04690513 -0.02428399] 18 [-1.33587412 1.27565206 -1.22414705 -0.31345343]]
1 import numpy as np 2 import pandas as pd 3 dates = pd.date_range('20180507', periods=6)#利用pandas产生日期数据,(开始日期,长度) 4 df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) 5 print(df.describe())#显示图标数据的一些统计摘要,比如平均值,方差等 6 print(df.T)#转置整个表格 7 print(df.sort_index(axis=1,ascending=False))#按index排序,以列名称进行排序,以倒的顺序排序 8 print(df.sort_values(by='C'))#指定某一属性,按值从小到大把整个列表排序 9 -------------------------------------------------------------------- 10 A B C D 11 count 6.000000 6.000000 6.000000 6.000000 12 mean 0.322268 0.380395 -0.007915 -0.074197 13 std 0.747649 1.309477 1.065533 1.122326 14 min -0.581089 -1.056308 -1.277190 -1.683641 15 25% -0.140576 -0.432361 -0.891893 -0.807084 16 50% 0.292569 -0.052893 0.099397 0.057339 17 75% 0.573371 1.229704 0.639316 0.812405 18 max 1.547544 2.346076 1.433943 1.154910 19 2018-05-07 2018-05-08 2018-05-09 2018-05-10 2018-05-11 2018-05-12 20 A 0.558207 -0.196412 0.578425 0.026930 1.547544 -0.581089 21 B 0.241584 -1.056308 1.559078 -0.347370 -0.460691 2.346076 22 C -0.283660 0.691603 -1.277190 0.482454 -1.094637 1.433943 23 D -0.933690 -1.683641 0.902558 -0.427266 0.541944 1.154910 24 D C B A 25 2018-05-07 -0.933690 -0.283660 0.241584 0.558207 26 2018-05-08 -1.683641 0.691603 -1.056308 -0.196412 27 2018-05-09 0.902558 -1.277190 1.559078 0.578425 28 2018-05-10 -0.427266 0.482454 -0.347370 0.026930 29 2018-05-11 0.541944 -1.094637 -0.460691 1.547544 30 2018-05-12 1.154910 1.433943 2.346076 -0.581089 31 A B C D 32 2018-05-09 0.578425 1.559078 -1.277190 0.902558 33 2018-05-11 1.547544 -0.460691 -1.094637 0.541944 34 2018-05-07 0.558207 0.241584 -0.283660 -0.933690 35 2018-05-10 0.026930 -0.347370 0.482454 -0.427266 36 2018-05-08 -0.196412 -1.056308 0.691603 -1.683641 37 2018-05-12 -0.581089 2.346076 1.433943 1.154910
四.选择数据
1 import numpy as np 2 import pandas as pd 3 4 dates = pd.date_range('20180507', periods=6) 5 df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) 6 print(df['A']) # 获取指定纵行数据,按键索引,等同于print(df.A) 7 print(df[0:3]) # 获取指定索引数据,按序索引,等同于print(df['20180507':'20180509']) 8 #按标签选择数据loc 9 print(df.loc[dates[0]])#选择index列表中第一个元素对应的一组数据,相当于print(df.loc['20180507']) 10 print(df.loc[:, ['A', 'B']])#选择(所有index,a和b两组)的数据 11 print(df.loc['20180509':'20180512', ['A', 'B']]) 12 #选择(index:'20180509':'20180512',columns:['A', 'B'])的数据 13 -------------------------------------------------------- 14 2018-05-07 1.199961 15 2018-05-08 -0.336282 16 2018-05-09 0.033081 17 2018-05-10 -0.949975 18 2018-05-11 0.155855 19 2018-05-12 -0.901445 20 Freq: D, Name: A, dtype: float64 21 A B C D 22 2018-05-07 1.199961 0.190652 1.412162 -1.005569 23 2018-05-08 -0.336282 1.445801 -0.584581 -0.101581 24 2018-05-09 0.033081 -0.069119 -0.545210 -0.339678 25 A 1.199961 26 B 0.190652 27 C 1.412162 28 D -1.005569 29 Name: 2018-05-07 00:00:00, dtype: float64 30 A B 31 2018-05-07 1.199961 0.190652 32 2018-05-08 -0.336282 1.445801 33 2018-05-09 0.033081 -0.069119 34 2018-05-10 -0.949975 0.557865 35 2018-05-11 0.155855 -1.079446 36 2018-05-12 -0.901445 1.649649 37 A B 38 2018-05-09 0.033081 -0.069119 39 2018-05-10 -0.949975 0.557865 40 2018-05-11 0.155855 -1.079446 41 2018-05-12 -0.901445 1.649649
1 import numpy as np 2 import pandas as pd 3 dates = pd.date_range('20180507', periods=6) 4 df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) 5 print(df) 6 #按位置选择iloc(索引序号进行选择) 7 print(df.iloc[3])#选择index列表中索引为3的数据 8 print(df.iloc[3:5, 0:2])#选择(index:3:5;columns:0:2)的数据 9 print(df.iloc[[1, 2, 4], [0, 2]])#(index:1,2,3)(columns:0,2) 10 print(df.iloc[1:3, :])#index:1:3;columns:所有 11 print(df.iloc[1, 1])#index:1;columns:1 12 ------------------------------------------------ 13 A B C D 14 2018-05-07 0.201876 -1.442301 1.465751 0.640606 15 2018-05-08 -0.775345 -0.534636 -0.091328 1.228146 16 2018-05-09 0.975878 1.260147 -0.006231 0.335106 17 2018-05-10 -0.520035 -1.354576 -1.364484 0.276557 18 2018-05-11 0.726874 0.223242 -0.037863 -1.681729 19 2018-05-12 0.550026 2.378874 -0.973442 1.530179 20 A -0.520035 21 B -1.354576 22 C -1.364484 23 D 0.276557 24 Name: 2018-05-10 00:00:00, dtype: float64 25 A B 26 2018-05-10 -0.520035 -1.354576 27 2018-05-11 0.726874 0.223242 28 A C 29 2018-05-08 -0.775345 -0.091328 30 2018-05-09 0.975878 -0.006231 31 2018-05-11 0.726874 -0.037863 32 A B C D 33 2018-05-08 -0.775345 -0.534636 -0.091328 1.228146 34 2018-05-09 0.975878 1.260147 -0.006231 0.335106 35 -0.534635999017828
1 import numpy as np 2 import pandas as pd 3 dates = pd.date_range('20180507', periods=6) 4 df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) 5 print(df) 6 #综合索引 7 print(df.ix[:3, ['A', 'C']])#index:0:3;columns:'A','C' 8 #布尔索引 9 print(df[df.A > 0])#条件筛选 10 ---------------------------------------------------- 11 A B C D 12 2018-05-07 0.383763 0.177597 0.445263 0.873108 13 2018-05-08 0.171476 0.549246 0.531994 -1.414907 14 2018-05-09 -1.008178 0.927677 0.774119 -0.058670 15 2018-05-10 -0.500451 -0.881271 -0.576227 0.876132 16 2018-05-11 -1.289566 -0.351046 0.765377 0.168464 17 2018-05-12 0.676490 -0.806772 0.194579 -0.205643 18 A C 19 2018-05-07 0.383763 0.445263 20 2018-05-08 0.171476 0.531994 21 2018-05-09 -1.008178 0.774119 22 A B C D 23 2018-05-07 0.383763 0.177597 0.445263 0.873108 24 2018-05-08 0.171476 0.549246 0.531994 -1.414907 25 2018-05-12 0.676490 -0.806772 0.194579 -0.205643
五.修改,添加数据
1 import numpy as np 2 import pandas as pd 3 dates = pd.date_range('20180507', periods=6) 4 df = pd.DataFrame(np.arange(24).reshape(6,4), index=dates, columns=list('ABCD')) 5 print(df) 6 #通过索引找到数据,再直接赋值修改数据 7 df.iloc[2,2]=1111 8 df.loc['20180509','B']=1111 9 df[df.A>4]=0 10 df['E']=np.nan#添加空列E 11 #添加新的一列数据,要对齐 12 df['E']=pd.Series([1,2,3,4,5,6],index=pd.date_range('20180507',periods=6)) 13 print(df) 14 ------------------------------------------------------- 15 A B C D 16 2018-05-07 0 1 2 3 17 2018-05-08 4 5 6 7 18 2018-05-09 8 9 10 11 19 2018-05-10 12 13 14 15 20 2018-05-11 16 17 18 19 21 2018-05-12 20 21 22 23 22 A B C D E 23 2018-05-07 0 1 2 3 1 24 2018-05-08 4 5 6 7 2 25 2018-05-09 0 0 0 0 3 26 2018-05-10 0 0 0 0 4 27 2018-05-11 0 0 0 0 5 28 2018-05-12 0 0 0 0 6
六.处理丢失数据
1 import numpy as np 2 import pandas as pd 3 dates = pd.date_range('20180507', periods=3) 4 df = pd.DataFrame(np.arange(12).reshape(3,4), index=dates, columns=list('ABCD')) 5 #np.nan表示丢失的数据,默认不包含计算中 6 df.ix[1,'C']=np.nan 7 print(df) 8 #删除对应数据 9 print(df.dropna(axis=0,how='any'))#删除行:行中只要有一个丢失数据就删除 10 print(df.dropna(axis=1,how='all'))#删除列:列中所有数据都是丢失数据就删除 11 #填充对应数据 12 print(df.fillna(value=0))#在丢失数据上把nan变为0 13 #检查是否确实数据 14 print(df.isnull())#print(df.isna()) 15 ----------------------------------------------------------- 16 A B C D 17 2018-05-07 0 1 2.0 3 18 2018-05-08 4 5 NaN 7 19 2018-05-09 8 9 10.0 11 20 A B C D 21 2018-05-07 0 1 2.0 3 22 2018-05-09 8 9 10.0 11 23 A B C D 24 2018-05-07 0 1 2.0 3 25 2018-05-08 4 5 NaN 7 26 2018-05-09 8 9 10.0 11 27 A B C D 28 2018-05-07 0 1 2.0 3 29 2018-05-08 4 5 0.0 7 30 2018-05-09 8 9 10.0 11 31 A B C D 32 2018-05-07 False False False False 33 2018-05-08 False False True False 34 2018-05-09 False False False False
七.文件读取和保存
解决乱码:https://blog.csdn.net/leonzhouwei/article/details/8447643
选择格式时,选择csv utf-8(逗号分隔)
CSV即Comma Separate Values,这种文件格式经常用来作为不同程序之间的数据交互的格式;CSV是以逗号间隔的文本文件
1 import pandas as pd 2 data=pd.read_csv('foo.csv') 3 print(data) 4 data.to_excel('foo.xlsx') 5 ----------------------------------------------- 6 jpdfpa 7 0 dafhpah 8 1 adfalln 9 2 '活动'
八.合并
1 import numpy as np 2 import pandas as pd 3 #concatenating 4 df1=pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d']) 5 df2=pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d']) 6 print('df1:',df1) 7 print('df2:',df2) 8 res=pd.concat([df1,df2],axis=0,ignore_index=True)#合并,index列表添加(上下合并),忽略以前的index 9 print('res:',res) 10 11 #join['inner','outer'] 12 df3=pd.DataFrame(np.ones((3,4))*3,columns=['b','c','d','e']) 13 print('df3:',df3) 14 res2=pd.concat([df1,df3],join='outer')#合并,index列表添加,columns列表并集 15 res3=pd.concat([df1,df3],join='inner')#合并,index列表添加,columns列表交集 16 print('res2:',res2) 17 print('res3:',res3) 18 19 #join_axes 20 res4=pd.concat([df1,df3],axis=1,join_axes=[df1.index]) 21 #合并,columns列表添加(左右合并),以df1的index进行处理 22 print('res4:',res4) 23 24 #append 25 res5=df1.append(df2,ignore_index=True)#把一个表添加到另一个表中,向下添加, 26 res6=df1.append([df2,df3])#把两个表添加到另一个表中,向下添加, 27 print('res5:',res5) 28 print('res6:',res6) 29 s1=pd.Series([1,2,3,4],index=['a','b','c','d']) 30 res7=df1.append(s1,ignore_index=True)#添加一个index 31 print('res7:',res7) 32 -------------------------------------------------------------- 33 df1: a b c d 34 0 1.0 1.0 1.0 1.0 35 1 1.0 1.0 1.0 1.0 36 2 1.0 1.0 1.0 1.0 37 df2: a b c d 38 0 2.0 2.0 2.0 2.0 39 1 2.0 2.0 2.0 2.0 40 2 2.0 2.0 2.0 2.0 41 res: a b c d 42 0 1.0 1.0 1.0 1.0 43 1 1.0 1.0 1.0 1.0 44 2 1.0 1.0 1.0 1.0 45 3 2.0 2.0 2.0 2.0 46 4 2.0 2.0 2.0 2.0 47 5 2.0 2.0 2.0 2.0 48 df3: b c d e 49 0 3.0 3.0 3.0 3.0 50 1 3.0 3.0 3.0 3.0 51 2 3.0 3.0 3.0 3.0 52 res2: a b c d e 53 0 1.0 1.0 1.0 1.0 NaN 54 1 1.0 1.0 1.0 1.0 NaN 55 2 1.0 1.0 1.0 1.0 NaN 56 0 NaN 3.0 3.0 3.0 3.0 57 1 NaN 3.0 3.0 3.0 3.0 58 2 NaN 3.0 3.0 3.0 3.0 59 res3: b c d 60 0 1.0 1.0 1.0 61 1 1.0 1.0 1.0 62 2 1.0 1.0 1.0 63 0 3.0 3.0 3.0 64 1 3.0 3.0 3.0 65 2 3.0 3.0 3.0 66 res4: a b c d b c d e 67 0 1.0 1.0 1.0 1.0 3.0 3.0 3.0 3.0 68 1 1.0 1.0 1.0 1.0 3.0 3.0 3.0 3.0 69 2 1.0 1.0 1.0 1.0 3.0 3.0 3.0 3.0 70 res5: a b c d 71 0 1.0 1.0 1.0 1.0 72 1 1.0 1.0 1.0 1.0 73 2 1.0 1.0 1.0 1.0 74 3 2.0 2.0 2.0 2.0 75 4 2.0 2.0 2.0 2.0 76 5 2.0 2.0 2.0 2.0 77 res6: a b c d e 78 0 1.0 1.0 1.0 1.0 NaN 79 1 1.0 1.0 1.0 1.0 NaN 80 2 1.0 1.0 1.0 1.0 NaN 81 0 2.0 2.0 2.0 2.0 NaN 82 1 2.0 2.0 2.0 2.0 NaN 83 2 2.0 2.0 2.0 2.0 NaN 84 0 NaN 3.0 3.0 3.0 3.0 85 1 NaN 3.0 3.0 3.0 3.0 86 2 NaN 3.0 3.0 3.0 3.0 87 res7: a b c d 88 0 1.0 1.0 1.0 1.0 89 1 1.0 1.0 1.0 1.0 90 2 1.0 1.0 1.0 1.0 91 3 1.0 2.0 3.0 4.0
1 #merge 2 import numpy as np 3 import pandas as pd 4 left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]}) 5 right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]}) 6 print(left) 7 print(right) 8 res=pd.merge(left,right,on='key')#基于相同columns=‘key’进行合并 9 print(res) 10 # res2=pd.merge(left,right,on=['key1','key2'])合并多个key 11 #how=['left','right','outer','inner']合并的方式:基于左边的表进行填充,右边的表进行填充,并集,交集 12 df1=pd.DataFrame({'coll':[0,1],'col_left':['a','b']}) 13 df2=pd.DataFrame({'coll':[1,2,2],'col_left':[2,2,2]}) 14 print(df1) 15 print(df2) 16 res2=pd.merge(df1,df2,on='coll',how='outer',indicator=True)#indicator显示merge方式 17 print(res2) 18 #left_index和right_index:是否考虑左边的index和右边的index,值有True或False 19 #suffixes:合并时,给一样的columns,不一样的数据,添加标记进行区分 20 boy=pd.DataFrame({'k':['k1','k2'],'age':[1,2]}) 21 girl=pd.DataFrame({'k':['k1','k2'],'age':[3,4]}) 22 res3=pd.merge(boy,girl,on='k',suffixes=['_boy','_girl'],how='inner') 23 print(res3) 24 ---------------------------------------------------------------- 25 key lval 26 0 foo 1 27 1 bar 2 28 key rval 29 0 foo 4 30 1 bar 5 31 key lval rval 32 0 foo 1 4 33 1 bar 2 5 34 col_left coll 35 0 a 0 36 1 b 1 37 col_left coll 38 0 2 1 39 1 2 2 40 2 2 2 41 col_left_x coll col_left_y _merge 42 0 a 0 NaN left_only 43 1 b 1 2.0 both 44 2 NaN 2 2.0 right_only 45 3 NaN 2 2.0 right_only 46 age_boy k age_girl 47 0 1 k1 3 48 1 2 k2 4
九.运算
1 import pandas as pd 2 import numpy as np 3 data={'one':{'a':1,'b':2,'c':3,'d':np.nan}, 4 'two': {'a': 2.0, 'b': 3.0, 'c': 4.0, 'd': 5.0}} 5 df=pd.DataFrame(data,index=['a','b','c','d'],columns=['one','two']) 6 print('df:',df) 7 #根据原有的列添加新列 8 df['three']=df['one']*10+df['two'] 9 print(df) 10 #求均值 11 print('mean:\n',df.mean(1))#参数1,计算行均值;默认0,按列求均值, 12 print('sum:\n',df.sum(1))#参数1,计算行和;默认0,按列求和 13 print('函数:\n',df.apply(lambda x:x.max()-x.min()))#将一个函数应用到dataframe的每一列 14 #设置索引 15 df.set_index('one') 16 #重命名列 17 df.rename(columns={'one':'1'},inplace=True) 18 print(df) 19 ---------------------------------------------------- 20 df: one two 21 a 1.0 2.0 22 b 2.0 3.0 23 c 3.0 4.0 24 d NaN 5.0 25 one two three 26 a 1.0 2.0 12.0 27 b 2.0 3.0 23.0 28 c 3.0 4.0 34.0 29 d NaN 5.0 NaN 30 mean: 31 a 5.000000 32 b 9.333333 33 c 13.666667 34 d 5.000000 35 dtype: float64 36 sum: 37 a 15.0 38 b 28.0 39 c 41.0 40 d 5.0 41 dtype: float64 42 函数: 43 one 2.0 44 two 3.0 45 three 22.0 46 dtype: float64 47 1 two three 48 a 1.0 2.0 12.0 49 b 2.0 3.0 23.0 50 c 3.0 4.0 34.0 51 d NaN 5.0 NaN
1 import pandas as pd 2 import numpy as np 3 data={'one':{'a':1,'b':2,'c':3,'d':np.nan}, 4 'two': {'a': 2.0, 'b': 3.0, 'c': 4.0, 'd': 5.0}} 5 df=pd.DataFrame(data,index=['a','b','c','d'],columns=['one','two']) 6 print('df:',df) 7 #查看最大最小值即其对应index 8 print(df['two'].max())#min取最小 9 print(df['two'].idxmax())#idmin取最小 10 #重设索引 11 df.reset_index(inplace=True) 12 print(df) 13 #改变数据类型 14 print(df.dtypes) 15 df1=df[['two',]].astype('int32') 16 print(df1.dtypes) 17 #计算频率 18 print(df['index'].value_counts()) 19 #字符方法 20 print(df['index'].str.upper())#大写 21 print(df['index'].str.len())#长度 22 print(df['index'].str.contains('s'))#包含 23 #。。。。 24 ------------------------------------------------------------- 25 df: one two 26 a 1.0 2.0 27 b 2.0 3.0 28 c 3.0 4.0 29 d NaN 5.0 30 5.0 31 d 32 index one two 33 0 a 1.0 2.0 34 1 b 2.0 3.0 35 2 c 3.0 4.0 36 3 d NaN 5.0 37 index object 38 one float64 39 two float64 40 dtype: object 41 two int32 42 dtype: object 43 c 1 44 d 1 45 b 1 46 a 1 47 Name: index, dtype: int64 48 0 A 49 1 B 50 2 C 51 3 D 52 Name: index, dtype: object 53 0 1 54 1 1 55 2 1 56 3 1 57 Name: index, dtype: int64 58 0 False 59 1 False 60 2 False 61 3 False 62 Name: index, dtype: bool
十.可视化数据
1 import pandas as pd 2 import numpy as np 3 import matplotlib.pyplot as plt 4 # data=pd.Series(np.random.randn(1000),index=np.arange(1000))#随机生成数据 5 # data=data.cumsum()#把生成的数据进行累加 6 # data.plot()#画图 7 # plt.show()#显示图 8 9 data=pd.DataFrame(np.random.randn(1000,4), 10 index=np.arange(1000), 11 columns=list('ABCD')) 12 data=data.cumsum()#把生成的数据进行累加 13 data.plot()#画图 14 plt.show()#显示图 15 -------------------------------------
1 data=pd.DataFrame(np.random.randn(1000,4), 2 index=np.arange(1000), 3 columns=list('ABCD')) 4 data=data.cumsum() 5 ax=data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class 1') 6 data.plot.scatter(x='A',y='C',color='DarkGreen',label='Class 2',ax=ax) 7 plt.show() 8 -----------------------------------------
十.总结
1.pandas中index表示行(列表),对应axis=0行操作
columns表示列(列表),对应axis=1列操作
通过字典创建dataframe,创建的图表:最左边是index列表,从上到下;最上面是columns列表,从左到右;中间是字典数据,每一个数据对应相应的index和columns
ix(index,columns):既可以传入索引,也可以传入切片