# -*- coding: utf-8 -*-
# author : zhangwei
"""Study notes on pandas, written as a runnable script.

Every topic except the last is kept as a commented-out example so that a
single section can be uncommented and run on its own.  The only live code
is the final plotting example: it builds a 1000x4 random-walk DataFrame
and shows two scatter plots drawn on the same axes.

The commented examples are written with ``print(...)`` calls.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Creating a Series / DataFrame with pandas
# ---------------------------------------------------------------------------
# s = pd.Series([1, 2, 3, np.nan, 44, 1])
# print(s)
#
# dates = pd.date_range('20180603', periods=6)
# # print(dates)
# # Define a DataFrame with row labels (index) and column names.
# df = pd.DataFrame(np.random.rand(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
# # df1 = pd.DataFrame(np.arange(12).reshape([3, 4]))
# # print(df1.dtypes)      # print the dtypes
# # print(df1.index)       # print the index labels
# # print(df1.values)      # print the underlying values
# # print(df1.describe())  # print summary statistics (mean etc.) of the numeric data
# # print(df1.T)
# # Sort by index: ascending=False gives descending order; axis=0 works on
# # rows, axis=1 works on columns.
# print(df.sort_index(axis=0, ascending=False))
# # Sort by the values of column 'b'.
# print(df.sort_values(by='b', ascending=False))

# ---------------------------------------------------------------------------
# Selecting data
# ---------------------------------------------------------------------------
# dates = pd.date_range('20180603', periods=6)
# df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
# print(df)
# print(df['A'], df.A)                   # select column A, two equivalent ways
# print(df[0:3])                         # rows 0 up to 3; the right end is exclusive
# print(df['20180603':'20180605'])       # slice rows by their index labels
# print(df.loc['20180604'])              # select by label with loc
# # print(df.loc[:, ['A', 'B']])         # every row of columns A and B
# print(df.loc['20180604', ['A', 'B']])  # select by row and column together
# print(df.iloc[3:5, 1:3])               # rows at positions 3-4, columns at positions 1-2
# print(df.iloc[[1, 3, 4], 1:3])         # pick non-contiguous rows
# # ix selects with a mix of labels and row positions.
# # NOTE(review): .ix was removed in pandas 1.0 - use .loc / .iloc instead.
# print(df.ix[:3, 'A':'C'])
# print(df.A > 8)                        # boolean mask: which values of column A are > 8

# ---------------------------------------------------------------------------
# Setting values
# ---------------------------------------------------------------------------
# dates = pd.date_range('20180603', periods=6)
# df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
# print(df)
# # df.iloc[2, 2] = 111          # set the element at row position 2, column position 2
# # df.loc['20180604', 'B'] = 7  # modify a single element by label with loc
# # df[df.A > 4] = 0             # zero out the rows whose A value is > 4
# # df.A[df.A > 4] = 0           # same condition, but only column A is modified
# df['F'] = np.nan               # add a new column
# # Add a new column and assign its values (aligned on the date index).
# df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20180603', periods=6))
# print(df)

# ---------------------------------------------------------------------------
# Handling missing values
# ---------------------------------------------------------------------------
# dates = pd.date_range('20180603', periods=6)
# df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
# df.iloc[0, 1] = np.nan
# df.iloc[1, 2] = np.nan
# print(df)
# print(df.dropna())                    # drop every row that contains a NaN
# # how={'any', 'all'}: 'all' drops only when every value is NaN, 'any' drops
# # as soon as one value is NaN; axis=0 / 1 selects rows / columns.
# print(df.dropna(axis=1, how='any'))
# print(df.fillna(value=1))             # fill NaN with the given value
# print(df.isnull(), df.isna())         # test for missing values; returns booleans

# ---------------------------------------------------------------------------
# Reading and saving files: csv, pickle
# ---------------------------------------------------------------------------
# data = pd.read_csv('/home/zhangwei/student.csv')  # read a CSV file
# print(data)
# data.to_pickle('/home/zhangwei/student.pickle')   # save in Python's pickle format

# ---------------------------------------------------------------------------
# Combining several DataFrames: concatenating
# ---------------------------------------------------------------------------
# df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
# df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
# df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
# df4 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
# df5 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
# # print(df1, df2, df3)
# # Concatenate: axis=0 stacks rows; ignore_index renumbers the index.
# # res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
# # print(df4, df5)
# # join in {'inner', 'outer'}: 'outer' fills the missing parts with NaN,
# # 'inner' trims the result to the parts both frames share.
# # res = pd.concat([df4, df5], axis=0, join='inner', ignore_index=True)
# # join_axes aligns the result to the index given in the list.
# # NOTE(review): join_axes was removed in pandas 1.0 - reindex the result instead.
# # res = pd.concat([df4, df5], axis=1, join_axes=[df4.index])
# # append adds data to a frame.
# # NOTE(review): DataFrame.append was removed in pandas 2.0 - use pd.concat.
# # res = df4.append([df3, df5], ignore_index=True)
# s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# res = df4.append(s1, ignore_index=True)  # append a Series as a new row
# print(res)

# ---------------------------------------------------------------------------
# Merging DataFrames with merge
# ---------------------------------------------------------------------------
# left = pd.DataFrame({'key1': ['k0', 'k1', 'k0', 'k1'],
#                      'key2': ['k0', 'k0', 'k1', 'k1'],
#                      'A': ['A0', 'A1', 'A2', 'A3'],
#                      'B': ['B0', 'B1', 'B2', 'B3']})
# right = pd.DataFrame({'key1': ['k0', 'k1', 'k1', 'k0'],
#                       'key2': ['k0', 'k1', 'k1', 'k1'],
#                       'C': ['C0', 'C1', 'C2', 'C3'],
#                       'D': ['D0', 'D1', 'D2', 'D3']})
# print(left, right)
# # Merge on a single key column.
# # NOTE(review): the frames above define 'key1'/'key2', not 'key'; this line
# # appears to belong to an earlier single-key version of the example.
# res = pd.merge(left, right, on='key')
# print(left, right)
# # how in {inner, outer, left, right}: 'left' keeps the keys of the left
# # frame; indicator adds a column showing which side each row came from.
# res = pd.merge(left, right, on=['key1', 'key2'], how='outer', indicator='indicator_column')
# print(res)

# ---------------------------------------------------------------------------
# Handling overlapping column names
# ---------------------------------------------------------------------------
# boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'],
#                      'ages': [1, 2, 3]})
# girls = pd.DataFrame({'k': ['K0', 'K1', 'K2'],
#                       'ages': [4, 5, 6]})
# print(boys, girls)
# # suffixes tells the overlapping 'ages' columns of boys and girls apart.
# res = pd.merge(boys, girls, on='k', how='outer', suffixes=['_boys', '_girls'])
# print(res)

# ---------------------------------------------------------------------------
# Plotting data
# ---------------------------------------------------------------------------
# data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# data = data.cumsum()  # cumulative sum of the data
# # Line plot: with pandas there is no need to define the x/y axes; plot()
# # accepts many parameters, such as the line colour and line width.
# data.plot()
# plt.show()

# 1000 rows of four values each.  Note that the dimensions are passed to
# randn in parentheses as plain arguments, not in square brackets.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
)
# Turn the independent random draws into running totals, column by column.
data = data.cumsum()
# print(data.head(8))  # show the first 8 rows
# data.plot()
# plt.show()

# plot methods: bar, hist, box, kde, area, scatter, hexbin, ...
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
# Passing ax=ax draws the second scatter plot on the same axes as the first.
data.plot.scatter(x='A', y='C', color='Darkgreen', label='Class2', ax=ax)
plt.show()
# Python study notes: pandas
# You may also like
# Reposted from blog.csdn.net/xwei1226/article/details/80566808
# Today's recommendations
# Weekly ranking