Pandas学习(一)

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u013817676/article/details/78995210
import pandas as pd
import numpy as np
#创建序列
s = pd.Series([1,2,3,4,5,np.nan,22,33])
print s
0     1.0
1     2.0
2     3.0
3     4.0
4     5.0
5     NaN
6    22.0
7    33.0
dtype: float64
#创建DataFrame
#创建一个时间序列
datas = pd.date_range("20180107",periods=6)
print datas
DatetimeIndex(['2018-01-07', '2018-01-08', '2018-01-09', '2018-01-10',
               '2018-01-11', '2018-01-12'],
              dtype='datetime64[ns]', freq='D')
#指定行列
df = pd.DataFrame(np.random.randn(6,4),index = datas,columns=['a','b','c','d'])
print df
                   a         b         c         d
2018-01-07 -0.005483 -0.006776  1.410138 -0.447714
2018-01-08 -0.070957  1.444326 -0.152151 -0.129219
2018-01-09  0.371654 -0.052031  1.520965  0.697403
2018-01-10 -0.940761  0.339332  0.305061 -1.076497
2018-01-11  0.854794 -0.477876 -0.776903  1.134447
2018-01-12  0.906029 -0.493668 -1.983745 -1.260175
#默认DataFrame
df1 = pd.DataFrame(np.arange(24).reshape((6,4)))
print df1
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
5  20  21  22  23
#根据字典定义
df2 = pd.DataFrame({'A':1.,
                   'B':pd.Timestamp('20180107'),
                   'C':pd.Series(1,index = list(range(4))),
                   'D':np.array([3]*4),
                   'E':pd.Categorical(['me','he','she','it'])})
print df2
     A          B  C  D    E
0  1.0 2018-01-07  1  3   me
1  1.0 2018-01-07  1  3   he
2  1.0 2018-01-07  1  3  she
3  1.0 2018-01-07  1  3   it
df2.dtypes
A           float64
B    datetime64[ns]
C             int64
D             int64
E          category
dtype: object
#行、列详细信息
df2.index
df2.columns
Index([u'A', u'B', u'C', u'D', u'E'], dtype='object')
df2.values
array([[1.0, Timestamp('2018-01-07 00:00:00'), 1, 3, 'me'],
       [1.0, Timestamp('2018-01-07 00:00:00'), 1, 3, 'he'],
       [1.0, Timestamp('2018-01-07 00:00:00'), 1, 3, 'she'],
       [1.0, Timestamp('2018-01-07 00:00:00'), 1, 3, 'it']], dtype=object)
#计算基本信息
df2.describe()
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
#根据行或列进行排序
print df2.sort_index(axis=1,ascending=False)
     E  D  C          B    A
0   me  3  1 2018-01-07  1.0
1   he  3  1 2018-01-07  1.0
2  she  3  1 2018-01-07  1.0
3   it  3  1 2018-01-07  1.0
#根据value进行排序
print df2.sort_values(by='E')
     A          B  C  D    E
1  1.0 2018-01-07  1  3   he
3  1.0 2018-01-07  1  3   it
0  1.0 2018-01-07  1  3   me
2  1.0 2018-01-07  1  3  she
df
a b c d
2018-01-07 -0.005483 -0.006776 1.410138 -0.447714
2018-01-08 -0.070957 1.444326 -0.152151 -0.129219
2018-01-09 0.371654 -0.052031 1.520965 0.697403
2018-01-10 -0.940761 0.339332 0.305061 -1.076497
2018-01-11 0.854794 -0.477876 -0.776903 1.134447
2018-01-12 0.906029 -0.493668 -1.983745 -1.260175
#选择列
print df['a'],df.a
2018-01-07   -0.005483
2018-01-08   -0.070957
2018-01-09    0.371654
2018-01-10   -0.940761
2018-01-11    0.854794
2018-01-12    0.906029
Freq: D, Name: a, dtype: float64 2018-01-07   -0.005483
2018-01-08   -0.070957
2018-01-09    0.371654
2018-01-10   -0.940761
2018-01-11    0.854794
2018-01-12    0.906029
Freq: D, Name: a, dtype: float64
#选择行,切片
print df[0:2]
                   a         b         c         d
2018-01-07 -0.005483 -0.006776  1.410138 -0.447714
2018-01-08 -0.070957  1.444326 -0.152151 -0.129219
print df['20180107':'20180110']
                   a         b         c         d
2018-01-07 -0.005483 -0.006776  1.410138 -0.447714
2018-01-08 -0.070957  1.444326 -0.152151 -0.129219
2018-01-09  0.371654 -0.052031  1.520965  0.697403
2018-01-10 -0.940761  0.339332  0.305061 -1.076497
#通过标签选择:loc
print df.loc['20180110']
a   -0.940761
b    0.339332
c    0.305061
d   -1.076497
Name: 2018-01-10 00:00:00, dtype: float64
#选择a和b两列的所有行数据
print df.loc[:,['a','b']]
                   a         b
2018-01-07 -0.005483 -0.006776
2018-01-08 -0.070957  1.444326
2018-01-09  0.371654 -0.052031
2018-01-10 -0.940761  0.339332
2018-01-11  0.854794 -0.477876
2018-01-12  0.906029 -0.493668
#通过位置选择:iloc
print df.iloc[3]
a   -0.940761
b    0.339332
c    0.305061
d   -1.076497
Name: 2018-01-10 00:00:00, dtype: float64
print df.iloc[3,1]
0.33933171911
print df.iloc[1:3,2:3]
                   c
2018-01-08 -0.152151
2018-01-09  1.520965
print df.iloc[[1,3,5],2:3]
                   c
2018-01-08 -0.152151
2018-01-10  0.305061
2018-01-12 -1.983745
#综合选择(mix loc and iloc):ix(注意:ix自pandas 0.20起已弃用,并在1.0中移除,新版本请改用loc/iloc)
print df.ix[:3,['a','b']]
                   a         b
2018-01-07 -0.005483 -0.006776
2018-01-08 -0.070957  1.444326
2018-01-09  0.371654 -0.052031
#是或否筛选(Boolean indexing)
print df
                   a         b         c         d
2018-01-07 -0.005483 -0.006776  1.410138 -0.447714
2018-01-08 -0.070957  1.444326 -0.152151 -0.129219
2018-01-09  0.371654 -0.052031  1.520965  0.697403
2018-01-10 -0.940761  0.339332  0.305061 -1.076497
2018-01-11  0.854794 -0.477876 -0.776903  1.134447
2018-01-12  0.906029 -0.493668 -1.983745 -1.260175
print df[df.a>0]
                   a         b         c         d
2018-01-09  0.371654 -0.052031  1.520965  0.697403
2018-01-11  0.854794 -0.477876 -0.776903  1.134447
2018-01-12  0.906029 -0.493668 -1.983745 -1.260175
#总共4种筛选方式
#赋值
df.iloc[3,3] = 11
print df
                   a         b         c          d
2018-01-07 -0.005483 -0.006776  1.410138  -0.447714
2018-01-08 -0.070957  1.444326 -0.152151  -0.129219
2018-01-09  0.371654 -0.052031  1.520965   0.697403
2018-01-10 -0.940761  0.339332  0.305061  11.000000
2018-01-11  0.854794 -0.477876 -0.776903   1.134447
2018-01-12  0.906029 -0.493668 -1.983745  -1.260175
df.a[df.a<0] = 0
print df
                   a         b         c          d
2018-01-07  0.000000 -0.006776  1.410138  -0.447714
2018-01-08  0.000000  1.444326 -0.152151  -0.129219
2018-01-09  0.371654 -0.052031  1.520965   0.697403
2018-01-10  0.000000  0.339332  0.305061  11.000000
2018-01-11  0.854794 -0.477876 -0.776903   1.134447
2018-01-12  0.906029 -0.493668 -1.983745  -1.260175
#添加一列(赋值时按索引对齐:Series的索引必须与df的索引一致,否则该列全为NaN)
df['e'] = pd.Series([1,2,3,4,5,6])
print df
                   a         b         c          d   e
2018-01-07  0.000000 -0.006776  1.410138  -0.447714 NaN
2018-01-08  0.000000  1.444326 -0.152151  -0.129219 NaN
2018-01-09  0.371654 -0.052031  1.520965   0.697403 NaN
2018-01-10  0.000000  0.339332  0.305061  11.000000 NaN
2018-01-11  0.854794 -0.477876 -0.776903   1.134447 NaN
2018-01-12  0.906029 -0.493668 -1.983745  -1.260175 NaN
df['e'] = pd.Series([1,2,3,4,5,6],index = pd.date_range('20180107',periods=6))
print df
                   a         b         c          d  e
2018-01-07  0.000000 -0.006776  1.410138  -0.447714  1
2018-01-08  0.000000  1.444326 -0.152151  -0.129219  2
2018-01-09  0.371654 -0.052031  1.520965   0.697403  3
2018-01-10  0.000000  0.339332  0.305061  11.000000  4
2018-01-11  0.854794 -0.477876 -0.776903   1.134447  5
2018-01-12  0.906029 -0.493668 -1.983745  -1.260175  6

猜你喜欢

转载自blog.csdn.net/u013817676/article/details/78995210