Common Python libraries: pandas

pandas is a data-processing library built on top of NumPy, and it is one of the most commonly used libraries in Python. This note covers three topics:

  1. Series
  2. DataFrame
  3. Index

1. Series

>>> import pandas as pd
>>> import numpy as np
>>> s = pd.Series([7,'beijing',2.1,2,'happy'])
>>> s
0          7
1    beijing
2        2.1
3          2
4      happy
dtype: object
>>> s = pd.Series([7,'beijing',2.1,2,'happy'],index = ['A','B','C','D','E']) #Specify a custom index
>>> s
A          7
B    beijing
C        2.1
D          2
E      happy
dtype: object
>>> type(s)
<class 'pandas.core.series.Series'>
>>> cities = {'beijing':55000,'shanghai':60000,'shenzhen':40000,'guangzhou':25000}
>>> cities
{'beijing': 55000, 'shanghai': 60000, 'shenzhen': 40000, 'guangzhou': 25000}
>>> apts = pd.Series(cities) #Use a dict to define a Series: the keys become the index labels and the values become the data
>>> apts
beijing      55000
guangzhou    25000
shanghai     60000
shenzhen     40000
dtype: int64
>>> apts[['beijing','shenzhen']]
beijing     55000
shenzhen    40000
dtype: int64
>>> apts[apts<50000]
guangzhou    25000
shenzhen     40000
dtype: int64
>>> 'beijing' in apts #Check whether a label is in the index of the Series (values are not searched)
True

>>> apts[apts.isnull()] #View elements whose value is null
Series([], dtype: int64)

>>> apts[apts.notnull()] #View elements whose value is not null
beijing      55000
guangzhou    25000
shanghai     60000
shenzhen     40000
dtype: int64

2. DataFrame

A Series is a one-dimensional labeled array; a DataFrame is a two-dimensional table in which every column is a Series and all columns share the same row index.

>>> import pandas as pd
>>> import numpy as np
>>> data = {'cities':['beijing','shanghai','guangzhou','shenzhen'],'years':[2014,2015,2016,2017],'population':[1000,2000,3000,4000]}
>>> type(pd.DataFrame(data))
<class 'pandas.core.frame.DataFrame'>
>>> pd.DataFrame(data) #DataFrame initialization
      cities  population  years
0    beijing        1000   2014
1   shanghai        2000   2015
2  guangzhou        3000   2016
3   shenzhen        4000   2017
>>> pd.DataFrame(data,columns=['years','cities','population'])#Specify the order of columns
   years     cities  population
0   2014    beijing        1000
1   2015   shanghai        2000
2   2016  guangzhou        3000
3   2017   shenzhen        4000
>>> pd.DataFrame(data,columns=['years','cities','population'],index=['one','two','three','foue'])#Specify the row labels (index)
       years     cities  population
one     2014    beijing        1000
two     2015   shanghai        2000
three   2016  guangzhou        3000
foue    2017   shenzhen        4000
>>> frame2 = pd.DataFrame(data,columns=['years','cities','population'],index=['one','two','three','foue'])
>>> frame2
       years     cities  population
one     2014    beijing        1000
two     2015   shanghai        2000
three   2016  guangzhou        3000
foue    2017   shenzhen        4000
>>> frame2['cities']#Select a column
one        beijing
two       shanghai
three    guangzhou
foue      shenzhen
Name: cities, dtype: object
>>> frame2.cities
one        beijing
two       shanghai
three    guangzhou
foue      shenzhen
Name: cities, dtype: object
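>>> frame2.ix['three']# Select a row by label (note: .ix has been removed from current pandas; use .loc for labels and .iloc for positions)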
years              2016
cities        guangzhou
population         3000
Name: three, dtype: object
>>> frame2.ix[2]
years              2016
cities        guangzhou
population         3000
Name: three, dtype: object
>>> frame2['cities']['one'] = 'zhuhai'#Modify an element

Warning (from warnings module):
  File "__main__", line 1
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
>>> frame2
       years     cities  population
one     2014     zhuhai        1000
two     2015   shanghai        2000
three   2016  guangzhou        3000
foue    2017   shenzhen        4000
>>> frame2['population'] = 1#Modify a whole column
>>> frame2
       years     cities  population
one     2014     zhuhai           1
two     2015   shanghai           1
three   2016  guangzhou           1
foue    2017   shenzhen           1
>>> frame2.ix['five'] = 1#Add a new row: assigning to a label that does not exist yet appends it
>>> frame2
       years     cities  population
one     2014     zhuhai           1
two     2015   shanghai           1
three   2016  guangzhou           1
foue    2017   shenzhen           1
five       1          1           1
>>> frame2.ix['one'] = 2
>>> frame2
       years     cities  population
one        2          2           2
two     2015   shanghai           1
three   2016  guangzhou           1
foue    2017   shenzhen           1
five       1          1           1
>>> frame2.years = np.arange(5)#Use a NumPy array to modify a column
>>> frame2
       years     cities  population
one        0          2           2
two        1   shanghai           1
three      2  guangzhou           1
foue       3   shenzhen           1
five       4          1           1
>>> val = pd.Series([200,300,500],index=['two','three','five'])
>>> frame2['population'] = val#Use a Series to modify a column; values are aligned by index label, and rows without a match become NaN
>>> frame2
       years     cities  population
one        0          2         NaN
two        1   shanghai       200.0
three      2  guangzhou       300.0
foue       3   shenzhen         NaN
five       4          1       500.0
>>> frame2.columns
Index(['years', 'cities', 'population'], dtype='object')
>>> frame2.index
Index(['one', 'two', 'three', 'foue', 'five'], dtype='object')
>>> frame2.T#transpose
            one       two      three      foue five
years         0         1          2         3    4
cities        2  shanghai  guangzhou  shenzhen    1
population  NaN       200        300       NaN  500
>>> frame2['cities'][1:2]#Use slices to get elements
two    shanghai
Name: cities, dtype: object
>>>

3. Index

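Some common operations on the index: selecting by label or position, slicing, reindexing, dropping rows, and hierarchical (multi-level) indexes.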

>>> import pandas as pd
>>> import numpy as np
>>> obj = pd.Series(range(3))
>>> obj
0    0
1    1
2    2
dtype: int64
>>> obj = pd.Series(range(3),index=['a','b','c'])
>>> obj
a    0
b    1
c    2
dtype: int64
>>> obj[[0,2]]
a    0
c    2
dtype: int64
>>> obj[0:2]
a    0
b    1
dtype: int64
>>> obj['a':'c']
a    0
b    1
c    2
dtype: int64
>>> obj['a':'c'] = 3
>>> obj
a    3
b    3
c    3
dtype: int64
>>> frame = pd.DataFrame(np.arange(9).reshape(3,3),index = ['a','b','c'],columns=['beijing','shanghai','guangzhou'])
>>> frame
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.ix['a':'c']
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.ix[['a','c'],['beijing','guangzhou']]
   beijing  guangzhou
a        0          2
c        6          8
>>> frame.ix[:,'beijing':'guangzhou']
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.reindex(['e','f','g','h'])
   beijing  shanghai  guangzhou
e      NaN       NaN        NaN
f      NaN       NaN        NaN
g      NaN       NaN        NaN
h      NaN       NaN        NaN
>>> frame
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.drop('a')
   beijing  shanghai  guangzhou
b        3         4          5
c        6         7          8
>>> frame
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame = frame.drop('a')
>>> frame
   beijing  shanghai  guangzhou
b        3         4          5
c        6         7          8
>>> data = pd.Series(np.random.randn(10),index=[['a','a','a','b','b','c','c','c','d','d'],[1,2,3,1,2,1,2,3,1,2]])
>>> data
a  1   -0.060544
   2   -1.680403
   3    0.408582
b  1    1.001766
   2    1.320155
c  1   -1.125726
   2    1.508404
   3    0.640139
d  1    0.824988
   2    0.148888
dtype: float64
>>> data.index
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 2, 3, 3], [0, 1, 2, 0, 1, 0, 1, 2, 0, 1]])
>>> data['b':'d']
b  1    1.001766
   2    1.320155
c  1   -1.125726
   2    1.508404
   3    0.640139
d  1    0.824988
   2    0.148888
dtype: float64
>>> data[1:4]
a  2   -1.680403
   3    0.408582
b  1    1.001766
dtype: float64
>>> data.unstack()
          1         2         3
a -0.060544 -1.680403  0.408582
b  1.001766  1.320155       NaN
c -1.125726  1.508404  0.640139
d  0.824988  0.148888       NaN
>>> data.unstack().stack()
a  1   -0.060544
   2   -1.680403
   3    0.408582
b  1    1.001766
   2    1.320155
c  1   -1.125726
   2    1.508404
   3    0.640139
d  1    0.824988
   2    0.148888
dtype: float64
>>>


Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324508766&siteId=291194637