import pandas as pd
import numpy as np
pandas
1. Series 数据结构(Series 是带有标签的一维数组,可以保存任何数据类型(整数,字符串,浮点数,Python对象等),轴标签统称为索引)
1.1 创建对象
1.1.1 由字典创建,字典的key就是index,values就是values
# Pair each label with its value; the dict keys become the Series index
# and the dict values become the Series values.
dic = dict(zip(['a', 'b', 'c', '4', '5'], range(1, 6)))
pd.Series(dic)
4 4
5 5
a 1
b 2
c 3
dtype: int64
1.1.2 由数组创建(一维数组)
arr = np.random.randn(5)
arr
array([-0.07012061, 0.11267954, -0.39431225, 1.01689252, -1.09012858])
# Wrap the array in a Series with explicit labels, stored as Python objects.
# The builtin `object` replaces the `np.object` alias, which NumPy deprecated
# in 1.20 and removed in 1.24 (using it now raises AttributeError).
pd.Series(arr, index=['a', 'b', 'c', 'd', 'e'], dtype=object)
a -0.0701206
b 0.11268
c -0.394312
d 1.01689
e -1.09013
dtype: object
1.1.3 由标量创建
pd.Series(10, index = range(4))
0 10
1 10
2 10
3 10
dtype: int64
1.2 属性
s = pd.Series(np.random.randn(5),name= "test")
s
0 1.157269
1 -0.869221
2 -1.288676
3 -0.313955
4 -0.366169
Name: test, dtype: float64
1.2.1 name
s.name
'test'
1.2.2 dtypes
s.dtypes
dtype('float64')
1.2.3 对数据快速统计汇总
s.describe()
count 5.000000
mean -0.336150
std 0.925090
min -1.288676
25% -0.869221
50% -0.366169
75% -0.313955
max 1.157269
Name: test, dtype: float64
1.2.4 排序
s.sort_values()
2 -1.288676
1 -0.869221
4 -0.366169
3 -0.313955
0 1.157269
Name: test, dtype: float64
s.sort_index()
0 1.157269
1 -0.869221
2 -1.288676
3 -0.313955
4 -0.366169
Name: test, dtype: float64
1.3 索引
s = pd.Series(np.random.rand(5),index = ['a','b','c','d','e'])
s
a 0.801643
b 0.915875
c 0.759831
d 0.017935
e 0.989988
dtype: float64
1.3.1 位置下标
# Select the first element by position. `.iloc` is explicit; plain `s[0]` on a
# string-labelled index relies on the integer-key positional fallback that
# pandas 2.1 deprecated.
s.iloc[0]
0.80164301518126946
1.3.2 标签索引
s["a"]
0.80164301518126946
s[["a","c"]]
a 0.801643
c 0.759831
dtype: float64
1.3.3 切片索引
s[1:3]
b 0.915875
c 0.759831
dtype: float64
s["a":"b"]
a 0.801643
b 0.915875
dtype: float64
1.3.4 布尔型索引
s[s>0.8]
a 0.801643
b 0.915875
e 0.989988
dtype: float64
1.4 基本技巧
s = pd.Series(np.random.rand(50))
1.4.1 数据查看
s.head()
0 0.782876
1 0.051283
2 0.649146
3 0.142833
4 0.383419
dtype: float64
s.tail()
45 0.755592
46 0.844982
47 0.726627
48 0.378381
49 0.201560
dtype: float64
1.4.2 重新索引
s = pd.Series(np.random.rand(3), index = ['a','b','c'])
s
a 0.572041
b 0.638441
c 0.209887
dtype: float64
s1 = s.reindex(['c','b','a','d'])
s1
c 0.209887
b 0.638441
a 0.572041
d NaN
dtype: float64
s2 = s.reindex(['c','b','a','d'], fill_value = 0)
s2
c 0.209887
b 0.638441
a 0.572041
d 0.000000
dtype: float64
1.4.3 对齐
s1 = pd.Series(np.random.rand(3), index = ['Jack','Marry','Tom'])
s2 = pd.Series(np.random.rand(3), index = ['Wang','Jack','Marry'])
s1
Jack 0.266121
Marry 0.899194
Tom 0.629672
dtype: float64
s2
Wang 0.350321
Jack 0.602482
Marry 0.081977
dtype: float64
s1+s2
Jack 0.868602
Marry 0.981171
Tom NaN
Wang NaN
dtype: float64
1.4.4 添加、修改、删除
s = pd.Series(np.random.rand(5), index = list('ngjur'))
s
n 0.722115
g 0.999095
j 0.350186
u 0.763943
r 0.944230
dtype: float64
s.drop("n")
g 0.999095
j 0.350186
u 0.763943
r 0.944230
dtype: float64
s.drop(["g","n"])
j 0.350186
u 0.763943
r 0.944230
dtype: float64
# Join two Series end to end; each keeps its own labels in the result.
# `pd.concat` replaces `Series.append`, which was removed in pandas 2.0.
s1 = pd.Series(np.random.rand(5))
s2 = pd.Series(np.random.rand(5), index=list('ngjur'))
pd.concat([s1, s2])
0 0.716950
1 0.382762
2 0.518129
3 0.849587
4 0.322931
n 0.321734
g 0.818017
j 0.129185
u 0.134461
r 0.327531
dtype: float64
# Overwrite values in place by label, printing the Series before and after
# so the modification is visible.
s = pd.Series(np.random.rand(3), index=list('abc'))
print(s)
s.loc['a'] = 100
s.loc[['b', 'c']] = 200
print(s)
a 0.114684
b 0.491650
c 0.482090
dtype: float64
a 100.0
b 200.0
c 200.0
dtype: float64
2. DataFrame 数据结构(DataFrame 是一个表格型的数据结构,即“带有标签的二维数组”;DataFrame 带有 index(行标签)和 columns(列标签))
2.1 创建对象
2.1.1 数组/list组成的字典
# Each key becomes a column label; each list holds that column's values,
# three consecutive integers starting at 1, 3 and 5 respectively.
data1 = {col: list(range(first, first + 3)) for col, first in zip('abc', (1, 3, 5))}
pd.DataFrame(data1)
|
a |
b |
c |
0 |
1 |
3 |
5 |
1 |
2 |
4 |
6 |
2 |
3 |
5 |
7 |
2.1.2 Series组成的字典
# Columns built from Series of different lengths (2 and 3); the DataFrame
# constructor aligns them on the index and fills the gap with NaN.
data1 = {
    label: pd.Series(np.random.rand(size))
    for label, size in (('one', 2), ('two', 3))
}
pd.DataFrame(data1)
|
one |
two |
0 |
0.063242 |
0.413140 |
1 |
0.738629 |
0.572936 |
2 |
NaN |
0.153727 |
2.1.3 二维数组直接创建
# Draw a 3x3 array of uniform samples from [0, 1) and display it.
ar = np.random.rand(3, 3)
ar
array([[ 0.69920788, 0.63388493, 0.95545456],
[ 0.85046889, 0.62151678, 0.95159924],
[ 0.36499264, 0.09285466, 0.3064868 ]])
pd.DataFrame(ar,index = ['a', 'b', 'c'], columns = ['one','two','three'])
|
one |
two |
three |
a |
0.699208 |
0.633885 |
0.955455 |
b |
0.850469 |
0.621517 |
0.951599 |
c |
0.364993 |
0.092855 |
0.306487 |
2.1.4 由字典组成的列表
data = [{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}]
data
[{'one': 1, 'two': 2}, {'one': 5, 'three': 20, 'two': 10}]
pd.DataFrame(data)
|
one |
three |
two |
0 |
1 |
NaN |
2 |
1 |
5 |
20.0 |
10 |
2.1.5 由字典组成的字典
# Outer keys become column labels and inner keys become row labels;
# 'Tom' has no 'art' entry, so that cell is missing in the DataFrame.
data = {
    'Jack': dict(math=90, english=89, art=78),
    'Marry': dict(math=82, english=95, art=92),
    'Tom': dict(math=78, english=67),
}
pd.DataFrame(data)
|
Jack |
Marry |
Tom |
art |
78 |
92 |
NaN |
english |
89 |
95 |
67.0 |
math |
90 |
82 |
78.0 |
2.2 索引
2.2.1 选择列
# 3x4 frame of uniform samples scaled to [0, 100), with string row and
# column labels used by the selection examples that follow.
df = pd.DataFrame(
    np.random.rand(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=list('abcd'),
)
df
|
a |
b |
c |
d |
one |
44.519620 |
50.015625 |
30.079694 |
42.898252 |
two |
10.109428 |
55.797363 |
53.288322 |
4.932462 |
three |
82.247965 |
78.284970 |
69.452170 |
27.152858 |
df[["a","c"]]
|
a |
c |
one |
44.519620 |
30.079694 |
two |
10.109428 |
53.288322 |
three |
82.247965 |
69.452170 |
df.loc[:,["a","c"]]
|
a |
c |
one |
44.519620 |
30.079694 |
two |
10.109428 |
53.288322 |
three |
82.247965 |
69.452170 |
df.iloc[:,[0,2]]
|
a |
c |
one |
44.519620 |
30.079694 |
two |
10.109428 |
53.288322 |
three |
82.247965 |
69.452170 |
2.2.2 选择行
df.loc["one"]
a 44.519620
b 50.015625
c 30.079694
d 42.898252
Name: one, dtype: float64
df.loc[["one","two"]]
|
a |
b |
c |
d |
one |
44.519620 |
50.015625 |
30.079694 |
42.898252 |
two |
10.109428 |
55.797363 |
53.288322 |
4.932462 |
df.iloc[0]
a 44.519620
b 50.015625
c 30.079694
d 42.898252
Name: one, dtype: float64
df.iloc[[0,1]]
|
a |
b |
c |
d |
one |
44.519620 |
50.015625 |
30.079694 |
42.898252 |
two |
10.109428 |
55.797363 |
53.288322 |
4.932462 |
2.2.3 切片
df
|
a |
b |
c |
d |
one |
44.519620 |
50.015625 |
30.079694 |
42.898252 |
two |
10.109428 |
55.797363 |
53.288322 |
4.932462 |
three |
82.247965 |
78.284970 |
69.452170 |
27.152858 |
df.loc["one":"two","a":"c"]
|
a |
b |
c |
one |
44.519620 |
50.015625 |
30.079694 |
two |
10.109428 |
55.797363 |
53.288322 |
df.iloc[0:2,0:3]
|
a |
b |
c |
one |
44.519620 |
50.015625 |
30.079694 |
two |
10.109428 |
55.797363 |
53.288322 |
2.2.4 布尔判断
df[df<40]
|
a |
b |
c |
d |
one |
NaN |
NaN |
30.079694 |
NaN |
two |
10.109428 |
NaN |
NaN |
4.932462 |
three |
NaN |
NaN |
NaN |
27.152858 |
2.3 基本技巧
2.3.1 数据查看、转置
# 8x2 frame of uniform samples scaled to [0, 100) with the default
# integer row index.
df = pd.DataFrame(np.random.rand(8, 2) * 100, columns=list('ab'))
df
|
a |
b |
0 |
93.694076 |
10.585479 |
1 |
20.906019 |
0.805435 |
2 |
60.688091 |
44.387455 |
3 |
94.554004 |
11.026580 |
4 |
51.196744 |
60.110108 |
5 |
49.554107 |
77.915304 |
6 |
4.947558 |
90.967949 |
7 |
13.152346 |
96.102279 |
df.head(2)
|
a |
b |
0 |
93.694076 |
10.585479 |
1 |
20.906019 |
0.805435 |
df.tail(2)
|
a |
b |
6 |
4.947558 |
90.967949 |
7 |
13.152346 |
96.102279 |
df.T
|
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
a |
93.694076 |
20.906019 |
60.688091 |
94.554004 |
51.196744 |
49.554107 |
4.947558 |
13.152346 |
b |
10.585479 |
0.805435 |
44.387455 |
11.026580 |
60.110108 |
77.915304 |
90.967949 |
96.102279 |
2.3.2 添加、修改、删除值
# 4x4 frame of uniform samples scaled to [0, 100); the add/modify/drop
# examples below operate on it.
df = pd.DataFrame(np.random.rand(4, 4) * 100, columns=list('abcd'))
df
|
a |
b |
c |
d |
0 |
72.752063 |
26.046342 |
12.064456 |
98.062747 |
1 |
11.057977 |
80.180406 |
37.311464 |
36.185763 |
2 |
20.363733 |
44.369824 |
94.950827 |
55.851955 |
3 |
79.797658 |
53.622336 |
31.726099 |
83.414271 |
df["e"] = 20
df
|
a |
b |
c |
d |
e |
0 |
72.752063 |
26.046342 |
12.064456 |
98.062747 |
20 |
1 |
11.057977 |
80.180406 |
37.311464 |
36.185763 |
20 |
2 |
20.363733 |
44.369824 |
94.950827 |
55.851955 |
20 |
3 |
79.797658 |
53.622336 |
31.726099 |
83.414271 |
20 |
df[['a','c']] = 100
df
|
a |
b |
c |
d |
e |
0 |
100 |
26.046342 |
100 |
98.062747 |
20 |
1 |
100 |
80.180406 |
100 |
36.185763 |
20 |
2 |
100 |
44.369824 |
100 |
55.851955 |
20 |
3 |
100 |
53.622336 |
100 |
83.414271 |
20 |
df.drop([1,2])
|
a |
b |
c |
d |
e |
0 |
100 |
26.046342 |
100 |
98.062747 |
20 |
3 |
100 |
53.622336 |
100 |
83.414271 |
20 |
df.drop(["d"],axis=1)
|
a |
b |
c |
e |
0 |
100 |
26.046342 |
100 |
20 |
1 |
100 |
80.180406 |
100 |
20 |
2 |
100 |
44.369824 |
100 |
20 |
3 |
100 |
53.622336 |
100 |
20 |
2.3.3 对齐
# Two frames of standard-normal samples with different shapes (10x4 and
# 7x3) that share some, but not all, row and column labels.
df1 = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
df2 = pd.DataFrame(np.random.randn(7, 3), columns=list('ABC'))
df1
|
A |
B |
C |
D |
0 |
1.089101 |
0.371419 |
-0.096348 |
0.397983 |
1 |
-0.979242 |
-0.849951 |
0.054136 |
-1.596409 |
2 |
-0.133808 |
-1.436406 |
-0.062871 |
0.376788 |
3 |
-0.676031 |
-0.157631 |
-0.533043 |
-0.510467 |
4 |
0.390883 |
-1.253727 |
0.177204 |
-0.002852 |
5 |
0.825071 |
-0.163355 |
-1.204615 |
0.742660 |
6 |
1.377831 |
-1.170601 |
-0.734310 |
-1.271898 |
7 |
1.163899 |
0.069660 |
-0.889569 |
-1.143764 |
8 |
-1.770280 |
0.073562 |
-1.331347 |
0.158275 |
9 |
0.769114 |
-1.269013 |
-0.830343 |
-0.615827 |
df2
|
A |
B |
C |
0 |
0.064734 |
-0.016315 |
0.251051 |
1 |
0.080493 |
-0.621427 |
-0.362038 |
2 |
0.552462 |
-0.429362 |
-0.145449 |
3 |
0.970827 |
2.155149 |
-0.748711 |
4 |
-0.641491 |
-1.133494 |
1.383980 |
5 |
0.540944 |
0.905777 |
0.703850 |
6 |
-2.282045 |
-0.097482 |
-1.760575 |
df1 + df2
|
A |
B |
C |
D |
0 |
1.153836 |
0.355103 |
0.154703 |
NaN |
1 |
-0.898749 |
-1.471378 |
-0.307902 |
NaN |
2 |
0.418654 |
-1.865768 |
-0.208320 |
NaN |
3 |
0.294796 |
1.997519 |
-1.281754 |
NaN |
4 |
-0.250609 |
-2.387221 |
1.561184 |
NaN |
5 |
1.366015 |
0.742422 |
-0.500765 |
NaN |
6 |
-0.904214 |
-1.268082 |
-2.494885 |
NaN |
7 |
NaN |
NaN |
NaN |
NaN |
8 |
NaN |
NaN |
NaN |
NaN |
9 |
NaN |
NaN |
NaN |
NaN |
2.3.4 排序
# 4x4 frame of uniform samples scaled to [0, 100), used to demonstrate
# sorting by column values.
df = pd.DataFrame(np.random.rand(4, 4) * 100, columns=list('abcd'))
df
|
a |
b |
c |
d |
0 |
5.152197 |
49.079658 |
69.335822 |
75.125638 |
1 |
83.086432 |
55.414763 |
9.856352 |
18.925750 |
2 |
30.280855 |
77.819176 |
6.757983 |
59.269951 |
3 |
21.799211 |
0.693387 |
69.473753 |
84.004438 |
df.sort_values(["a"],ascending=True)
|
a |
b |
c |
d |
0 |
5.152197 |
49.079658 |
69.335822 |
75.125638 |
3 |
21.799211 |
0.693387 |
69.473753 |
84.004438 |
2 |
30.280855 |
77.819176 |
6.757983 |
59.269951 |
1 |
83.086432 |
55.414763 |
9.856352 |
18.925750 |
df.sort_values(["a","c"])
|
a |
b |
c |
d |
0 |
5.152197 |
49.079658 |
69.335822 |
75.125638 |
3 |
21.799211 |
0.693387 |
69.473753 |
84.004438 |
2 |
30.280855 |
77.819176 |
6.757983 |
59.269951 |
1 |
83.086432 |
55.414763 |
9.856352 |
18.925750 |
# 4x4 frame of uniform samples scaled to [0, 100) whose row labels run in
# descending order (5, 4, 3, 2), so sorting by index visibly reorders it.
df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=list(range(5, 1, -1)),
    columns=list('abcd'),
)
df
|
a |
b |
c |
d |
5 |
71.176835 |
65.530367 |
52.849498 |
69.301327 |
4 |
68.153738 |
68.172008 |
46.080072 |
86.103846 |
3 |
86.816305 |
24.459884 |
53.673947 |
80.592007 |
2 |
81.356156 |
47.900072 |
85.548738 |
19.770766 |
df.sort_index()
|
a |
b |
c |
d |
2 |
81.356156 |
47.900072 |
85.548738 |
19.770766 |
3 |
86.816305 |
24.459884 |
53.673947 |
80.592007 |
4 |
68.153738 |
68.172008 |
46.080072 |
86.103846 |
5 |
71.176835 |
65.530367 |
52.849498 |
69.301327 |