一、索引器
1.表的列索引
import numpy as np
import pandas as pd
df = pd. read_csv( r"C:\Users\zhoukaiwei\Desktop\joyful-pandas\data\learn_pandas.csv" ,
usecols = [ 'School' , 'Grade' , 'Name' , 'Gender' ,
'Weight' , 'Transfer' ] )
df[ 'Name' ] . head( )
0 Gaopeng Yang
1 Changqiang You
2 Mei Sun
3 Xiaojuan Sun
4 Gaojuan You
Name: Name, dtype: object
df[ [ 'Gender' , 'Name' ] ] . head( )
Gender
Name
0
Female
Gaopeng Yang
1
Male
Changqiang You
2
Male
Mei Sun
3
Female
Xiaojuan Sun
4
Male
Gaojuan You
df. Name. head( )
0 Gaopeng Yang
1 Changqiang You
2 Mei Sun
3 Xiaojuan Sun
4 Gaojuan You
Name: Name, dtype: object
2.序列的行索引
以字符串为索引的 Series
A = pd. Series( [ 1 , 2 , 3 , 4 , 5 , 6 ] , index= [ 'a' , 'b' , 'a' , 'a' , 'a' , 'c' ] )
A[ 'a' ]
a 1
a 3
a 4
dtype: int64
A[ [ 'c' , 'b' ] ]
c 6
b 5
dtype: int64
A = pd. Series( [ 1 , 2 , 3 , 4 , 5 , 6 ] , index= [ 'a' , 'b' , 'a' , 'a' , 'a' , 'c' ] )
A[ 'c' : 'b' : - 1 ]
c 6
a 5
a 4
a 3
b 2
dtype: int64
3. loc索引器
前面讲到了对 DataFrame 的列进行选取,下面要讨论其行的选取。对于表而言,有两种索引器,一种是基于 元素 的 loc 索引器,另一种是基于 位置 的 iloc 索引器。loc 索引器的一般形式是 loc[*, ] ,其中第一个 * 代表行的选择,第二个 * 代表列的选择,如果省略第二个位置写作 loc[ ] ,这个 * 是指行的筛选。其中, * 的位置一共有五类合法对象,分别是:单个元素、元素列表、元素切片、布尔列表以及函数,下面将依次说明。 为了演示相应操作,先利用 set_index 方法把 Name 列设为索引,关于该函数的其他用法将在多级索引一章介绍。
import numpy as np
import pandas as pd
df = pd. read_csv( r"C:\Users\zhoukaiwei\Desktop\joyful-pandas\data\learn_pandas.csv" ,
usecols = [ 'School' , 'Grade' , 'Name' , 'Gender' ,
'Weight' , 'Transfer' ] )
df_A = df. set_index( 'Name' )
df_A. head( )
School
Grade
Gender
Weight
Transfer
Name
Gaopeng Yang
Shanghai Jiao Tong University
Freshman
Female
46.0
N
Changqiang You
Peking University
Freshman
Male
70.0
N
Mei Sun
Shanghai Jiao Tong University
Senior
Male
89.0
N
Xiaojuan Sun
Fudan University
Sophomore
Female
41.0
N
Gaojuan You
Fudan University
Sophomore
Male
74.0
N
df_A. loc[ 'Qiang Sun' ]
School
Grade
Gender
Weight
Transfer
Name
Qiang Sun
Tsinghua University
Junior
Female
53.0
N
Qiang Sun
Tsinghua University
Sophomore
Female
40.0
N
Qiang Sun
Shanghai Jiao Tong University
Junior
Female
NaN
N
df_A. loc[ 'Quan Zhao' ]
School Shanghai Jiao Tong University
Grade Junior
Gender Female
Weight 53
Transfer N
Name: Quan Zhao, dtype: object
df_A. loc[ 'Qiang Sun' , 'School' ]
Name
Qiang Sun Tsinghua University
Qiang Sun Tsinghua University
Qiang Sun Shanghai Jiao Tong University
Name: School, dtype: object
df_A. loc[ [ 'Qiang Sun' , 'Quan Zhao' ] , [ 'School' , 'Gender' ] ]
School
Gender
Name
Qiang Sun
Tsinghua University
Female
Qiang Sun
Tsinghua University
Female
Qiang Sun
Shanghai Jiao Tong University
Female
Quan Zhao
Shanghai Jiao Tong University
Female
df_A. loc[ 'Gaojuan You' : 'Gaoqiang Qian' , 'School' : 'Gender' ]
School
Grade
Gender
Name
Gaojuan You
Fudan University
Sophomore
Male
Xiaoli Qian
Tsinghua University
Freshman
Female
Qiang Chu
Shanghai Jiao Tong University
Freshman
Female
Gaoqiang Qian
Tsinghua University
Junior
Female
在实际的数据处理中,根据条件来筛选行是极其常见的,此处传入 loc 的布尔列表与 DataFrame 长度相同,且列表为 True 的位置所对应的行会被选中, False 则会被剔除。
df_A. loc[ df_A. Weight> 70 ] . head( )
School
Grade
Gender
Weight
Transfer
Name
Mei Sun
Shanghai Jiao Tong University
Senior
Male
89.0
N
Gaojuan You
Fudan University
Sophomore
Male
74.0
N
Xiaopeng Zhou
Shanghai Jiao Tong University
Freshman
Male
74.0
N
Xiaofeng Sun
Tsinghua University
Senior
Male
71.0
N
Qiang Zheng
Shanghai Jiao Tong University
Senior
Male
87.0
N
df_A. loc[ df_A. Grade. isin( [ 'Freshman' , 'Senior' ] ) ] . head( )
School
Grade
Gender
Weight
Transfer
Name
Gaopeng Yang
Shanghai Jiao Tong University
Freshman
Female
46.0
N
Changqiang You
Peking University
Freshman
Male
70.0
N
Mei Sun
Shanghai Jiao Tong University
Senior
Male
89.0
N
Xiaoli Qian
Tsinghua University
Freshman
Female
51.0
N
Qiang Chu
Shanghai Jiao Tong University
Freshman
Female
52.0
N
def condition ( x) :
condition_1_1 = x. School == 'Fudan University'
condition_1_2 = x. Grade == 'Senior'
condition_1_3 = x. Weight > 70
condition_1 = condition_1_1 & condition_1_2 & condition_1_3
condition_2_1 = x. School == 'Peking University'
condition_2_2 = x. Grade == 'Senior'
condition_2_3 = x. Weight > 80
condition_2 = condition_2_1 & ( ~ condition_2_2) & condition_2_3
result = condition_1 | condition_2
return result
df_A. loc[ condition]
School
Grade
Gender
Weight
Transfer
Name
Qiang Han
Peking University
Freshman
Male
87.0
N
Chengpeng Zhou
Fudan University
Senior
Male
81.0
N
Changpeng Zhao
Peking University
Freshman
Male
83.0
N
Chengpeng Qian
Fudan University
Senior
Male
73.0
Y
df_A. loc[ lambda x: slice ( 'Gaojuan You' , 'Gaoqiang Qian' ) ]
School
Grade
Gender
Weight
Transfer
Name
Gaojuan You
Fudan University
Sophomore
Male
74.0
N
Xiaoli Qian
Tsinghua University
Freshman
Female
51.0
N
Qiang Chu
Shanghai Jiao Tong University
Freshman
Female
52.0
N
Gaoqiang Qian
Tsinghua University
Junior
Female
50.0
N
4. iloc索引器
iloc 的使用与 loc 完全类似,只不过是针对位置进行筛选,在相应的 * 位置处一共也有五类 合法对象,分别是:整数、整数列表、整数切片、布尔列表以及函数,函数的返回值必须是前面 的四类合法对象中的一个,其输入同样也为 DataFrame 本身。
df_A. iloc[ 1 , 1 ]
'Freshman'
df_A. iloc[ [ 0 , 1 ] , [ 0 , 1 ] ]
School
Grade
Name
Gaopeng Yang
Shanghai Jiao Tong University
Freshman
Changqiang You
Peking University
Freshman
df_A. iloc[ 1 : 4 , 2 : 4 ]
Gender
Weight
Name
Changqiang You
Male
70.0
Mei Sun
Male
89.0
Xiaojuan Sun
Female
41.0
5. query方法
df. query( '((School == "Fudan University")&(Grade == "Senior")&(Weight > 70))' )
School
Grade
Name
Gender
Weight
Transfer
66
Fudan University
Senior
Chengpeng Zhou
Male
81.0
N
131
Fudan University
Senior
Chengpeng Qian
Male
73.0
Y
df. query( 'Grade == ["Junior", "Senior"]' ) . head( )
School
Grade
Name
Gender
Weight
Transfer
2
Shanghai Jiao Tong University
Senior
Mei Sun
Male
89.0
N
7
Tsinghua University
Junior
Gaoqiang Qian
Female
50.0
N
9
Peking University
Junior
Juan Xu
Female
NaN
N
11
Tsinghua University
Junior
Xiaoquan Lv
Female
43.0
N
12
Shanghai Jiao Tong University
Senior
Peng You
Female
48.0
NaN
low, high = 70 , 80
df. query( 'Weight.between(@low, @high)' ) . head( )
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-73-7b9719bb38f0> in <module>
1 low, high =70, 80
----> 2 df.query('Weight.between(@low, @high)').head()
D:\Anaconda3\lib\site-packages\pandas\core\frame.py in query(self, expr, inplace, **kwargs)
3343 kwargs["level"] = kwargs.pop("level", 0) + 1
3344 kwargs["target"] = None
-> 3345 res = self.eval(expr, **kwargs)
3346
3347 try:
D:\Anaconda3\lib\site-packages\pandas\core\frame.py in eval(self, expr, inplace, **kwargs)
3473 kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
3474
-> 3475 return _eval(expr, inplace=inplace, **kwargs)
3476
3477 def select_dtypes(self, include=None, exclude=None) -> "DataFrame":
D:\Anaconda3\lib\site-packages\pandas\core\computation\eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
344 eng = _engines[engine]
345 eng_inst = eng(parsed_expr)
--> 346 ret = eng_inst.evaluate()
347
348 if parsed_expr.assigner is None:
D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in evaluate(self)
71
72 # make sure no names in resolvers and locals/globals clash
---> 73 res = self._evaluate()
74 return reconstruct_object(
75 self.result_type, res, self.aligned_axes, self.expr.terms.return_type
D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in _evaluate(self)
111 env = self.expr.env
112 scope = env.full_scope
--> 113 _check_ne_builtin_clash(self.expr)
114 return ne.evaluate(s, local_dict=scope)
115
D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in _check_ne_builtin_clash(expr)
27 Terms can contain
28 """
---> 29 names = expr.names
30 overlap = names & _ne_builtins
31
D:\Anaconda3\lib\site-packages\pandas\core\computation\expr.py in names(self)
812 """
813 if is_term(self.terms):
--> 814 return frozenset([self.terms.name])
815 return frozenset(term.name for term in com.flatten(self.terms))
816
D:\Anaconda3\lib\site-packages\pandas\core\generic.py in __hash__(self)
1667 def __hash__(self):
1668 raise TypeError(
-> 1669 f"{repr(type(self).__name__)} objects are mutable, "
1670 f"thus they cannot be hashed"
1671 )
TypeError: 'Series' objects are mutable, thus they cannot be hashed
6. 随机抽样
df_A = pd. DataFrame( {
'id' : list ( 'abcde' ) ,
'value' : [ 10 , 20 , 30 , 20 , 20 ] } )
df_A
id
value
0
a
10
1
b
20
2
c
30
3
d
20
4
e
20
df_A. sample( 3 , replace = True , weights = df_A. value)
id
value
3
d
20
2
c
30
4
e
20
二、多级索引
1. 多级索引及其表的结构
import numpy as np
import pandas as pd
np. random. seed( 0 )
multi_index = pd. MultiIndex. from_product( [ list ( 'ABCD' ) ,
df. Gender. unique( ) ] , names= ( 'School' , 'Gender' ) )
multi_column = pd. MultiIndex. from_product( [ [ 'Height' , 'Weight' ] ,
df. Grade. unique( ) ] , names= ( 'Indicator' , 'Grade' ) )
df_multi = pd. DataFrame( np. c_[ ( np. random. randn( 8 , 4 ) * 5 + 163 ) . tolist( ) ,
( np. random. randn( 8 , 4 ) * 5 + 65 ) . tolist( ) ] ,
index = multi_index,
columns = multi_column) . round ( 1 )
df_multi
Indicator
Height
Weight
Grade
Freshman
Senior
Sophomore
Junior
Freshman
Senior
Sophomore
Junior
School
Gender
A
Female
171.8
165.0
167.9
174.2
60.6
55.1
63.3
65.8
Male
172.3
158.1
167.8
162.2
71.2
71.0
63.1
63.5
B
Female
162.5
165.1
163.7
170.3
59.8
57.9
56.5
74.8
Male
166.8
163.6
165.2
164.7
62.5
62.8
58.7
68.9
C
Female
170.5
162.0
164.6
158.7
56.9
63.9
60.5
66.9
Male
150.2
166.3
167.3
159.3
62.4
59.1
64.9
67.1
D
Female
174.3
155.7
163.2
162.1
65.3
66.5
61.8
63.2
Male
170.7
170.3
163.8
164.9
61.6
63.2
60.9
56.4
df_multi. index. names
FrozenList(['School', 'Gender'])
df_multi. columns. names
FrozenList(['Indicator', 'Grade'])
df_multi. index. get_level_values( 0 )
Index(['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], dtype='object', name='School')
2. 多级索引中的loc索引器
熟悉了结构后,现在回到原表,将学校和年级设为索引,此时的行为多级索引,列为单级索引, 由于默认状态的列索引不含名字,因此对应于刚刚图中 Indicator 和 Grade 的索引名位置是空缺的。
df_multi = df. set_index( [ 'School' , 'Grade' ] )
df_multi. head( )
Name
Gender
Weight
Transfer
School
Grade
Shanghai Jiao Tong University
Freshman
Gaopeng Yang
Female
46.0
N
Peking University
Freshman
Changqiang You
Male
70.0
N
Shanghai Jiao Tong University
Senior
Mei Sun
Male
89.0
N
Fudan University
Sophomore
Xiaojuan Sun
Female
41.0
N
Sophomore
Gaojuan You
Male
74.0
N
由于多级索引中的单个元素以元组为单位,因此之前在第一节介绍的 loc 和 iloc 方法完全可以照搬 ,只需把标量的位置替换成对应的元组,不过在索引前最好对 MultiIndex 进行排序以避免性能警告:
df_multi = df_multi. sort_index( )
df_multi. loc[ ( 'Fudan University' , 'Junior' ) ] . head( )
Name
Gender
Weight
Transfer
School
Grade
Fudan University
Junior
Yanli You
Female
48.0
N
Junior
Chunqiang Chu
Male
72.0
N
Junior
Changfeng Lv
Male
76.0
N
Junior
Yanjuan Lv
Female
49.0
NaN
Junior
Gaoqiang Zhou
Female
43.0
N
df_multi. loc[ [ ( 'Fudan University' , 'Senior' ) ,
( 'Shanghai Jiao Tong University' , 'Freshman' ) ] ] . head( )
Name
Gender
Weight
Transfer
School
Grade
Fudan University
Senior
Chengpeng Zheng
Female
38.0
N
Senior
Feng Zhou
Female
47.0
N
Senior
Gaomei Lv
Female
34.0
N
Senior
Chunli Lv
Female
56.0
N
Senior
Chengpeng Zhou
Male
81.0
N
3. IndexSlice对象
前面介绍的方法,即使在索引不重复的时候,也只能对元组整体进行切片,而不能对每层进行切片, 也不允许将切片和布尔列表混合使用,引入 IndexSlice 对象就能解决这个问题。 Slice 对象一共 有两种形式,第一种为 loc[idx[, ]] 型,第二种为 loc[idx[, ],idx[, ]] 型, 下面将进行介绍。为了方便演示,下面构造一个 索引不重复的 DataFrame :
np. random. seed( 0 )
L1, L2 = [ 'A' , 'B' , 'C' ] , [ 'a' , 'b' , 'c' ]
mul_index1 = pd. MultiIndex. from_product( [ L1, L2] , names= ( 'Upper' , 'Lower' ) )
L3, L4 = [ 'D' , 'E' , 'F' ] , [ 'd' , 'e' , 'f' ]
mul_index2 = pd. MultiIndex. from_product( [ L3, L4] , names= ( 'Big' , 'Small' ) )
df_ex = pd. DataFrame( np. random. randint( - 9 , 10 , ( 9 , 9 ) ) , index= mul_index1, columns= mul_index2)
df_ex
Big
D
E
F
Small
d
e
f
d
e
f
d
e
f
Upper
Lower
A
a
3
6
-9
-6
-6
-2
0
9
-5
b
-3
3
-8
-3
-2
5
8
-4
4
c
-1
0
7
-4
6
6
-9
9
-6
B
a
8
5
-2
-9
-8
0
-9
1
-6
b
2
9
-7
-9
-9
-5
-4
-3
-1
c
8
6
-5
0
1
-8
-8
-2
0
C
a
-6
-3
2
5
9
-9
5
-6
3
b
1
2
-5
-3
-5
6
-6
3
-5
c
-1
5
6
-6
6
4
7
8
-4
index = pd. IndexSlice
df_ex. loc[ index[ 'C' : , ( 'D' , 'e' ) : ] ]
Big
D
E
F
Small
e
f
d
e
f
d
e
f
Upper
Lower
C
a
-3
2
5
9
-9
5
-6
3
b
2
-5
-3
-5
6
-6
3
-5
c
5
6
-6
6
4
7
8
-4
df_ex. loc[ index[ : 'A' , lambda x: x. sum ( ) > 0 ] ]
Big
D
F
Small
d
e
e
Upper
Lower
A
a
3
6
9
b
-3
3
-4
c
-1
0
9
df_ex. loc[ index[ : 'A' , 'b' : ] , index[ 'E' : , 'e' : ] ]
Big
E
F
Small
e
f
e
f
Upper
Lower
A
b
-2
5
-4
4
c
6
6
9
-6
4. 多级索引的构造
前面提到了多级索引表的结构和切片,那么除了使用 set_index 之外, 如何自己构造多级索引呢?常用的有 from_tuples, from_arrays, from_product 三种方法, 它们都是 pd.MultiIndex 对象下的函数。
my_A = [ ( 'a' , 'cat' ) , ( 'a' , 'dog' ) , ( 'b' , 'cat' ) , ( 'b' , 'dog' ) ]
pd. MultiIndex. from_tuples( my_A, names = [ 'First' , 'Second' ] )
MultiIndex([('a', 'cat'),
('a', 'dog'),
('b', 'cat'),
('b', 'dog')],
names=['First', 'Second'])
my_array = [ list ( 'aabb' ) , [ 'cat' , 'dog' ] * 2 ]
pd. MultiIndex. from_arrays( my_array, names= [ 'First' , 'Second' ] )
MultiIndex([('a', 'cat'),
('a', 'dog'),
('b', 'cat'),
('b', 'dog')],
names=['First', 'Second'])
list1 = [ 'a' , 'b' ]
list2 = [ 'cat' , 'dog' ]
pd. MultiIndex. from_product( [ list1, list2] , names = [ 'First' , 'Second' ] )
MultiIndex([('a', 'cat'),
('a', 'dog'),
('b', 'cat'),
('b', 'dog')],
names=['First', 'Second'])
三、索引的常用方法
1. 索引层的交换和删除
np. random. seed( 0 )
L1, L2, L3 = [ 'A' , 'B' ] , [ 'a' , 'b' ] , [ 'alpha' , 'beta' ]
m_index1 = pd. MultiIndex. from_product( [ L1, L2, L3] , names = ( 'Upper' , 'Lower' , 'Extra' ) )
L4, L5, L6 = [ 'C' , 'D' ] , [ 'c' , 'd' ] , [ 'cat' , 'dog' ]
m_index2 = pd. MultiIndex. from_product( [ L4, L5, L6] , names= ( 'Big' , 'Small' , 'Other' ) )
df_ex = pd. DataFrame( np. random. randint( - 9 , 10 , ( 8 , 8 ) ) , index= m_index1, columns= m_index2)
df_ex
Big
C
D
Small
c
d
c
d
Other
cat
dog
cat
dog
cat
dog
cat
dog
Upper
Lower
Extra
A
a
alpha
3
6
-9
-6
-6
-2
0
9
beta
-5
-3
3
-8
-3
-2
5
8
b
alpha
-4
4
-1
0
7
-4
6
6
beta
-9
9
-6
8
5
-2
-9
-8
B
a
alpha
0
-9
1
-6
2
9
-7
-9
beta
-9
-5
-4
-3
-1
8
6
-5
b
alpha
0
1
-8
-8
-2
0
-6
-3
beta
2
5
9
-9
5
-6
3
1
df_ex. swaplevel( 0 , 2 , axis= 1 ) . head( )
Other
cat
dog
cat
dog
cat
dog
cat
dog
Small
c
c
d
d
c
c
d
d
Big
C
C
C
C
D
D
D
D
Upper
Lower
Extra
A
a
alpha
3
6
-9
-6
-6
-2
0
9
beta
-5
-3
3
-8
-3
-2
5
8
b
alpha
-4
4
-1
0
7
-4
6
6
beta
-9
9
-6
8
5
-2
-9
-8
B
a
alpha
0
-9
1
-6
2
9
-7
-9
df_ex. reorder_levels( [ 2 , 0 , 1 ] , axis= 0 ) . head( )
Big
C
D
Small
c
d
c
d
Other
cat
dog
cat
dog
cat
dog
cat
dog
Extra
Upper
Lower
alpha
A
a
3
6
-9
-6
-6
-2
0
9
beta
A
a
-5
-3
3
-8
-3
-2
5
8
alpha
A
b
-4
4
-1
0
7
-4
6
6
beta
A
b
-9
9
-6
8
5
-2
-9
-8
alpha
B
a
0
-9
1
-6
2
9
-7
-9
df_ex. droplevel( 1 , axis= 1 )
Big
C
D
Other
cat
dog
cat
dog
cat
dog
cat
dog
Upper
Lower
Extra
A
a
alpha
3
6
-9
-6
-6
-2
0
9
beta
-5
-3
3
-8
-3
-2
5
8
b
alpha
-4
4
-1
0
7
-4
6
6
beta
-9
9
-6
8
5
-2
-9
-8
B
a
alpha
0
-9
1
-6
2
9
-7
-9
beta
-9
-5
-4
-3
-1
8
6
-5
b
alpha
0
1
-8
-8
-2
0
-6
-3
beta
2
5
9
-9
5
-6
3
1
df_ex. droplevel( [ 0 , 1 ] , axis= 0 )
Big
C
D
Small
c
d
c
d
Other
cat
dog
cat
dog
cat
dog
cat
dog
Extra
alpha
3
6
-9
-6
-6
-2
0
9
beta
-5
-3
3
-8
-3
-2
5
8
alpha
-4
4
-1
0
7
-4
6
6
beta
-9
9
-6
8
5
-2
-9
-8
alpha
0
-9
1
-6
2
9
-7
-9
beta
-9
-5
-4
-3
-1
8
6
-5
alpha
0
1
-8
-8
-2
0
-6
-3
beta
2
5
9
-9
5
-6
3
1
2.索引属性的修改
df_ex. rename_axis( index = {
'Uper' : 'Changed_row' } , columns = {
'Other' : 'Changed_Col' } )
. head( )
Big
C
D
Small
c
d
c
d
Changed_Col
cat
dog
cat
dog
cat
dog
cat
dog
Upper
Lower
Extra
A
a
alpha
3
6
-9
-6
-6
-2
0
9
beta
-5
-3
3
-8
-3
-2
5
8
b
alpha
-4
4
-1
0
7
-4
6
6
beta
-9
9
-6
8
5
-2
-9
-8
B
a
alpha
0
-9
1
-6
2
9
-7
-9
df_ex. rename( columns= {
'cat' : 'not_cat' } , level= 2 ) . head( )
Big
C
D
Small
c
d
c
d
Other
not_cat
dog
not_cat
dog
not_cat
dog
not_cat
dog
Upper
Lower
Extra
A
a
alpha
3
6
-9
-6
-6
-2
0
9
beta
-5
-3
3
-8
-3
-2
5
8
b
alpha
-4
4
-1
0
7
-4
6
6
beta
-9
9
-6
8
5
-2
-9
-8
B
a
alpha
0
-9
1
-6
2
9
-7
-9
df_ex. rename( index= lambda x: str . upper( x) , level= 2 ) . head( )
Big
C
D
Small
c
d
c
d
Other
cat
dog
cat
dog
cat
dog
cat
dog
Upper
Lower
Extra
A
a
ALPHA
3
6
-9
-6
-6
-2
0
9
BETA
-5
-3
3
-8
-3
-2
5
8
b
ALPHA
-4
4
-1
0
7
-4
6
6
BETA
-9
9
-6
8
5
-2
-9
-8
B
a
ALPHA
0
-9
1
-6
2
9
-7
-9
A = iter ( list ( 'abcdefgh' ) )
df_ex. rename( index= lambda x: next ( A) , level = 2 )
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-80-bc33686b6fdf> in <module>
----> 1 A = iter(list('abcdefgh'))
2 df_ex.rename(index=lambda x:next(A),level = 2)
TypeError: 'list' object is not callable
3. 索引的设置与重置
df_A = pd. DataFrame( {
"A" : list ( 'aacd' ) , "B" : list ( 'PQRT' ) , "C" : [ 1 , 2 , 3 , 4 ] } )
df_A
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-86-2fdf41795bad> in <module>
----> 1 df_A = pd.DataFrame({"A":list('aacd'),"B":list('PQRT'),"C":[1,2,3,4]})
2 df_A
TypeError: 'list' object is not callable
4. 索引的变形
df_reindex = pd. DataFrame( {
"Weight" : [ 60 , 70 , 80 ] , "Height" : [ 176 , 180 , 179 ] } , index= [ '1001' , '1003' , '1002' ] )
df_reindex
Weight
Height
1001
60
176
1003
70
180
1002
80
179
df_reindex. reindex( index= [ '1001' , '1002' , '1003' , '1004' ] , columns= [ 'Weight' , 'Gender' ] )
Weight
Gender
1001
60.0
NaN
1002
80.0
NaN
1003
70.0
NaN
1004
NaN
NaN
df_existed = pd. DataFrame( index = [ '1001' , '1002' , '1003' , '1004' ] , columns= [ 'Weight' , 'Gender' ] )
df_reindex. reindex_like( df_existed)
Weight
Gender
1001
60.0
NaN
1002
80.0
NaN
1003
70.0
NaN
1004
NaN
NaN
四、索引运算
2. 一般的索引运算
df_A = pd. DataFrame( [ [ 0 , 1 ] , [ 1 , 2 ] , [ 3 , 4 ] ] ,
index = pd. Index( [ 'a' , 'b' , 'a' ] , name= 'id1' ) )
df_B = pd. DataFrame( [ [ 4 , 5 ] , [ 2 , 6 ] , [ 7 , 1 ] ] , index = pd. Index( [ 'b' , 'b' , 'c' ] , name= 'id2' ) )
id1, id2 = df_A. index. unique( ) , df_B. index. unique( )
id1. intersection( id2)
Index(['b'], dtype='object')
id1. union( id2)
Index(['a', 'b', 'c'], dtype='object')
id1. difference( id2)
Index(['a'], dtype='object')
id1. symmetric_difference( id2)
Index(['a', 'c'], dtype='object')
df_a = df_A. reset_index( )
df_b = df_B. reset_index( )
df_a
id1
0
1
0
a
0
1
1
b
1
2
2
a
3
4
df_b
id2
0
1
0
b
4
5
1
b
2
6
2
c
7
1
df_a[ df_a. id1. isin( df_b. id2) ]
五、练习
Ex1:公司员工数据集
df = pd. read_csv( r'C:\Users\zhoukaiwei\Desktop\joyful-pandas\data/company.csv' )
df. head( )
EmployeeID
birthdate_key
age
city_name
department
job_title
gender
0
1318
1/3/1954
61
Vancouver
Executive
CEO
M
1
1319
1/3/1957
58
Vancouver
Executive
VP Stores
F
2
1320
1/2/1955
60
Vancouver
Executive
Legal Counsel
F
3
1321
1/2/1959
56
Vancouver
Executive
VP Human Resources
M
4
1322
1/9/1958
57
Vancouver
Executive
VP Finance
M
分别只使用 query 和 loc 选出年龄不超过四十岁且工作部门为 Dairy 或 Bakery 的男性。
df. query( '(age <= 40)&(department is in [' Diary',' Bakery'])&(gender==' M')' ) . head( )
File "<ipython-input-108-aa2a95f52718>", line 1
df.query('(age <= 40)&(department is in ['Diary','Bakery'])&(gender=='M')').head()
^
SyntaxError: invalid syntax
选出员工 ID 号 为奇数所在行的第1、第3和倒数第2列。
df. iloc[ ( df. EmployeeID% 2 == 1 ) . values, [ 0 , 2 , - 2 ] ] . head( )
EmployeeID
age
job_title
1
1319
58
VP Stores
3
1321
56
VP Human Resources
5
1323
53
Exec Assistant, VP Stores
6
1325
51
Exec Assistant, Legal Counsel
8
1329
48
Store Manager
按照以下步骤进行索引操作:
把后三列设为索引后交换内外两层
df. set_index( df. columns[ - 3 : ] . tolist( ) ) . swaplevel( 0 , 2 , axis= 0 ) . head( )
EmployeeID
birthdate_key
age
city_name
gender
job_title
department
M
CEO
Executive
1318
1/3/1954
61
Vancouver
F
VP Stores
Executive
1319
1/3/1957
58
Vancouver
Legal Counsel
Executive
1320
1/2/1955
60
Vancouver
M
VP Human Resources
Executive
1321
1/2/1959
56
Vancouver
VP Finance
Executive
1322
1/9/1958
57
Vancouver
df. reset_index( level= 1 )
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-116-bd39f5c7cad0> in <module>
----> 1 df.reset_index(level=1)
D:\Anaconda3\lib\site-packages\pandas\core\frame.py in reset_index(self, level, drop, inplace, col_level, col_fill)
4818 if not isinstance(level, (tuple, list)):
4819 level = [level]
-> 4820 level = [self.index._get_level_number(lev) for lev in level]
4821 if len(level) < self.index.nlevels:
4822 new_index = self.index.droplevel(level)
D:\Anaconda3\lib\site-packages\pandas\core\frame.py in <listcomp>(.0)
4818 if not isinstance(level, (tuple, list)):
4819 level = [level]
-> 4820 level = [self.index._get_level_number(lev) for lev in level]
4821 if len(level) < self.index.nlevels:
4822 new_index = self.index.droplevel(level)
D:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _get_level_number(self, level)
1413
1414 def _get_level_number(self, level) -> int:
-> 1415 self._validate_index_level(level)
1416 return 0
1417
D:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _validate_index_level(self, level)
1405 elif level > 0:
1406 raise IndexError(
-> 1407 f"Too many levels: Index has only 1 level, not {level + 1}"
1408 )
1409 elif level != self.name:
IndexError: Too many levels: Index has only 1 level, not 2
修改外层索引名为 Gender
df. rename_axis( index= {
'gender' : 'Gender' } ) . head( )
EmployeeID
birthdate_key
age
city_name
department
job_title
gender
0
1318
1/3/1954
61
Vancouver
Executive
CEO
M
1
1319
1/3/1957
58
Vancouver
Executive
VP Stores
F
2
1320
1/2/1955
60
Vancouver
Executive
Legal Counsel
F
3
1321
1/2/1959
56
Vancouver
Executive
VP Human Resources
M
4
1322
1/9/1958
57
Vancouver
Executive
VP Finance
M
用下划线合并两层行索引
后面题目继续再做,后面会更新到文档里面。
DataWhale:DataWhale