一、索引器

1.表的列索引

import numpy as np
import pandas as pd
df = pd.read_csv(r"C:\Users\zhoukaiwei\Desktop\joyful-pandas\data\learn_pandas.csv",
                 usecols = ['School', 'Grade', 'Name', 'Gender',
                              'Weight', 'Transfer'])
df['Name'].head()

0      Gaopeng Yang
1    Changqiang You
2           Mei Sun
3      Xiaojuan Sun
4       Gaojuan You
Name: Name, dtype: object

df[['Gender','Name']].head()#获得多个列

	Gender	Name
0	Female	Gaopeng Yang
1	Male	Changqiang You
2	Male	Mei Sun
3	Female	Xiaojuan Sun
4	Male	Gaojuan You

df.Name.head()#获得单列，且列名中不包空和上面df['Name']是一样的

0      Gaopeng Yang
1    Changqiang You
2           Mei Sun
3      Xiaojuan Sun
4       Gaojuan You
Name: Name, dtype: object

2.序列的行索引

以字符串为索引的 Series

A = pd.Series([1,2,3,4,5,6],index=['a','b','a','a','a','c'])
A['a']

a    1
a    3
a    4
dtype: int64

#如果取出多个索引的对应元素，则可以使用 [items的列表] ：
A[['c','b']]

c    6
b    5
dtype: int64

A = pd.Series([1,2,3,4,5,6],index=['a','b','a','a','a','c'])
A['c': 'b': -1]#获得两个索引之间的元素

c    6
a    5
a    4
a    3
b    2
dtype: int64

3. loc索引器

前面讲到了对 DataFrame 的列进行选取，下面要讨论其行的选取。对于表而言，有两种索引器，一种是基于元素的 loc 索引器，另一种是基于位置的
iloc 索引器。loc 索引器的一般形式是 loc[*, ] ，其中第一个 * 代表行的选择，第二个 * 代表列的选择，如果省略第二个位置写作 loc[] ，这个 *
是指行的筛选。其中， * 的位置一共有五类合法对象，分别是：单个元素、元素列表、元素切片、布尔列表以及函数，下面将依次说明。
为了演示相应操作，先利用 set_index 方法把 Name 列设为索引，关于该函数的其他用法将在多级索引一章介绍。

import numpy as np
import pandas as pd
df = pd.read_csv(r"C:\Users\zhoukaiwei\Desktop\joyful-pandas\data\learn_pandas.csv",
                 usecols = ['School', 'Grade', 'Name', 'Gender',
                              'Weight', 'Transfer'])
df_A =df.set_index('Name')
df_A.head()

	School	Grade	Gender	Weight	Transfer
Name
Gaopeng Yang	Shanghai Jiao Tong University	Freshman	Female	46.0	N
Changqiang You	Peking University	Freshman	Male	70.0	N
Mei Sun	Shanghai Jiao Tong University	Senior	Male	89.0	N
Xiaojuan Sun	Fudan University	Sophomore	Female	41.0	N
Gaojuan You	Fudan University	Sophomore	Male	74.0	N

为单个元素

df_A.loc['Qiang Sun']

	School	Grade	Gender	Weight	Transfer
Name
Qiang Sun	Tsinghua University	Junior	Female	53.0	N
Qiang Sun	Tsinghua University	Sophomore	Female	40.0	N
Qiang Sun	Shanghai Jiao Tong University	Junior	Female	NaN	N

df_A.loc['Quan Zhao']

School      Shanghai Jiao Tong University
Grade                              Junior
Gender                             Female
Weight                                 53
Transfer                                N
Name: Quan Zhao, dtype: object

df_A.loc['Qiang Sun','School']

Name
Qiang Sun              Tsinghua University
Qiang Sun              Tsinghua University
Qiang Sun    Shanghai Jiao Tong University
Name: School, dtype: object

为元素列表

#取出列表中所有元素值对应的行或列：
df_A.loc[['Qiang Sun','Quan Zhao'],['School','Gender']]

	School	Gender
Name
Qiang Sun	Tsinghua University	Female
Qiang Sun	Tsinghua University	Female
Qiang Sun	Shanghai Jiao Tong University	Female
Quan Zhao	Shanghai Jiao Tong University	Female

为切片

df_A.loc['Gaojuan You':'Gaoqiang Qian','School':'Gender']#使用切片

	School	Grade	Gender
Name
Gaojuan You	Fudan University	Sophomore	Male
Xiaoli Qian	Tsinghua University	Freshman	Female
Qiang Chu	Shanghai Jiao Tong University	Freshman	Female
Gaoqiang Qian	Tsinghua University	Junior	Female

为布尔列表

在实际的数据处理中，根据条件来筛选行是极其常见的，此处传入 loc 的布尔列表与 DataFrame
长度相同，且列表为 True 的位置所对应的行会被选中， False 则会被剔除。

df_A.loc[df_A.Weight>70].head()

	School	Grade	Gender	Weight	Transfer
Name
Mei Sun	Shanghai Jiao Tong University	Senior	Male	89.0	N
Gaojuan You	Fudan University	Sophomore	Male	74.0	N
Xiaopeng Zhou	Shanghai Jiao Tong University	Freshman	Male	74.0	N
Xiaofeng Sun	Tsinghua University	Senior	Male	71.0	N
Qiang Zheng	Shanghai Jiao Tong University	Senior	Male	87.0	N

df_A.loc[df_A.Grade.isin(['Freshman', 'Senior'])].head()#获得大一和大四的学生

	School	Grade	Gender	Weight	Transfer
Name
Gaopeng Yang	Shanghai Jiao Tong University	Freshman	Female	46.0	N
Changqiang You	Peking University	Freshman	Male	70.0	N
Mei Sun	Shanghai Jiao Tong University	Senior	Male	89.0	N
Xiaoli Qian	Tsinghua University	Freshman	Female	51.0	N
Qiang Chu	Shanghai Jiao Tong University	Freshman	Female	52.0	N

为函数

def condition(x):
        condition_1_1 = x.School == 'Fudan University'
        condition_1_2 = x.Grade == 'Senior'
        condition_1_3 = x.Weight > 70
        condition_1 = condition_1_1 & condition_1_2 & condition_1_3
        condition_2_1 = x.School == 'Peking University'
        condition_2_2 = x.Grade == 'Senior'
        condition_2_3 = x.Weight > 80
        condition_2 = condition_2_1 & (~condition_2_2) & condition_2_3
        result = condition_1 | condition_2
        return result
df_A.loc[condition]

	School	Grade	Gender	Weight	Transfer
Name
Qiang Han	Peking University	Freshman	Male	87.0	N
Chengpeng Zhou	Fudan University	Senior	Male	81.0	N
Changpeng Zhao	Peking University	Freshman	Male	83.0	N
Chengpeng Qian	Fudan University	Senior	Male	73.0	Y

#由于函数无法返回如 start: end: step 的切片形式，故返回切片时要用 slice 对象进行包装：
df_A.loc[lambda x: slice('Gaojuan You','Gaoqiang Qian')]

	School	Grade	Gender	Weight	Transfer
Name
Gaojuan You	Fudan University	Sophomore	Male	74.0	N
Xiaoli Qian	Tsinghua University	Freshman	Female	51.0	N
Qiang Chu	Shanghai Jiao Tong University	Freshman	Female	52.0	N
Gaoqiang Qian	Tsinghua University	Junior	Female	50.0	N

4. iloc索引器

iloc 的使用与 loc 完全类似，只不过是针对位置进行筛选，在相应的 * 位置处一共也有五类
合法对象，分别是：整数、整数列表、整数切片、布尔列表以及函数，函数的返回值必须是前面
的四类合法对象中的一个，其输入同样也为 DataFrame 本身。

df_A.iloc[1, 1] # 第二行第二列

'Freshman'

df_A.iloc[[0,1],[0,1]]

	School	Grade
Name
Gaopeng Yang	Shanghai Jiao Tong University	Freshman
Changqiang You	Peking University	Freshman

df_A.iloc[1: 4, 2:4] # 切片不包含结束端点

	Gender	Weight
Name
Changqiang You	Male	70.0
Mei Sun	Male	89.0
Xiaojuan Sun	Female	41.0

5. query方法

#在 pandas 中，支持把字符串形式的查询表达式传入 query 方法来查询数据，其表
#达式的执行结果必须返回布尔列表
df.query('((School == "Fudan University")&(Grade == "Senior")&(Weight > 70))')

	School	Grade	Name	Gender	Weight	Transfer
66	Fudan University	Senior	Chengpeng Zhou	Male	81.0	N
131	Fudan University	Senior	Chengpeng Qian	Male	73.0	Y

df.query('Grade == ["Junior", "Senior"]').head()

	School	Grade	Name	Gender	Weight	Transfer
2	Shanghai Jiao Tong University	Senior	Mei Sun	Male	89.0	N
7	Tsinghua University	Junior	Gaoqiang Qian	Female	50.0	N
9	Peking University	Junior	Juan Xu	Female	NaN	N
11	Tsinghua University	Junior	Xiaoquan Lv	Female	43.0	N
12	Shanghai Jiao Tong University	Senior	Peng You	Female	48.0	NaN

low, high =70, 80
df.query('Weight.between(@low, @high)').head()

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-73-7b9719bb38f0> in <module>
      1 low, high =70, 80
----> 2 df.query('Weight.between(@low, @high)').head()


D:\Anaconda3\lib\site-packages\pandas\core\frame.py in query(self, expr, inplace, **kwargs)
   3343         kwargs["level"] = kwargs.pop("level", 0) + 1
   3344         kwargs["target"] = None
-> 3345         res = self.eval(expr, **kwargs)
   3346 
   3347         try:


D:\Anaconda3\lib\site-packages\pandas\core\frame.py in eval(self, expr, inplace, **kwargs)
   3473         kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
   3474 
-> 3475         return _eval(expr, inplace=inplace, **kwargs)
   3476 
   3477     def select_dtypes(self, include=None, exclude=None) -> "DataFrame":


D:\Anaconda3\lib\site-packages\pandas\core\computation\eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
    344         eng = _engines[engine]
    345         eng_inst = eng(parsed_expr)
--> 346         ret = eng_inst.evaluate()
    347 
    348         if parsed_expr.assigner is None:


D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in evaluate(self)
     71 
     72         # make sure no names in resolvers and locals/globals clash
---> 73         res = self._evaluate()
     74         return reconstruct_object(
     75             self.result_type, res, self.aligned_axes, self.expr.terms.return_type


D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in _evaluate(self)
    111         env = self.expr.env
    112         scope = env.full_scope
--> 113         _check_ne_builtin_clash(self.expr)
    114         return ne.evaluate(s, local_dict=scope)
    115 


D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in _check_ne_builtin_clash(expr)
     27         Terms can contain
     28     """
---> 29     names = expr.names
     30     overlap = names & _ne_builtins
     31 


D:\Anaconda3\lib\site-packages\pandas\core\computation\expr.py in names(self)
    812         """
    813         if is_term(self.terms):
--> 814             return frozenset([self.terms.name])
    815         return frozenset(term.name for term in com.flatten(self.terms))
    816 


D:\Anaconda3\lib\site-packages\pandas\core\generic.py in __hash__(self)
   1667     def __hash__(self):
   1668         raise TypeError(
-> 1669             f"{repr(type(self).__name__)} objects are mutable, "
   1670             f"thus they cannot be hashed"
   1671         )


TypeError: 'Series' objects are mutable, thus they cannot be hashed

6. 随机抽样

df_A = pd.DataFrame({
    
    'id': list('abcde'),
                          'value': [10, 20, 30, 20, 20]})
df_A

	id	value
0	a	10
1	b	20
2	c	30
3	d	20
4	e	20

df_A.sample(3,replace = True,weights = df_A.value)

	id	value
3	d	20
2	c	30
4	e	20

二、多级索引

1. 多级索引及其表的结构

import numpy as np
import pandas as pd
np.random.seed(0)
multi_index = pd.MultiIndex.from_product([list('ABCD'),
               df.Gender.unique()], names=('School', 'Gender'))#重新建表
multi_column = pd.MultiIndex.from_product([['Height', 'Weight'],
                   df.Grade.unique()], names=('Indicator', 'Grade'))
df_multi = pd.DataFrame(np.c_[(np.random.randn(8,4)*5 + 163).tolist(),
                            (np.random.randn(8,4)*5 + 65).tolist()],
                            index = multi_index,
                            columns = multi_column).round(1)
df_multi

	Indicator	Height				Weight
	Grade	Freshman	Senior	Sophomore	Junior	Freshman	Senior	Sophomore	Junior
School	Gender
A	Female	171.8	165.0	167.9	174.2	60.6	55.1	63.3	65.8
A	Male	172.3	158.1	167.8	162.2	71.2	71.0	63.1	63.5
B	Female	162.5	165.1	163.7	170.3	59.8	57.9	56.5	74.8
B	Male	166.8	163.6	165.2	164.7	62.5	62.8	58.7	68.9
C	Female	170.5	162.0	164.6	158.7	56.9	63.9	60.5	66.9
C	Male	150.2	166.3	167.3	159.3	62.4	59.1	64.9	67.1
D	Female	174.3	155.7	163.2	162.1	65.3	66.5	61.8	63.2
D	Male	170.7	170.3	163.8	164.9	61.6	63.2	60.9	56.4

df_multi.index.names

FrozenList(['School', 'Gender'])

df_multi.columns.names

FrozenList(['Indicator', 'Grade'])

df_multi.index.get_level_values(0)#获得某一层的索引

Index(['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], dtype='object', name='School')

2. 多级索引中的loc索引器

熟悉了结构后，现在回到原表，将学校和年级设为索引，此时的行为多级索引，列为单级索引，
由于默认状态的列索引不含名字，因此对应于刚刚图中 Indicator 和 Grade 的索引名位置是空缺的。

df_multi = df.set_index(['School','Grade'])
df_multi.head()

		Name	Gender	Weight	Transfer
School	Grade
Shanghai Jiao Tong University	Freshman	Gaopeng Yang	Female	46.0	N
Peking University	Freshman	Changqiang You	Male	70.0	N
Shanghai Jiao Tong University	Senior	Mei Sun	Male	89.0	N
Fudan University	Sophomore	Xiaojuan Sun	Female	41.0	N
Fudan University	Sophomore	Gaojuan You	Male	74.0	N

由于多级索引中的单个元素以元组为单位，因此之前在第一节介绍的 loc 和 iloc 方法完全可以照搬
，只需把标量的位置替换成对应的元组，不过在索引前最好对 MultiIndex 进行排序以避免性能警告：

df_multi = df_multi.sort_index()
df_multi.loc[('Fudan University','Junior')].head()

		Name	Gender	Weight	Transfer
School	Grade
Fudan University	Junior	Yanli You	Female	48.0	N
	Junior	Chunqiang Chu	Male	72.0	N
	Junior	Changfeng Lv	Male	76.0	N
	Junior	Yanjuan Lv	Female	49.0	NaN
	Junior	Gaoqiang Zhou	Female	43.0	N

df_multi.loc[[('Fudan University', 'Senior'),
                  ('Shanghai Jiao Tong University', 'Freshman')]].head()

		Name	Gender	Weight	Transfer
School	Grade
Fudan University	Senior	Chengpeng Zheng	Female	38.0	N
	Senior	Feng Zhou	Female	47.0	N
	Senior	Gaomei Lv	Female	34.0	N
	Senior	Chunli Lv	Female	56.0	N
	Senior	Chengpeng Zhou	Male	81.0	N

3. IndexSlice对象

前面介绍的方法，即使在索引不重复的时候，也只能对元组整体进行切片，而不能对每层进行切片，
也不允许将切片和布尔列表混合使用，引入 IndexSlice 对象就能解决这个问题。 Slice 对象一共
有两种形式，第一种为 loc[idx[,]] 型，第二种为 loc[idx[,],idx[,]] 型，
下面将进行介绍。为了方便演示，下面构造一个索引不重复的 DataFrame ：

np.random.seed(0)
L1,L2 = ['A','B','C'],['a','b','c']
mul_index1 = pd.MultiIndex.from_product([L1,L2],names=('Upper', 'Lower'))
L3,L4 = ['D','E','F'],['d','e','f']
mul_index2 = pd.MultiIndex.from_product([L3,L4],names=('Big', 'Small'))
df_ex = pd.DataFrame(np.random.randint(-9,10,(9,9)),index=mul_index1,columns=mul_index2)
df_ex

	Big	D			E			F
	Small	d	e	f	d	e	f	d	e	f
Upper	Lower
A	a	3	6	-9	-6	-6	-2	0	9	-5
	b	-3	3	-8	-3	-2	5	8	-4	4
	c	-1	0	7	-4	6	6	-9	9	-6
B	a	8	5	-2	-9	-8	0	-9	1	-6
	b	2	9	-7	-9	-9	-5	-4	-3	-1
	c	8	6	-5	0	1	-8	-8	-2	0
C	a	-6	-3	2	5	9	-9	5	-6	3
	b	1	2	-5	-3	-5	6	-6	3	-5
	c	-1	5	6	-6	6	4	7	8	-4

index = pd.IndexSlice#定义slice对象
df_ex.loc[index['C':,('D','e'):]]#loc[idx[*,*]] 型

	Big	D		E			F
	Small	e	f	d	e	f	d	e	f
Upper	Lower
C	a	-3	2	5	9	-9	5	-6	3
	b	2	-5	-3	-5	6	-6	3	-5
	c	5	6	-6	6	4	7	8	-4

df_ex.loc[index[:'A', lambda x:x.sum()>0]] # 列和大于0

	Big	D		F
	Small	d	e	e
Upper	Lower
A	a	3	6	9
	b	-3	3	-4
	c	-1	0	9

df_ex.loc[index[:'A', 'b':], index['E':, 'e':]]#loc[idx[*,*],idx[*,*]] 型前一个 idx 指代的是行索引，后一个是列索引。

	Big	E		F
	Small	e	f	e	f
Upper	Lower
A	b	-2	5	-4	4
A	c	6	6	9	-6

4. 多级索引的构造

前面提到了多级索引表的结构和切片，那么除了使用 set_index 之外，
如何自己构造多级索引呢？常用的有 from_tuples, from_arrays, from_product 三种方法，
它们都是 pd.MultiIndex 对象下的函数。

#from_tuples 指根据传入由元组组成的列表进行构造：
my_A = [('a','cat'),('a','dog'),('b','cat'),('b','dog')]
pd.MultiIndex.from_tuples(my_A,names = ['First','Second'])

MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])

#from_arrays 根据传入列表中，对应层的列表进行构造：
my_array = [list('aabb'), ['cat', 'dog']*2]
pd.MultiIndex.from_arrays(my_array, names=['First','Second'])

MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])

#from_product 指根据给定多个列表的笛卡尔积进行构造：
list1 = ['a','b']
list2 = ['cat','dog']
pd.MultiIndex.from_product([list1,list2],names = ['First','Second'])

MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])

三、索引的常用方法

1. 索引层的交换和删除

np.random.seed(0)
L1,L2,L3 = ['A','B'],['a','b'],['alpha','beta']
m_index1 = pd.MultiIndex.from_product([L1,L2,L3],names = ('Upper','Lower','Extra'))
L4,L5,L6 = ['C','D'],['c','d'],['cat','dog']
m_index2 = pd.MultiIndex.from_product([L4,L5,L6],names=('Big', 'Small', 'Other'))
df_ex = pd.DataFrame(np.random.randint(-9,10,(8,8)),index=m_index1,columns=m_index2)
df_ex

		Big	C				D
		Small	c		d		c		d
		Other	cat	dog	cat	dog	cat	dog	cat	dog
Upper	Lower	Extra
A	a	alpha	3	6	-9	-6	-6	-2	0	9
	a	beta	-5	-3	3	-8	-3	-2	5	8
	b	alpha	-4	4	-1	0	7	-4	6	6
	b	beta	-9	9	-6	8	5	-2	-9	-8
B	a	alpha	0	-9	1	-6	2	9	-7	-9
	a	beta	-9	-5	-4	-3	-1	8	6	-5
	b	alpha	0	1	-8	-8	-2	0	-6	-3
	b	beta	2	5	9	-9	5	-6	3	1

df_ex.swaplevel(0,2,axis=1).head() # 列索引的第一层和第三层交换

		Other	cat	dog	cat	dog	cat	dog	cat	dog
		Small	c	c	d	d	c	c	d	d
		Big	C	C	C	C	D	D	D	D
Upper	Lower	Extra
A	a	alpha	3	6	-9	-6	-6	-2	0	9
	a	beta	-5	-3	3	-8	-3	-2	5	8
	b	alpha	-4	4	-1	0	7	-4	6	6
	b	beta	-9	9	-6	8	5	-2	-9	-8
B	a	alpha	0	-9	1	-6	2	9	-7	-9

 df_ex.reorder_levels([2,0,1],axis=0).head() # 列表数字指代原来索引中的层

		Big	C				D
		Small	c		d		c		d
		Other	cat	dog	cat	dog	cat	dog	cat	dog
Extra	Upper	Lower
alpha	A	a	3	6	-9	-6	-6	-2	0	9
beta	A	a	-5	-3	3	-8	-3	-2	5	8
alpha	A	b	-4	4	-1	0	7	-4	6	6
beta	A	b	-9	9	-6	8	5	-2	-9	-8
alpha	B	a	0	-9	1	-6	2	9	-7	-9

df_ex.droplevel(1,axis=1)#删除某一层的索引

		Big	C				D
		Other	cat	dog	cat	dog	cat	dog	cat	dog
Upper	Lower	Extra
A	a	alpha	3	6	-9	-6	-6	-2	0	9
	a	beta	-5	-3	3	-8	-3	-2	5	8
	b	alpha	-4	4	-1	0	7	-4	6	6
	b	beta	-9	9	-6	8	5	-2	-9	-8
B	a	alpha	0	-9	1	-6	2	9	-7	-9
	a	beta	-9	-5	-4	-3	-1	8	6	-5
	b	alpha	0	1	-8	-8	-2	0	-6	-3
	b	beta	2	5	9	-9	5	-6	3	1

df_ex.droplevel([0,1],axis=0)#删除列索引

Big	C				D
Small	c		d		c		d
Other	cat	dog	cat	dog	cat	dog	cat	dog
Extra
alpha	3	6	-9	-6	-6	-2	0	9
beta	-5	-3	3	-8	-3	-2	5	8
alpha	-4	4	-1	0	7	-4	6	6
beta	-9	9	-6	8	5	-2	-9	-8
alpha	0	-9	1	-6	2	9	-7	-9
beta	-9	-5	-4	-3	-1	8	6	-5
alpha	0	1	-8	-8	-2	0	-6	-3
beta	2	5	9	-9	5	-6	3	1

2.索引属性的修改

df_ex.rename_axis(index = {
    
    'Uper':'Changed_row'},columns = {
    
    'Other':'Changed_Col'})
.head()#修改索引层名字rename_axis

		Big	C				D
		Small	c		d		c		d
		Changed_Col	cat	dog	cat	dog	cat	dog	cat	dog
Upper	Lower	Extra
A	a	alpha	3	6	-9	-6	-6	-2	0	9
	a	beta	-5	-3	3	-8	-3	-2	5	8
	b	alpha	-4	4	-1	0	7	-4	6	6
	b	beta	-9	9	-6	8	5	-2	-9	-8
B	a	alpha	0	-9	1	-6	2	9	-7	-9

df_ex.rename(columns={
    
    'cat':'not_cat'},level=2).head()#修改索引值rename

		Big	C				D
		Small	c		d		c		d
		Other	not_cat	dog	not_cat	dog	not_cat	dog	not_cat	dog
Upper	Lower	Extra
A	a	alpha	3	6	-9	-6	-6	-2	0	9
	a	beta	-5	-3	3	-8	-3	-2	5	8
	b	alpha	-4	4	-1	0	7	-4	6	6
	b	beta	-9	9	-6	8	5	-2	-9	-8
B	a	alpha	0	-9	1	-6	2	9	-7	-9

df_ex.rename(index=lambda x:str.upper(x),level=2).head()#通过函数修改索引值

		Big	C				D
		Small	c		d		c		d
		Other	cat	dog	cat	dog	cat	dog	cat	dog
Upper	Lower	Extra
A	a	ALPHA	3	6	-9	-6	-6	-2	0	9
	a	BETA	-5	-3	3	-8	-3	-2	5	8
	b	ALPHA	-4	4	-1	0	7	-4	6	6
	b	BETA	-9	9	-6	8	5	-2	-9	-8
B	a	ALPHA	0	-9	1	-6	2	9	-7	-9

A = iter(list('abcdefgh'))
df_ex.rename(index=lambda x:next(A),level = 2)

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-80-bc33686b6fdf> in <module>
----> 1 A = iter(list('abcdefgh'))
      2 df_ex.rename(index=lambda x:next(A),level = 2)


TypeError: 'list' object is not callable

3. 索引的设置与重置

df_A = pd.DataFrame({
    
    "A":list('aacd'),"B":list('PQRT'),"C":[1,2,3,4]})
df_A

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-86-2fdf41795bad> in <module>
----> 1 df_A = pd.DataFrame({"A":list('aacd'),"B":list('PQRT'),"C":[1,2,3,4]})
      2 df_A


TypeError: 'list' object is not callable

4. 索引的变形

df_reindex = pd.DataFrame({
    
    "Weight":[60,70,80], "Height":[176,180,179]}, index=['1001','1003','1002'])
df_reindex

	Weight	Height
1001	60	176
1003	70	180
1002	80	179

df_reindex.reindex(index=['1001','1002','1003','1004'],columns=['Weight','Gender'])
#增加一行的同时，去掉一列并新增一列

	Weight	Gender
1001	60.0	NaN
1002	80.0	NaN
1003	70.0	NaN
1004	NaN	NaN

df_existed = pd.DataFrame(index = ['1001','1002','1003','1004'],columns=['Weight','Gender'])
df_reindex.reindex_like(df_existed)
# reindex_like与reindex相似 ，其功能是仿照传入的表的索引来进行被调用表索引的变形。

	Weight	Gender
1001	60.0	NaN
1002	80.0	NaN
1003	70.0	NaN
1004	NaN	NaN

四、索引运算

2. 一般的索引运算

df_A = pd.DataFrame([[0,1],[1,2],[3,4]],
                        index = pd.Index(['a','b','a'],name='id1'))
df_B = pd.DataFrame([[4,5],[2,6],[7,1]],index = pd.Index(['b','b','c'],name='id2'))
id1,id2 = df_A.index.unique(),df_B.index.unique()
id1.intersection(id2)

Index(['b'], dtype='object')

id1.union(id2)

Index(['a', 'b', 'c'], dtype='object')

id1.difference(id2)

Index(['a'], dtype='object')

id1.symmetric_difference(id2)

Index(['a', 'c'], dtype='object')

#若两张表需要做集合运算的列并没有被设置索引，一种办法是先转成索引，运算后再恢复，
#另一种方法是利用 isin 函数
df_a = df_A.reset_index()
df_b = df_B.reset_index()
df_a

	id1	0	1
0	a	0	1
1	b	1	2
2	a	3	4

df_b

	id2	0	1
0	b	4	5
1	b	2	6
2	c	7	1

df_a[df_a.id1.isin(df_b.id2)]

	id1	0	1
1	b	1	2

五、练习

Ex1：公司员工数据集

df = pd.read_csv(r'C:\Users\zhoukaiwei\Desktop\joyful-pandas\data/company.csv')
df.head()

	EmployeeID	birthdate_key	age	city_name	department	job_title	gender
0	1318	1/3/1954	61	Vancouver	Executive	CEO	M
1	1319	1/3/1957	58	Vancouver	Executive	VP Stores	F
2	1320	1/2/1955	60	Vancouver	Executive	Legal Counsel	F
3	1321	1/2/1959	56	Vancouver	Executive	VP Human Resources	M
4	1322	1/9/1958	57	Vancouver	Executive	VP Finance	M

分别只使用 query 和 loc 选出年龄不超过四十岁且工作部门为 Dairy 或 Bakery 的男性。

df.query('(age <= 40)&(department is in ['Diary','Bakery'])&(gender=='M')').head()

  File "<ipython-input-108-aa2a95f52718>", line 1
    df.query('(age <= 40)&(department is in ['Diary','Bakery'])&(gender=='M')').head()
                                                  ^
SyntaxError: invalid syntax

选出员工 ID 号为奇数所在行的第1、第3和倒数第2列。

df.iloc[(df.EmployeeID%2==1).values,[0,2,-2]].head()

	EmployeeID	age	job_title
1	1319	58	VP Stores
3	1321	56	VP Human Resources
5	1323	53	Exec Assistant, VP Stores
6	1325	51	Exec Assistant, Legal Counsel
8	1329	48	Store Manager

按照以下步骤进行索引操作：

把后三列设为索引后交换内外两层

df.set_index(df.columns[-3:].tolist()).swaplevel(0,2,axis=0).head()

			EmployeeID	birthdate_key	age	city_name
gender	job_title	department
M	CEO	Executive	1318	1/3/1954	61	Vancouver
F	VP Stores	Executive	1319	1/3/1957	58	Vancouver
F	Legal Counsel	Executive	1320	1/2/1955	60	Vancouver
M	VP Human Resources	Executive	1321	1/2/1959	56	Vancouver
M	VP Finance	Executive	1322	1/9/1958	57	Vancouver

df.reset_index(level=1)#恢复中间一层

---------------------------------------------------------------------------

IndexError                                Traceback (most recent call last)

<ipython-input-116-bd39f5c7cad0> in <module>
----> 1 df.reset_index(level=1)


D:\Anaconda3\lib\site-packages\pandas\core\frame.py in reset_index(self, level, drop, inplace, col_level, col_fill)
   4818             if not isinstance(level, (tuple, list)):
   4819                 level = [level]
-> 4820             level = [self.index._get_level_number(lev) for lev in level]
   4821             if len(level) < self.index.nlevels:
   4822                 new_index = self.index.droplevel(level)


D:\Anaconda3\lib\site-packages\pandas\core\frame.py in <listcomp>(.0)
   4818             if not isinstance(level, (tuple, list)):
   4819                 level = [level]
-> 4820             level = [self.index._get_level_number(lev) for lev in level]
   4821             if len(level) < self.index.nlevels:
   4822                 new_index = self.index.droplevel(level)


D:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _get_level_number(self, level)
   1413 
   1414     def _get_level_number(self, level) -> int:
-> 1415         self._validate_index_level(level)
   1416         return 0
   1417 


D:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _validate_index_level(self, level)
   1405             elif level > 0:
   1406                 raise IndexError(
-> 1407                     f"Too many levels: Index has only 1 level, not {level + 1}"
   1408                 )
   1409         elif level != self.name:


IndexError: Too many levels: Index has only 1 level, not 2

修改外层索引名为 Gender

df.rename_axis(index={
    
    'gender':'Gender'}).head()

	EmployeeID	birthdate_key	age	city_name	department	job_title	gender
0	1318	1/3/1954	61	Vancouver	Executive	CEO	M
1	1319	1/3/1957	58	Vancouver	Executive	VP Stores	F
2	1320	1/2/1955	60	Vancouver	Executive	Legal Counsel	F
3	1321	1/2/1959	56	Vancouver	Executive	VP Human Resources	M
4	1322	1/9/1958	57	Vancouver	Executive	VP Finance	M

用下划线合并两层行索引

后面题目继续再做，后面会更新到文档里面。

DataWhale：DataWhale

pandas学习task03索引