一.创建DataFrame
import pandas as pd
# One record per country; the dict keys become the column labels of the frame.
country_records = [
    {
        'Name': '中国',
        'Language': 'Chinese',
        'Area': '9.597M km2',
        'Happiness Rank': 79,
    },
    {
        'Name': '美国',
        'Language': 'English (US)',
        'Area': '9.834M km2',
        'Happiness Rank': 14,
    },
    {
        'Name': '澳大利亚',
        'Language': 'English (AU)',
        'Area': '7.692M km2',
        'Happiness Rank': 9,
    },
]
country1, country2, country3 = (pd.Series(record) for record in country_records)

# Stack the three Series as rows and label each row with a country code.
row_labels = ['CH', 'US', 'AU']
df = pd.DataFrame([country1, country2, country3], index=row_labels)
print(df)
Area Happiness Rank Language Name
CH 9.597M km2 79 Chinese 中国
US 9.834M km2 14 English (US) 美国
AU 7.692M km2 9 English (AU) 澳大利亚
# Adding columns.
# A single scalar is broadcast to every row of the new column.
# A list-like value must supply one item per row; a list whose length
# differs from the number of rows is rejected with an error.
df['Location'] = '地球'
print(df)

regions = ['亚洲', '北美洲', '大洋洲']
df['Region'] = regions
print(df)
Name Language Area Happiness Rank Location
CH 中国 Chinese 9.597M km2 79 地球
US 美国 English (US) 9.834M km2 14 地球
AU 澳大利亚 English (AU) 7.692M km2 9 地球
Name Language Area Happiness Rank Location Region
CH 中国 Chinese 9.597M km2 79 地球 亚洲
US 美国 English (US) 9.834M km2 14 地球 北美洲
AU 澳大利亚 English (AU) 7.692M km2 9 地球 大洋洲
二.DataFrame索引
# Row indexing: .loc selects a row by its label, .iloc by its integer position.
# In both cases a single row is returned as a pandas Series.
print('loc:')
print(df.loc['CH'])
print(type(df.loc['CH']))
print('iloc:')
print(df.iloc[1])
print(type(df.iloc[1]))
loc:
Area 9.597M km2
Happiness Rank 79
Language Chinese
Name 中国
Location 地球
Region 亚洲
Name: CH, dtype: object
<class 'pandas.core.series.Series'>
iloc:
Area 9.834M km2
Happiness Rank 14
Language English (US)
Name 美国
Location 地球
Region 北美洲
Name: US, dtype: object
<class 'pandas.core.series.Series'>
# Column indexing: a single column label yields a Series.
area_column = df['Area']
print(area_column)
print(type(area_column))
CH 9.597M km2
US 9.834M km2
AU 7.692M km2
Name: Area, dtype: object
<class 'pandas.core.series.Series'>
# Select several (not necessarily adjacent) columns by passing a list of labels.
wanted_columns = ['Name', 'Area']
print(df[wanted_columns])
Name Area
CH 中国 9.597M km2
US 美国 9.834M km2
AU 澳大利亚 7.692M km2
# Mixed indexing: the same cell reached in two different orders.
# Pay attention to how the spelling differs between the variants.
print('先取出列,再取行:')
area_column = df['Area']
print(area_column['CH'])
print(area_column.loc['CH'])
print(area_column.iloc[0])

print('先取出行,再取列:')
china_by_label = df.loc['CH']
print(china_by_label['Area'])
china_by_position = df.iloc[0]
print(china_by_position['Area'])
先取出列,再取行:
9.597M km2
9.597M km2
9.597M km2
先取出行,再取列:
9.597M km2
9.597M km2
# Swap rows and columns (transpose).
transposed = df.transpose()
print(transposed)
CH US AU
Area 9.597M km2 9.834M km2 7.692M km2
Happiness Rank 79 14 9
Language Chinese English (US) English (AU)
Name 中国 美国 澳大利亚
Location 地球 地球 地球
Region 亚洲 北美洲 大洋洲
三.删除操作
# drop() returns a modified copy; the row labelled 'CH' is missing only there.
without_ch = df.drop(['CH'], axis=0)
print(without_ch)
# The original frame is left untouched and still contains 'CH'.
print(df)
Area Happiness Rank Language Name Location Region
US 9.834M km2 14 English (US) 美国 地球 北美洲
AU 7.692M km2 9 English (AU) 澳大利亚 地球 大洋洲
Area Happiness Rank Language Name Location Region
CH 9.597M km2 79 Chinese 中国 地球 亚洲
US 9.834M km2 14 English (US) 美国 地球 北美洲
AU 7.692M km2 9 English (AU) 澳大利亚 地球 大洋洲
# With inplace=True the row is removed from df itself and no copy is
# returned: drop() hands back None, which is what the first print shows.
drop_result = df.drop(['CH'], inplace=True)
print(drop_result)
print(df)
None
Area Happiness Rank Language Name Location Region
US 9.834M km2 14 English (US) 美国 地球 北美洲
AU 7.692M km2 9 English (AU) 澳大利亚 地球 大洋洲
# To drop a column instead of a row, select the column axis (axis=1).
# As before, the result is a copy and df keeps its 'Area' column.
without_area = df.drop(['Area'], axis='columns')
print(without_area)
print(df)
Happiness Rank Language Name Location Region
US 14 English (US) 美国 地球 北美洲
AU 9 English (AU) 澳大利亚 地球 大洋洲
Area Happiness Rank Language Name Location Region
US 9.834M km2 14 English (US) 美国 地球 北美洲
AU 7.692M km2 9 English (AU) 澳大利亚 地球 大洋洲
# A column can also be removed directly with the `del` keyword.
# Unlike drop() without inplace, this modifies df itself.
del df['Name']
print(df)
Area Happiness Rank Language Location Region
US 9.834M km2 14 English (US) 地球 北美洲
AU 7.692M km2 9 English (AU) 地球 大洋洲
四.DataFrame的操作与加载
# Show the 'Happiness Rank' column before it is modified below.
happiness_rank = df['Happiness Rank']
print(happiness_rank)
US 14
AU 9
Name: Happiness Rank, dtype: int64
# Caution: a column taken out of a DataFrame can share its data with the
# frame, so the in-place `+=` below also shows up in df (the listing that
# follows reports 16 / 11 in df as well).
# NOTE(review): this aliasing depends on the pandas version - with
# Copy-on-Write enabled (the default from pandas 3.0) df would stay
# unchanged; confirm against the installed version.
ranks = df['Happiness Rank']
ranks += 2
print(ranks)
print(df)
US 16
AU 11
Name: Happiness Rank, dtype: int64
Area Happiness Rank Language Location Region
US 9.834M km2 16 English (US) 地球 北美洲
AU 7.692M km2 11 English (AU) 地球 大洋洲
# Modifying data taken out of a DataFrame may write through to the frame.
# The safe approach is to work on an explicit copy: only `ranks` changes
# here, df keeps its previous values.
ranks = df['Happiness Rank'].copy(deep=True)
ranks += 2
print(ranks)
print(df)
US 18
AU 13
Name: Happiness Rank, dtype: int64
Area Happiness Rank Language Location Region
US 9.834M km2 16 English (US) 地球 北美洲
AU 7.692M km2 11 English (AU) 地球 大洋洲
# Load the 2015 report from a CSV file located next to the script.
csv_2015_path = './2015.csv'
reprot_2015_df = pd.read_csv(csv_2015_path)

# Preview the first rows.
print('2015年数据预览:')
preview_2015 = reprot_2015_df.head()
print(preview_2015)
2015年数据预览:
Country Region Happiness Rank Happiness Score \
0 Switzerland Western Europe 1 7.587
1 Iceland Western Europe 2 7.561
2 Denmark Western Europe 3 7.527
3 Norway Western Europe 4 7.522
4 Canada North America 5 7.427
Standard Error Economy (GDP per Capita) Family \
0 0.03411 1.39651 1.34951
1 0.04884 1.30232 1.40223
2 0.03328 1.32548 1.36058
3 0.03880 1.45900 1.33095
4 0.03553 1.32629 1.32261
Health (Life Expectancy) Freedom Trust (Government Corruption) \
0 0.94143 0.66557 0.41978
1 0.94784 0.62877 0.14145
2 0.87464 0.64938 0.48357
3 0.88521 0.66973 0.36503
4 0.90563 0.63297 0.32957
Generosity Dystopia Residual
0 0.29678 2.51738
1 0.43630 2.70201
2 0.34139 2.49204
3 0.34699 2.46531
4 0.45811 2.45176
# info() writes its summary to stdout by itself and returns None, so
# printing the return value adds the trailing "None" seen in the listing.
info_result = reprot_2015_df.info()
print(info_result)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
Country 158 non-null object
Region 158 non-null object
Happiness Rank 158 non-null int64
Happiness Score 158 non-null float64
Standard Error 158 non-null float64
Economy (GDP per Capita) 158 non-null float64
Family 158 non-null float64
Health (Life Expectancy) 158 non-null float64
Freedom 158 non-null float64
Trust (Government Corruption) 158 non-null float64
Generosity 158 non-null float64
Dystopia Residual 158 non-null float64
dtypes: float64(9), int64(1), object(2)
memory usage: 14.9+ KB
None
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
summary_stats = reprot_2015_df.describe()
print(summary_stats)
Happiness Rank Happiness Score Standard Error \
count 158.000000 158.000000 158.000000
mean 79.493671 5.375734 0.047885
std 45.754363 1.145010 0.017146
min 1.000000 2.839000 0.018480
25% 40.250000 4.526000 0.037268
50% 79.500000 5.232500 0.043940
75% 118.750000 6.243750 0.052300
max 158.000000 7.587000 0.136930
Economy (GDP per Capita) Family Health (Life Expectancy) \
count 158.000000 158.000000 158.000000
mean 0.846137 0.991046 0.630259
std 0.403121 0.272369 0.247078
min 0.000000 0.000000 0.000000
25% 0.545808 0.856823 0.439185
50% 0.910245 1.029510 0.696705
75% 1.158448 1.214405 0.811013
max 1.690420 1.402230 1.025250
Freedom Trust (Government Corruption) Generosity \
count 158.000000 158.000000 158.000000
mean 0.428615 0.143422 0.237296
std 0.150693 0.120034 0.126685
min 0.000000 0.000000 0.000000
25% 0.328330 0.061675 0.150553
50% 0.435515 0.107220 0.216130
75% 0.549092 0.180255 0.309883
max 0.669730 0.551910 0.795880
Dystopia Residual
count 158.000000
mean 2.098977
std 0.553550
min 0.328580
25% 1.759410
50% 2.095415
75% 2.462415
max 3.602140
# Show the last rows of the 2015 report.
last_rows = reprot_2015_df.tail()
print(last_rows)
Country Region Happiness Rank \
153 Rwanda Sub-Saharan Africa 154
154 Benin Sub-Saharan Africa 155
155 Syria Middle East and Northern Africa 156
156 Burundi Sub-Saharan Africa 157
157 Togo Sub-Saharan Africa 158
Happiness Score Standard Error Economy (GDP per Capita) Family \
153 3.465 0.03464 0.22208 0.77370
154 3.340 0.03656 0.28665 0.35386
155 3.006 0.05015 0.66320 0.47489
156 2.905 0.08658 0.01530 0.41587
157 2.839 0.06727 0.20868 0.13995
Health (Life Expectancy) Freedom Trust (Government Corruption) \
153 0.42864 0.59201 0.55191
154 0.31910 0.48450 0.08010
155 0.72193 0.15684 0.18906
156 0.22396 0.11850 0.10062
157 0.28443 0.36453 0.10731
Generosity Dystopia Residual
153 0.22628 0.67042
154 0.18260 1.63328
155 0.47179 0.32858
156 0.19727 1.83302
157 0.16681 1.56726
五.在read_csv操作下的索引
import pandas as pd
# usecols restricts which columns are read from the file;
# index_col picks the column that becomes the row index.
columns_to_read = ['Country', 'Happiness Rank', 'Happiness Score', 'Region']
reprot_2016_df = pd.read_csv(
    'F:/2016.csv',
    usecols=columns_to_read,
    index_col='Country',
)
preview_2016 = reprot_2016_df.head()
print(preview_2016)
Region Happiness Rank Happiness Score
Country
Denmark Western Europe 1 7.526
Switzerland Western Europe 2 7.509
Iceland Western Europe 3 7.501
Norway Western Europe 4 7.498
Finland Western Europe 5 7.413
# Inspect the column labels and the row labels of the 2016 frame.
column_labels = reprot_2016_df.columns
row_labels_2016 = reprot_2016_df.index
print('列名(column):', column_labels)
print('行名(index):', row_labels_2016)
列名(column): Index(['Region', 'Happiness Rank', 'Happiness Score'], dtype='object')
行名(index): Index(['Denmark', 'Switzerland', 'Iceland', 'Norway', 'Finland', 'Canada',
'Netherlands', 'New Zealand', 'Australia', 'Sweden',
...
'Madagascar', 'Tanzania', 'Liberia', 'Guinea', 'Rwanda', 'Benin',
'Afghanistan', 'Togo', 'Syria', 'Burundi'],
dtype='object', name='Country', length=157)
# Note that an Index is immutable: item assignment is rejected.
# The statement below is expected to fail with
# "TypeError: Index does not support mutable operations".
reprot_2016_df.index[0] = '丹麦'
TypeError Traceback (most recent call last)
<ipython-input-8-c2f1f4f940d2> in <module>()
1 # 注意index是不可变的
----> 2 reprot_2016_df.index[0] = '丹麦'
F:\python\ANACONDA\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
2048
2049 def __setitem__(self, key, value):
-> 2050 raise TypeError("Index does not support mutable operations")
2051
2052 def __getitem__(self, key):
TypeError: Index does not support mutable operations
# Reset the index: 'Country' becomes an ordinary column again and the rows
# get a default integer index.
# Mind the difference inplace makes: with inplace=True the frame itself is
# modified and the call returns None, so it must not be wrapped in print();
# without inplace a new DataFrame is returned and the original is unchanged.
reprot_2016_df.reset_index(inplace=True)
print(reprot_2016_df.head())
Country Region Happiness Rank Happiness Score
0 Denmark Western Europe 1 7.526
1 Switzerland Western Europe 2 7.509
2 Iceland Western Europe 3 7.501
3 Norway Western Europe 4 7.498
4 Finland Western Europe 5 7.413
# Rename columns. rename() returns a new DataFrame, hence the reassignment.
# The keys must match the existing labels exactly: a key that matches no
# column is silently ignored, so a misspelled label leaves that column as is.
reprot_2016_df = reprot_2016_df.rename(
    columns={
        'Region': '地区',
        'Happiness Rank': '排名',
        'Happiness Score': '幸福指数',
    }
)
print(reprot_2016_df.head())
Country 地区 排名 幸福指数
0 Denmark Western Europe 1 7.526
1 Switzerland Western Europe 2 7.509
2 Iceland Western Europe 3 7.501
3 Norway Western Europe 4 7.498
4 Finland Western Europe 5 7.413
# Rename columns, this time in place: with inplace=True the frame itself is
# changed and no reassignment is needed.
chinese_column_names = {
    'Region': '地区',
    'Happiness Rank': '排名',
    'Happiness Score': '幸福指数',
}
reprot_2016_df.rename(columns=chinese_column_names, inplace=True)
print(reprot_2016_df.head())
Country 地区 排名 幸福指数
0 Denmark Western Europe 1 7.526
1 Switzerland Western Europe 2 7.509
2 Iceland Western Europe 3 7.501
3 Norway Western Europe 4 7.498
4 Finland Western Europe 5 7.413
六.Boolean Mask
# Starting point for the boolean-mask examples: the first rows of the 2016 data.
first_rows_2016 = reprot_2016_df.head()
print(first_rows_2016)
Country 地区 排名 幸福指数
0 Denmark Western Europe 1 7.526
1 Switzerland Western Europe 2 7.509
2 Iceland Western Europe 3 7.501
3 Norway Western Europe 4 7.498
4 Finland Western Europe 5 7.413
# Keep only the countries of the Western Europe region.
# The comparison yields one boolean per row; indexing with it keeps the True rows.
is_western_europe = reprot_2016_df['地区'] == 'Western Europe'
print(reprot_2016_df[is_western_europe])
Country 地区 排名 幸福指数
0 Denmark Western Europe 1 7.526
1 Switzerland Western Europe 2 7.509
2 Iceland Western Europe 3 7.501
3 Norway Western Europe 4 7.498
4 Finland Western Europe 5 7.413
6 Netherlands Western Europe 7 7.339
9 Sweden Western Europe 10 7.291
11 Austria Western Europe 12 7.119
15 Germany Western Europe 16 6.994
17 Belgium Western Europe 18 6.929
18 Ireland Western Europe 19 6.907
19 Luxembourg Western Europe 20 6.871
22 United Kingdom Western Europe 23 6.725
29 Malta Western Europe 30 6.488
31 France Western Europe 32 6.478
36 Spain Western Europe 37 6.361
49 Italy Western Europe 50 5.977
61 North Cyprus Western Europe 62 5.771
68 Cyprus Western Europe 69 5.546
93 Portugal Western Europe 94 5.123
98 Greece Western Europe 99 5.033
# Western Europe countries that are ranked outside the top 10.
# Each condition is a boolean Series; `&` combines them element-wise.
in_western_europe = reprot_2016_df['地区'] == 'Western Europe'
outside_top_10 = reprot_2016_df['排名'] > 10
only_western_europe_10 = in_western_europe & outside_top_10
print(only_western_europe_10)
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 True
12 False
13 False
14 False
15 True
16 False
17 True
18 True
19 True
20 False
21 False
22 True
23 False
24 False
25 False
26 False
27 False
28 False
29 True
...
127 False
128 False
129 False
130 False
131 False
132 False
133 False
134 False
135 False
136 False
137 False
138 False
139 False
140 False
141 False
142 False
143 False
144 False
145 False
146 False
147 False
148 False
149 False
150 False
151 False
152 False
153 False
154 False
155 False
156 False
Length: 157, dtype: bool
# Apply the combined boolean mask to obtain the final rows.
matching_rows = reprot_2016_df.loc[only_western_europe_10]
print(matching_rows)
Country 地区 排名 幸福指数
11 Austria Western Europe 12 7.119
15 Germany Western Europe 16 6.994
17 Belgium Western Europe 18 6.929
18 Ireland Western Europe 19 6.907
19 Luxembourg Western Europe 20 6.871
22 United Kingdom Western Europe 23 6.725
29 Malta Western Europe 30 6.488
31 France Western Europe 32 6.478
36 Spain Western Europe 37 6.361
49 Italy Western Europe 50 5.977
61 North Cyprus Western Europe 62 5.771
68 Cyprus Western Europe 69 5.546
93 Portugal Western Europe 94 5.123
98 Greece Western Europe 99 5.033
# Once familiar with masks, the whole filter can be written as one expression.
print(
    reprot_2016_df[
        (reprot_2016_df['排名'] > 10) & (reprot_2016_df['地区'] == 'Western Europe')
    ]
)
Country 地区 排名 幸福指数
11 Austria Western Europe 12 7.119
15 Germany Western Europe 16 6.994
17 Belgium Western Europe 18 6.929
18 Ireland Western Europe 19 6.907
19 Luxembourg Western Europe 20 6.871
22 United Kingdom Western Europe 23 6.725
29 Malta Western Europe 30 6.488
31 France Western Europe 32 6.478
36 Spain Western Europe 37 6.361
49 Italy Western Europe 50 5.977
61 North Cyprus Western Europe 62 5.771
68 Cyprus Western Europe 69 5.546
93 Portugal Western Europe 94 5.123
98 Greece Western Europe 99 5.033
七.层级索引
# Starting point for the hierarchical-index examples: the first rows of the 2015 data.
first_rows_2015 = reprot_2015_df.head()
print(first_rows_2015)
Country Region Happiness Rank Happiness Score \
0 Switzerland Western Europe 1 7.587
1 Iceland Western Europe 2 7.561
2 Denmark Western Europe 3 7.527
3 Norway Western Europe 4 7.522
4 Canada North America 5 7.427
Standard Error Economy (GDP per Capita) Family \
0 0.03411 1.39651 1.34951
1 0.04884 1.30232 1.40223
2 0.03328 1.32548 1.36058
3 0.03880 1.45900 1.33095
4 0.03553 1.32629 1.32261
Health (Life Expectancy) Freedom Trust (Government Corruption) \
0 0.94143 0.66557 0.41978
1 0.94784 0.62877 0.14145
2 0.87464 0.64938 0.48357
3 0.88521 0.66973 0.36503
4 0.90563 0.63297 0.32957
Generosity Dystopia Residual
0 0.29678 2.51738
1 0.43630 2.70201
2 0.34139 2.49204
3 0.34699 2.46531
4 0.45811 2.45176
# Build a hierarchical (two-level) index: Region is the outer level,
# Country the inner one. set_index() returns a new frame.
index_levels = ['Region', 'Country']
report_2015_df2 = reprot_2015_df.set_index(index_levels)
print(report_2015_df2.head(20))

# Select by the outer (level 0) label only.
western_europe_rows = report_2015_df2.loc['Western Europe']
print(western_europe_rows)

# Select by both levels.
switzerland_row = report_2015_df2.loc['Western Europe', 'Switzerland']
print(switzerland_row)

# Swap the order of the two index levels.
swapped_levels = report_2015_df2.swaplevel()
print(swapped_levels)