pandas基本数据对象及操作(DataFrame篇)

一.创建DataFrame

import pandas as pd
# Describe every country once, keyed by the row label it gets in the frame.
profiles = {
    'CH': {'Name': '中国', 'Language': 'Chinese',
           'Area': '9.597M km2', 'Happiness Rank': 79},
    'US': {'Name': '美国', 'Language': 'English (US)',
           'Area': '9.834M km2', 'Happiness Rank': 14},
    'AU': {'Name': '澳大利亚', 'Language': 'English (AU)',
           'Area': '7.692M km2', 'Happiness Rank': 9},
}

# Each country becomes a labelled Series (one future row of the DataFrame).
country1, country2, country3 = (pd.Series(fields) for fields in profiles.values())

# Stack the Series as rows; the profile keys become the row index.
df = pd.DataFrame([country1, country2, country3], index=list(profiles))
print(df)

          Area  Happiness Rank      Language  Name
CH  9.597M km2              79       Chinese    中国
US  9.834M km2              14  English (US)    美国
AU  7.692M km2               9  English (AU)  澳大利亚



# Add columns.
# A scalar value is broadcast to every row.
# A list-like value must contain exactly one element per row; any other
# length (shorter or longer) raises a ValueError.
df['Location'] = '地球'
print(df)
df['Region'] = ['亚洲', '北美洲', '大洋洲']
print(df)

    Name      Language        Area  Happiness Rank Location
CH    中国       Chinese  9.597M km2              79       地球
US    美国  English (US)  9.834M km2              14       地球
AU  澳大利亚  English (AU)  7.692M km2               9       地球

    Name      Language        Area  Happiness Rank Location Region
CH    中国       Chinese  9.597M km2              79       地球     亚洲
US    美国  English (US)  9.834M km2              14       地球    北美洲
AU  澳大利亚  English (AU)  7.692M km2               9       地球    大洋洲

二.DataFrame索引

# Row indexing.
# .loc selects a row by its index label, .iloc by its integer position;
# either way a single row comes back as a Series.
print('loc:')
print(df.loc['CH'])
print(type(df.loc['CH']))
print('iloc:')
print(df.iloc[1])
# The closing parenthesis of print() was missing here (SyntaxError).
print(type(df.iloc[1]))

loc:
Area              9.597M km2
Happiness Rank            79
Language             Chinese
Name                      中国
Location                  地球
Region                    亚洲
Name: CH, dtype: object

<class 'pandas.core.series.Series'>

iloc:
Area                9.834M km2
Happiness Rank              14
Language          English (US)
Name                        美国
Location                    地球
Region                     北美洲
Name: US, dtype: object

<class 'pandas.core.series.Series'>



# Column indexing: selecting one column by name returns a Series.
print(df['Area'])
print(type(df['Area']))

CH    9.597M km2
US    9.834M km2
AU    7.692M km2
Name: Area, dtype: object
<class 'pandas.core.series.Series'>



# Select several (not necessarily adjacent) columns by passing a list of names.
print(df[['Name', 'Area']])

    Name        Area
CH    中国  9.597M km2
US    美国  9.834M km2
AU  澳大利亚  7.692M km2



# Mixed (chained) indexing.
# Note the difference in how each form is written.
print('先取出列,再取行:')
# Column first, then the row: by label with [], by label with .loc,
# and by integer position with .iloc.
print(df['Area']['CH'])
print(df['Area'].loc['CH'])
print(df['Area'].iloc[0])

print('先取出行,再取列:')
# Row first (by label or by position), then the column by name.
print(df.loc['CH']['Area'])
print(df.iloc[0]['Area'])

先取出列,再取行:
9.597M km2
9.597M km2
9.597M km2
先取出行,再取列:
9.597M km2
9.597M km2



# Transpose: swap rows and columns.
print(df.T)

                        CH            US            AU
Area            9.597M km2    9.834M km2    7.692M km2
Happiness Rank          79            14             9
Language           Chinese  English (US)  English (AU)
Name                    中国            美国          澳大利亚
Location                地球            地球            地球
Region                  亚洲           北美洲           大洋洲

三.删除操作

print(df.drop(['CH']))
# Note: drop returns a modified copy and leaves the original data untouched.
print(df)

          Area  Happiness Rank      Language  Name Location Region
US  9.834M km2              14  English (US)    美国       地球    北美洲
AU  7.692M km2               9  English (AU)  澳大利亚       地球    大洋洲
          Area  Happiness Rank      Language  Name Location Region
CH  9.597M km2              79       Chinese    中国       地球     亚洲
US  9.834M km2              14  English (US)    美国       地球    北美洲
AU  7.692M km2               9  English (AU)  澳大利亚       地球    大洋洲



print(df.drop(['CH'], inplace=True))
# With inplace=True the original data is modified and no copy is returned
# (the call returns None, which is what the print above shows).
print(df)

None
          Area  Happiness Rank      Language  Name Location Region
US  9.834M km2              14  English (US)    美国       地球    北美洲
AU  7.692M km2               9  English (AU)  澳大利亚       地球    大洋洲



# To drop a column instead of a row, pass axis=1.
# Without inplace=True the original frame is left unchanged.
print(df.drop(['Area'], axis=1))
print(df)

    Happiness Rank      Language  Name Location Region
US              14  English (US)    美国       地球    北美洲
AU               9  English (AU)  澳大利亚       地球    大洋洲
          Area  Happiness Rank      Language  Name Location Region
US  9.834M km2              14  English (US)    美国       地球    北美洲
AU  7.692M km2               9  English (AU)  澳大利亚       地球    大洋洲



# The del keyword can also be used directly; it removes the column from df itself.
del df['Name']
print(df)

          Area  Happiness Rank      Language Location Region
US  9.834M km2              14  English (US)       地球    北美洲
AU  7.692M km2               9  English (AU)       地球    大洋洲

四.DataFrame的操作与加载

# Select the 'Happiness Rank' column (a Series) and print it.
print(df['Happiness Rank'])

US    14
AU     9
Name: Happiness Rank, dtype: int64
 


# Note: modifying data pulled out of a DataFrame in place can also change
# the original frame.
# NOTE(review): this depends on the pandas version/configuration -- with
# Copy-on-Write enabled the selected column behaves like a copy and df is
# left unchanged; confirm against the pandas version in use.
ranks = df['Happiness Rank']
ranks += 2
print(ranks)
print(df)

US    16
AU    11
Name: Happiness Rank, dtype: int64
          Area  Happiness Rank      Language Location Region
US  9.834M km2              16  English (US)       地球    北美洲
AU  7.692M km2              11  English (AU)       地球    大洋洲



# Operating on data pulled out of a DataFrame may affect the original frame.
# The safe approach is to work on an explicit copy(): changes made to the
# copy never reach the original DataFrame.
ranks = df['Happiness Rank'].copy()
ranks += 2
print(ranks)
print(df)

US    18
AU    13
Name: Happiness Rank, dtype: int64
          Area  Happiness Rank      Language Location Region
US  9.834M km2              16  English (US)       地球    北美洲
AU  7.692M km2              11  English (AU)       地球    大洋洲



# Load data from a CSV file (path is relative to the working directory)
# and preview the first five rows.
reprot_2015_df = pd.read_csv('./2015.csv')
print('2015年数据预览:')
print(reprot_2015_df.head())

2015年数据预览:
       Country          Region  Happiness Rank  Happiness Score  \
0  Switzerland  Western Europe               1            7.587   
1      Iceland  Western Europe               2            7.561   
2      Denmark  Western Europe               3            7.527   
3       Norway  Western Europe               4            7.522   
4       Canada   North America               5            7.427   

   Standard Error  Economy (GDP per Capita)   Family  \
0         0.03411                   1.39651  1.34951   
1         0.04884                   1.30232  1.40223   
2         0.03328                   1.32548  1.36058   
3         0.03880                   1.45900  1.33095   
4         0.03553                   1.32629  1.32261   

   Health (Life Expectancy)  Freedom  Trust (Government Corruption)  \
0                   0.94143  0.66557                        0.41978   
1                   0.94784  0.62877                        0.14145   
2                   0.87464  0.64938                        0.48357   
3                   0.88521  0.66973                        0.36503   
4                   0.90563  0.63297                        0.32957   

   Generosity  Dystopia Residual  
0     0.29678            2.51738  
1     0.43630            2.70201  
2     0.34139            2.49204  
3     0.34699            2.46531  
4     0.45811            2.45176  



# info() prints its summary itself and returns None, which is why the outer
# print() adds a trailing "None" line to the output.
print(reprot_2015_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
Country                          158 non-null object
Region                           158 non-null object
Happiness Rank                   158 non-null int64
Happiness Score                  158 non-null float64
Standard Error                   158 non-null float64
Economy (GDP per Capita)         158 non-null float64
Family                           158 non-null float64
Health (Life Expectancy)         158 non-null float64
Freedom                          158 non-null float64
Trust (Government Corruption)    158 non-null float64
Generosity                       158 non-null float64
Dystopia Residual                158 non-null float64
dtypes: float64(9), int64(1), object(2)
memory usage: 14.9+ KB
None



# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
print(reprot_2015_df.describe())

       Happiness Rank  Happiness Score  Standard Error  \
count      158.000000       158.000000      158.000000   
mean        79.493671         5.375734        0.047885   
std         45.754363         1.145010        0.017146   
min          1.000000         2.839000        0.018480   
25%         40.250000         4.526000        0.037268   
50%         79.500000         5.232500        0.043940   
75%        118.750000         6.243750        0.052300   
max        158.000000         7.587000        0.136930   

       Economy (GDP per Capita)      Family  Health (Life Expectancy)  \
count                158.000000  158.000000                158.000000   
mean                   0.846137    0.991046                  0.630259   
std                    0.403121    0.272369                  0.247078   
min                    0.000000    0.000000                  0.000000   
25%                    0.545808    0.856823                  0.439185   
50%                    0.910245    1.029510                  0.696705   
75%                    1.158448    1.214405                  0.811013   
max                    1.690420    1.402230                  1.025250   

          Freedom  Trust (Government Corruption)  Generosity  \
count  158.000000                     158.000000  158.000000   
mean     0.428615                       0.143422    0.237296   
std      0.150693                       0.120034    0.126685   
min      0.000000                       0.000000    0.000000   
25%      0.328330                       0.061675    0.150553   
50%      0.435515                       0.107220    0.216130   
75%      0.549092                       0.180255    0.309883   
max      0.669730                       0.551910    0.795880   

       Dystopia Residual  
count         158.000000  
mean            2.098977  
std             0.553550  
min             0.328580  
25%             1.759410  
50%             2.095415  
75%             2.462415  
max             3.602140  



# Preview the last five rows.
print(reprot_2015_df.tail())

     Country                           Region  Happiness Rank  \
153   Rwanda               Sub-Saharan Africa             154   
154    Benin               Sub-Saharan Africa             155   
155    Syria  Middle East and Northern Africa             156   
156  Burundi               Sub-Saharan Africa             157   
157     Togo               Sub-Saharan Africa             158   

     Happiness Score  Standard Error  Economy (GDP per Capita)   Family  \
153            3.465         0.03464                   0.22208  0.77370   
154            3.340         0.03656                   0.28665  0.35386   
155            3.006         0.05015                   0.66320  0.47489   
156            2.905         0.08658                   0.01530  0.41587   
157            2.839         0.06727                   0.20868  0.13995   

     Health (Life Expectancy)  Freedom  Trust (Government Corruption)  \
153                   0.42864  0.59201                        0.55191   
154                   0.31910  0.48450                        0.08010   
155                   0.72193  0.15684                        0.18906   
156                   0.22396  0.11850                        0.10062   
157                   0.28443  0.36453                        0.10731   

     Generosity  Dystopia Residual  
153     0.22628            0.67042  
154     0.18260            1.63328  
155     0.47179            0.32858  
156     0.19727            1.83302  
157     0.16681            1.56726  

五.在read_csv操作下的索引

import pandas as pd
# index_col names the column to use as the row index.
# usecols restricts which columns are read from the file.
reprot_2016_df = pd.read_csv('F:/2016.csv', 
                             index_col='Country',
                             usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
print(reprot_2016_df.head())

                     Region  Happiness Rank  Happiness Score
Country                                                     
Denmark      Western Europe               1            7.526
Switzerland  Western Europe               2            7.509
Iceland      Western Europe               3            7.501
Norway       Western Europe               4            7.498
Finland      Western Europe               5            7.413



# Inspect the column labels and the row index of the frame.
print('列名(column):', reprot_2016_df.columns)
print('行名(index):', reprot_2016_df.index)

列名(column): Index(['Region', 'Happiness Rank', 'Happiness Score'], dtype='object')
行名(index): Index(['Denmark', 'Switzerland', 'Iceland', 'Norway', 'Finland', 'Canada',
       'Netherlands', 'New Zealand', 'Australia', 'Sweden',
       ...
       'Madagascar', 'Tanzania', 'Liberia', 'Guinea', 'Rwanda', 'Benin',
       'Afghanistan', 'Togo', 'Syria', 'Burundi'],
      dtype='object', name='Country', length=157)



# Note: an Index is immutable, so this item assignment deliberately raises
# a TypeError.
reprot_2016_df.index[0] = '丹麦'

TypeError                                 Traceback (most recent call last)
<ipython-input-8-c2f1f4f940d2> in <module>()
      1 # 注意index是不可变的
----> 2 reprot_2016_df.index[0] = '丹麦'

F:\python\ANACONDA\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
   2048 
   2049     def __setitem__(self, key, value):
-> 2050         raise TypeError("Index does not support mutable operations")
   2051 
   2052     def __getitem__(self, key):

TypeError: Index does not support mutable operations



# Reset the index: 'Country' goes back to being a regular column and the
# rows get a default integer index.
# Mind the difference inplace makes: with inplace=True the frame is modified
# directly and the call returns None, so print the frame itself rather than
# the return value of reset_index (which would just print "None").
reprot_2016_df.reset_index(inplace=True)
print(reprot_2016_df.head())

       Country          Region  Happiness Rank  Happiness Score
0      Denmark  Western Europe               1            7.526
1  Switzerland  Western Europe               2            7.509
2      Iceland  Western Europe               3            7.501
3       Norway  Western Europe               4            7.498
4      Finland  Western Europe               5            7.413



# Rename columns. Without inplace, rename returns a new frame, so the result
# is assigned back to the variable.
# The keys must match the existing column names exactly: rename silently
# ignores keys that match no column, so a misspelled name renames nothing.
reprot_2016_df = reprot_2016_df.rename(columns={'Region': '地区', 'Happiness Rank': '排名', 'Happiness Score': '幸福指数'})
print(reprot_2016_df.head())

       Country              地区  排名   幸福指数
0      Denmark  Western Europe   1  7.526
1  Switzerland  Western Europe   2  7.509
2      Iceland  Western Europe   3  7.501
3       Norway  Western Europe   4  7.498
4      Finland  Western Europe   5  7.413



# Rename columns; note the use of inplace: the frame is modified directly,
# so there is nothing to assign back.
reprot_2016_df.rename(columns={'Region': '地区', 'Happiness Rank': '排名', 'Happiness Score': '幸福指数'},inplace=True)
print(reprot_2016_df.head())

       Country              地区  排名   幸福指数
0      Denmark  Western Europe   1  7.526
1  Switzerland  Western Europe   2  7.509
2      Iceland  Western Europe   3  7.501
3       Norway  Western Europe   4  7.498
4      Finland  Western Europe   5  7.413

六.Boolean Mask

# Preview the first five rows.
print(reprot_2016_df.head())

       Country              地区  排名   幸福指数
0      Denmark  Western Europe   1  7.526
1  Switzerland  Western Europe   2  7.509
2      Iceland  Western Europe   3  7.501
3       Norway  Western Europe   4  7.498
4      Finland  Western Europe   5  7.413



# Keep only the countries in the Western Europe region.
# The comparison yields a boolean Series (the mask); indexing the frame with
# it keeps the rows where the mask is True.
# only_western_europe = reprot_2016_df['地区'] == 'Western Europe'
print(reprot_2016_df[reprot_2016_df['地区'] == 'Western Europe'])

           Country              地区  排名   幸福指数
0          Denmark  Western Europe   1  7.526
1      Switzerland  Western Europe   2  7.509
2          Iceland  Western Europe   3  7.501
3           Norway  Western Europe   4  7.498
4          Finland  Western Europe   5  7.413
6      Netherlands  Western Europe   7  7.339
9           Sweden  Western Europe  10  7.291
11         Austria  Western Europe  12  7.119
15         Germany  Western Europe  16  6.994
17         Belgium  Western Europe  18  6.929
18         Ireland  Western Europe  19  6.907
19      Luxembourg  Western Europe  20  6.871
22  United Kingdom  Western Europe  23  6.725
29           Malta  Western Europe  30  6.488
31          France  Western Europe  32  6.478
36           Spain  Western Europe  37  6.361
49           Italy  Western Europe  50  5.977
61    North Cyprus  Western Europe  62  5.771
68          Cyprus  Western Europe  69  5.546
93        Portugal  Western Europe  94  5.123
98          Greece  Western Europe  99  5.033



# Keep only Western Europe countries that are ranked outside the top 10.
# Each condition is its own boolean Series; & combines them element-wise.
in_western_europe = reprot_2016_df['地区'] == 'Western Europe'
outside_top_10 = reprot_2016_df['排名'] > 10
only_western_europe_10 = in_western_europe & outside_top_10
print(only_western_europe_10)

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11      True
12     False
13     False
14     False
15      True
16     False
17      True
18      True
19      True
20     False
21     False
22      True
23     False
24     False
25     False
26     False
27     False
28     False
29      True
       ...  
127    False
128    False
129    False
130    False
131    False
132    False
133    False
134    False
135    False
136    False
137    False
138    False
139    False
140    False
141    False
142    False
143    False
144    False
145    False
146    False
147    False
148    False
149    False
150    False
151    False
152    False
153    False
154    False
155    False
156    False
Length: 157, dtype: bool



# Index the frame with the combined boolean mask to get the final result.
print(reprot_2016_df[only_western_europe_10])

           Country              地区  排名   幸福指数
11         Austria  Western Europe  12  7.119
15         Germany  Western Europe  16  6.994
17         Belgium  Western Europe  18  6.929
18         Ireland  Western Europe  19  6.907
19      Luxembourg  Western Europe  20  6.871
22  United Kingdom  Western Europe  23  6.725
29           Malta  Western Europe  30  6.488
31          France  Western Europe  32  6.478
36           Spain  Western Europe  37  6.361
49           Italy  Western Europe  50  5.977
61    North Cyprus  Western Europe  62  5.771
68          Cyprus  Western Europe  69  5.546
93        Portugal  Western Europe  94  5.123
98          Greece  Western Europe  99  5.033



# Once this is familiar, the whole filter can be written on a single line.
# Each comparison needs its own parentheses because & binds more tightly
# than == and >.
print(reprot_2016_df[(reprot_2016_df['地区'] == 'Western Europe') & (reprot_2016_df['排名'] > 10)])

           Country              地区  排名   幸福指数
11         Austria  Western Europe  12  7.119
15         Germany  Western Europe  16  6.994
17         Belgium  Western Europe  18  6.929
18         Ireland  Western Europe  19  6.907
19      Luxembourg  Western Europe  20  6.871
22  United Kingdom  Western Europe  23  6.725
29           Malta  Western Europe  30  6.488
31          France  Western Europe  32  6.478
36           Spain  Western Europe  37  6.361
49           Italy  Western Europe  50  5.977
61    North Cyprus  Western Europe  62  5.771
68          Cyprus  Western Europe  69  5.546
93        Portugal  Western Europe  94  5.123
98          Greece  Western Europe  99  5.033

七.层级索引

# Preview the first five rows.
print(reprot_2015_df.head())

       Country          Region  Happiness Rank  Happiness Score  \
0  Switzerland  Western Europe               1            7.587   
1      Iceland  Western Europe               2            7.561   
2      Denmark  Western Europe               3            7.527   
3       Norway  Western Europe               4            7.522   
4       Canada   North America               5            7.427   

   Standard Error  Economy (GDP per Capita)   Family  \
0         0.03411                   1.39651  1.34951   
1         0.04884                   1.30232  1.40223   
2         0.03328                   1.32548  1.36058   
3         0.03880                   1.45900  1.33095   
4         0.03553                   1.32629  1.32261   

   Health (Life Expectancy)  Freedom  Trust (Government Corruption)  \
0                   0.94143  0.66557                        0.41978   
1                   0.94784  0.62877                        0.14145   
2                   0.87464  0.64938                        0.48357   
3                   0.88521  0.66973                        0.36503   
4                   0.90563  0.63297                        0.32957   

   Generosity  Dystopia Residual  
0     0.29678            2.51738  
1     0.43630            2.70201  
2     0.34139            2.49204  
3     0.34699            2.46531  
4     0.45811            2.45176  

# Build a hierarchical (multi-level) index from two columns:
# 'Region' is level 0 and 'Country' is level 1.
report_2015_df2 = reprot_2015_df.set_index(['Region', 'Country'])
print(report_2015_df2.head(20))

# Index by level 0 only.
print(report_2015_df2.loc['Western Europe'])

# Index by both levels.
print(report_2015_df2.loc['Western Europe', 'Switzerland'])

# Swap the order of the index levels.
print(report_2015_df2.swaplevel())

猜你喜欢

转载自blog.csdn.net/a1786742005/article/details/82824875