pandas基本数据对象及操作

1、Series

创建Series
import pandas as pd

# A Series built from a plain list gets a default integer index.
countries = ['中国', '美国', '澳大利亚']
countries_s = pd.Series(data=countries)
print(type(countries_s), countries_s, sep='\n')

# The same constructor accepts a list of numbers.
numbers = [4, 5, 6]
print(pd.Series(data=numbers))

# A Series built from a dict uses the dict keys as its index labels.
country_dicts = {
    'CH': '中国',
    'US': '美国',
    'AU': '澳大利亚',
}
# Name the data when constructing the Series, then name the index.
country_dict_s = pd.Series(data=country_dicts, name='Country')
country_dict_s.index.name = 'Code'
# Show the whole Series, then its raw values and its index separately.
print(country_dict_s, country_dict_s.values, country_dict_s.index, sep='\n')

处理缺失数据
# A list containing None still converts to a Series; the None entry is
# kept as a missing value.
countries = ['中国', '美国', '澳大利亚', None]
print(pd.Series(data=countries))
# In a numeric list, None is converted to NaN.
numbers = [4, 5, 6, None]
print(pd.Series(data=numbers))
country_dicts = {
    'CH': '中国',
    'US': '美国',
    'AU': '澳大利亚',
}
country_dict_s = pd.Series(data=country_dicts)
print(country_dict_s)

# Use the index to check whether an entry exists: `in` tests the labels.
# A Series can also be viewed as a fixed-length, ordered dict.
print('CH' in country_dict_s)
print('NZ' in country_dict_s)
# Single element: by integer position, by label, and with plain brackets.
print('iloc:', country_dict_s.iloc[1])
print('loc:', country_dict_s.loc['US'])
print('[]:', country_dict_s['US'])
# Several elements at once: pass a list of positions or a list of labels.
wanted_positions = [0, 2]
wanted_labels = ['US', 'AU']
print('iloc:\n', country_dict_s.iloc[wanted_positions])
print()
print('loc:\n', country_dict_s.loc[wanted_labels])

向量化操作
import numpy as np

# 10000 random integers drawn from [0, 1000), wrapped in a Series.
random_values = np.random.randint(low=0, high=1000, size=10000)
s = pd.Series(data=random_values)
# Preview the first rows, then confirm the total length.
print(s.head())
print(len(s))

2、DataFrame

创建DataFrame
import pandas as pd

# Every country record carries the same four fields, used as Series labels.
record_fields = ['Name', 'Language', 'Area', 'Happiness Rank']
country1 = pd.Series(['中国', 'Chinese', '9.597M km2', 79], index=record_fields)
country2 = pd.Series(['美国', 'English (US)', '9.834M km2', 14], index=record_fields)
country3 = pd.Series(['澳大利亚', 'English (AU)', '7.692M km2', 9], index=record_fields)

# Each Series becomes one row; the Series labels become the columns and
# the country codes become the row index.
country_rows = [country1, country2, country3]
country_codes = ['CH', 'US', 'AU']
df = pd.DataFrame(country_rows, index=country_codes)
# Note: in Jupyter, print(df) and a bare `df` expression render differently.
print(df)

# Adding columns.
# A scalar is broadcast to every row.
df['Location'] = '地球'
print(df)
# A list must supply exactly one value per row; a list of any other length
# raises an error.
df['Region'] = ['亚洲', '北美洲', '大洋洲']
print(df)
DataFrame索引
# Row access: .loc selects by label, .iloc by integer position.
# A single selected row comes back as a Series.
ch_row = df.loc['CH']
print('loc:')
print(ch_row)
print(type(ch_row))
print('iloc:')
print(df.iloc[1])
area_column = df['Area']
print(area_column)
# Column access: plain brackets with a column name return a Series.
print(area_column)
print(type(area_column))
# A list of names selects several (not necessarily adjacent) columns.
name_and_area = ['Name', 'Area']
print(df[name_and_area])
# Mixed access: the two orders below are written differently but reach
# the same cell.
print('先取出列，再取行：')
print(df['Area']['CH'])
print(df['Area'].loc['CH'])
print(df['Area'].iloc[0])
print('先取出行，再取列：')
print(df.loc['CH']['Area'])
print(df.iloc[0]['Area'])
# Transpose: swap rows and columns.
transposed = df.T
print(transposed)
删除数据
# By default drop() returns a modified copy and leaves df untouched.
without_ch = df.drop(['CH'])
print(without_ch)
print(df)
# With inplace=True the row is removed from df itself and drop() returns
# None instead of a copy, so the first print below shows "None".
inplace_result = df.drop(['CH'], inplace=True)
print(inplace_result)
print(df)
# To drop a column, axis=1 must be given (again a copy; df is unchanged).
without_area = df.drop(['Area'], axis=1)
print(without_area)
print(df)
# The del statement also removes a column, directly on df.
del df['Name']
print(df)
DataFrame的操作与加载
print(df['Happiness Rank'])
# In-place operations on a column taken out of a DataFrame can write
# through to the original data: without Copy-on-Write the selected column
# shares memory with df, so df changes too. With Copy-on-Write enabled
# (the default from pandas 3.0) df is left unchanged.
ranks = df['Happiness Rank']
ranks += 2
print(ranks)
print(df)
# The safe approach is to work on an explicit copy(): df is never affected,
# whatever the pandas version.
ranks = df['Happiness Rank'].copy()
ranks += 2
print(ranks)
print(df)
# Load data from a CSV file.
reprot_2015_df = pd.read_csv('./2015.csv')
print('2015年数据预览：')
# As the last expression of a notebook cell this renders the preview table;
# in a plain script, wrap it in print() to see it.
reprot_2015_df.head()
# info() writes its summary to stdout itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
reprot_2015_df.info()

3、索引

[数据集2016.csv下载地址]
https://pan.baidu.com/s/1_D8rTk1Kl5io1qnBXMXhcA
密码：u2vt
# index_col selects the column used as the row index;
# usecols restricts which columns are read from the file.
reprot_2016_df = pd.read_csv('./2016.csv',
                             index_col='Country',
                             usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
# Data preview (rendered only as the last expression of a notebook cell).
reprot_2016_df.head()
print('列名(column)：', reprot_2016_df.columns)
print('行名(index)：', reprot_2016_df.index)
# An Index is immutable: item assignment raises TypeError. Catch it so the
# demonstration reports the error instead of aborting everything below.
try:
    reprot_2016_df.index[0] = '丹麦'
except TypeError as err:
    print('index不可变：', err)
# Reset the index: 'Country' becomes an ordinary column again.
# With inplace=True the DataFrame is modified directly; without it,
# reset_index() returns a new DataFrame and leaves this one unchanged.
reprot_2016_df.reset_index(inplace=True)
print(reprot_2016_df.head())
# Rename columns. rename() silently ignores keys that match no column, so
# the keys must be spelled exactly like the column names.
renamed_columns = {'Region': '地区', 'Happiness Rank': '排名', 'Happiness Score': '幸福指数'}
# Without inplace, rename() returns a new DataFrame that must be reassigned.
reprot_2016_df = reprot_2016_df.rename(columns=renamed_columns)
reprot_2016_df.head()
# The inplace=True form modifies the DataFrame directly. The columns were
# already renamed above, so no key matches any more and this is a no-op.
reprot_2016_df.rename(columns=renamed_columns,
                      inplace=True)
reprot_2016_df.head()

4、Boolean Mask

print(reprot_2016_df.head())
# Keep only the countries in the Western Europe region: comparing a column
# with a value gives a boolean Series that can be used as a row filter.
only_western_europe = reprot_2016_df['地区'] == 'Western Europe'
reprot_2016_df[only_western_europe]
# Western Europe countries ranked outside the top 10:
# combine the two element-wise conditions with &.
ranked_outside_10 = reprot_2016_df['排名'] > 10
only_western_europe_10 = only_western_europe & ranked_outside_10
only_western_europe_10
# Apply the combined boolean mask to get the final result.
reprot_2016_df[only_western_europe_10]
# With practice the whole filter fits on one line; each condition needs
# its own parentheses.
reprot_2016_df[(reprot_2016_df['地区'] == 'Western Europe') & (reprot_2016_df['排名'] > 10)]

5、层级索引

[数据集2015.csv下载地址]
https://pan.baidu.com/s/1-tBedyPvbuKQFJP5BdR1yA
密码：j22j
print(reprot_2015_df.head())
# Build a two-level (hierarchical) index: Region is level 0, Country level 1.
index_columns = ['Region', 'Country']
report_2015_df2 = reprot_2015_df.set_index(keys=index_columns)
report_2015_df2.head(n=20)
# Select by a level-0 label only.
report_2015_df2.loc['Western Europe']
# Select with both levels at once.
report_2015_df2.loc[('Western Europe', 'Switzerland')]
# Swap the order of the index levels.
report_2015_df2.swaplevel()
# Sort the rows by the level-0 index.
report_2015_df2.sort_index(level=0)

pandas基本数据对象及操作

猜你喜欢