Pandas 数据处理基于 NumPy,DataFrame 由多个 Series 组成,每个 Series 代表一行或者一列。
1.Pandas数据读取
import pandas

# Load the food nutrition table from disk.
food_info = pandas.read_csv('food_info.csv')
print(type(food_info))   # container type returned by read_csv (a DataFrame)
print(food_info.dtypes)  # dtype of every column

# help() writes the documentation to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None".
help(pandas.read_csv)
# Preview the table: the first three rows, then the last four.
food_info.head(n=3)
food_info.tail(n=4)
# Column labels of the table.
print(food_info.columns)
# (rows, columns) of the table; the original note reports 8618 rows and
# 36 columns for this data set.
print(food_info.shape)
# .loc selects rows by index label.
print(food_info.loc[0])  # row labelled 0 (the first row under the default index)

# A label slice with .loc includes both endpoints: labels 3, 4, 5 and 6.
food_info.loc[3:6]
# Indexing with a single column name selects that one column.
ndb_col = food_info['NDB_No']
print(ndb_col)
# Indexing with a list of names selects several columns at once.
columns = ['Zinc_(mg)', 'Copper_(mg)']
zinc = food_info[columns]
print(zinc)
# All column labels (not the values of any column) as a plain Python list.
col_name = food_info.columns.tolist()
print(col_name)

# Keep only the columns whose label ends with "(g)", i.e. those measured
# in grams according to their name.
gram_columns = [label for label in col_name if label.endswith('(g)')]
gram_df = food_info[gram_columns]
print(gram_df.head(3))
# Arithmetic between a column and a scalar is applied to every element.
print(food_info['Iron_(mg)'])

div_1000 = food_info['Iron_(mg)'] / 1000
print(div_1000)
# Derive an 'Iron_(g)' column from 'Iron_(mg)' and attach it to the table.
iron_grams = food_info['Iron_(mg)'] / 1000

print(food_info.shape)  # shape before adding the column
food_info['Iron_(g)'] = iron_grams
print(food_info.shape)  # shape after adding the column
# Largest value in the 'Iron_(mg)' column.
donkey = food_info['Iron_(mg)'].max()
print(donkey)
# Sort the table in place by sodium content, ascending (the default order).
food_info.sort_values(by='Sodium_(mg)', inplace=True)
print(food_info['Sodium_(mg)'])

# Same sort, descending this time.
food_info.sort_values(by='Sodium_(mg)', ascending=False, inplace=True)
print(food_info['Sodium_(mg)'])

# --- 2. Index computation and data preprocessing ---
import numpy as np
import pandas as pd

# Read the Titanic training data and preview its first rows.
titanic_survival = pd.read_csv('titanic_train.csv')
titanic_survival.head()
age = titanic_survival['Age']
# Labels 0 through 10; a .loc slice includes both endpoints.
print(age.loc[0:10])

# Boolean mask that is True wherever the age is missing.
age_is_null = pd.isnull(age)
print(age_is_null)

# Only the entries whose age is missing.
age_true_null = age[age_is_null]
print(age_true_null)

# Number of missing ages.
age_true_num = len(age_true_null)
print(age_true_num)
# Mean age computed by hand: keep only the non-missing ages (inverse of the
# null mask), then divide their sum by their count.
mean_age = titanic_survival['Age'][~age_is_null]
correct_mean_age = sum(mean_age) / len(mean_age)
print(correct_mean_age)
# Mean age via the built-in Series.mean(), which skips missing values by
# default, so no explicit null mask is needed.
correct_mean_age = titanic_survival['Age'].mean()
print(correct_mean_age)
# Average fare for passenger classes 1, 2 and 3, computed one class at a time.
passager = [1, 2, 3]
gares = {}
for this_class in passager:
    # Rows whose 'Pclass' equals the class being processed.
    in_class = titanic_survival['Pclass'] == this_class
    gares[this_class] = titanic_survival[in_class]['Fare'].mean()
print(gares)
# Average fare per passenger class in one call: group the rows by 'Pclass'
# and aggregate 'Fare' with the mean.
# The aggregation is named by the string 'mean' rather than np.mean: recent
# pandas releases emit a FutureWarning when a NumPy callable is passed and
# recommend the string form to keep the same behaviour.
donkey = titanic_survival.pivot_table(
    index='Pclass',
    values='Fare',
    aggfunc='mean',
)
print(donkey)
# Average fare and average age per passenger class: group the rows by
# 'Pclass' and aggregate both columns with the mean.
# The aggregation is named by the string 'mean' rather than np.mean: recent
# pandas releases emit a FutureWarning when a NumPy callable is passed and
# recommend the string form to keep the same behaviour.
donkey = titanic_survival.pivot_table(
    index='Pclass',
    values=['Fare', 'Age'],
    aggfunc='mean',
)
print(donkey)
# Drop every column that contains a missing value. Rarely what you want,
# because whole variables are lost.
drop1 = titanic_survival.dropna(axis='columns')

# Drop only the rows in which 'Age' or 'Sex' is missing.
drop2 = titanic_survival.dropna(axis='index', subset=['Age', 'Sex'])
print(drop2)
# Look up a single cell: the 'Age' value of the row labelled 83.
dingwei = titanic_survival.loc[83, 'Age']
print(dingwei)
# Sorting: order the passengers by age, oldest first.
donkey = titanic_survival.sort_values(by='Age', ascending=False)
print(donkey[0:10])

# The sorted frame keeps its original row labels; rebuild the index so the
# labels start again from 0 (drop=True discards the old labels).
donkey1 = donkey.reset_index(drop=True)
print(donkey1.loc[0:10])
3.自定义函数
def hundre(colume):
    """Return the entry labelled 99 from ``colume``.

    Intended for ``DataFrame.apply``, which calls it once per column. With
    the default integer index, label 99 is the 100th row.

    Args:
        colume: A pandas Series (one column of the frame).

    Returns:
        The value stored under index label 99.

    Raises:
        KeyError: If the Series has no label 99.
    """
    return colume.loc[99]


# The result gets its own name: rebinding ``hundre`` to it would overwrite
# the function above and make this cell fail when run a second time.
hundredth_row = titanic_survival.apply(hundre)
print(hundredth_row)
def donkey(colume):
    """Count the missing values in ``colume``.

    Intended for ``DataFrame.apply``, which calls it once per column.

    Args:
        colume: A pandas Series (one column of the frame).

    Returns:
        The number of entries for which ``pd.isnull`` is True.
    """
    missing_mask = pd.isnull(colume)
    return len(colume[missing_mask])


# Applied column by column: one missing-value count per column.
count = titanic_survival.apply(donkey)
print(count)
def which_class(row):
    """Translate a passenger row's ``Pclass`` value into a readable label.

    Intended for ``DataFrame.apply(..., axis=1)``, which calls it once per
    row.

    Args:
        row: A row of the frame, indexable by column name; must contain
            'Pclass'.

    Returns:
        'Unknown' when ``Pclass`` is missing, 'First' for 1, 'Second' for 2,
        and 'Third' for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return 'Unknown'
    if pclass == 1:
        return 'First'
    if pclass == 2:
        return 'Second'
    # Every remaining value falls through to the third class.
    return 'Third'


# axis=1 passes each row (rather than each column) to the function.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)
import pandas as pd

# Load the Fandango score comparison table.
donkey = pd.read_csv('fandango_score_comparison.csv')

# A single column of the frame.
film = donkey['FILM']
print(type(film))
print(film.loc[0:5])  # label slice: both endpoints are included

donkey1 = donkey['RottenTomatoes']
print(donkey1[0:5])   # plain slice of the first entries
from pandas import Series

# .values exposes the data underlying the 'FILM' column.
donkey = film.values
print(type(donkey))  # the original note expects a NumPy type here
print(donkey)

# Same for the 'RottenTomatoes' column.
donkey2 = donkey1.values
print(donkey2)