pandas 是一个用于数据处理与分析的 Python 库
1,pandas数据读取
import pandas as pd
# Load the CSV file; pd.read_csv returns a pandas DataFrame.
food_info = pd.read_csv("food_info.csv")
print(type(food_info))  # type of the loaded object (a DataFrame)
# A DataFrame has no `.dtype` attribute; `.dtypes` lists the dtype of every column.
print(food_info.dtypes)
# help() writes the documentation itself and returns None, so it is not wrapped in print().
help(pd.read_csv)
print(food_info.head(3))  # first three rows (head() defaults to five)
print(food_info.tail(3))  # last three rows (tail() defaults to five)
print(food_info.columns)  # column labels (the header row)
print(food_info.shape)  # (number of rows, number of columns)
2,pandas索引与计算
# Select specific parts of the table.
first_row = food_info.loc[0]  # the row whose index label is 0
print(first_row)
rows_3_to_6 = food_info.loc[3:6]  # label-based slice: both end labels are included
print(rows_3_to_6)
ndb_col = food_info["NDB_No"]  # a single column, selected by its header
print(ndb_col)
# Select columns whose header contains special characters, here the "(g)" unit suffix.
col_names = food_info.columns.tolist()  # column labels as a plain Python list
print(col_names)
# Keep every column name that ends with "(g)".
gram_columns = [c for c in col_names if c.endswith("(g)")]
gram_df = food_info[gram_columns]  # sub-table made of only the "(g)" columns
print(gram_df.head(3))
# Arithmetic on one column: the operation is applied to every element.
iron_mg = food_info["Iron_(mg)"]
print(iron_mg)
div_1000 = iron_mg.div(1000)  # milligrams -> grams
print(div_1000)

# Arithmetic between two columns works element by element, row against row.
water_energy = food_info["Water_(g)"].mul(food_info["Energ_Kcal"])
iron_gram = food_info["Iron_(mg)"].div(1000)

# Adding a column: the shape printed afterwards has one more column.
print(food_info.shape)
food_info["Iron_(g)"] = iron_gram
print(food_info.shape)

# Largest value of one column.
max_calies = food_info["Energ_Kcal"].max()
print(max_calies)

# Sorting: sort_values orders rows by the given column, ascending by default.
# `ascending=False` reverses the order; `inplace=True` modifies food_info itself
# instead of returning a sorted copy.
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)
3,Pandas数据预处理实例
#泰坦尼克号乘员情况表
import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.head())  # first five rows of the CSV data

age = titanic_survival["Age"]
age_is_null = pd.isnull(age)  # boolean mask: True where the age is missing
age_null_true = age[age_is_null]  # only the entries with a missing age
age_true_count = len(age_null_true)  # number of passengers whose age is unknown

# Naive mean: the built-in sum() propagates NaN, so the result is NaN
# whenever at least one age is missing.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)

# Keep only the known ages (invert the mask with ~), then average them by hand.
good_ages = titanic_survival["Age"][~age_is_null]
mean_goodage = sum(good_ages) / len(good_ages)
print(mean_goodage)

# Same result through pandas: Series.mean() skips missing values by default.
mean_goodage1 = titanic_survival["Age"].mean()
print(mean_goodage1)
# Average fare for each passenger class.
passenger_class = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_class:
    # Rows belonging to the current class; the column label must be the string "Pclass".
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_pclass = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_pclass
print(fares_by_class)
# Same per-class average fare, computed with pivot_table:
# `index` is the column to group by, `values` the column(s) to aggregate,
# `aggfunc` the aggregation (given by name, as recent pandas versions prefer).
# The fare column is named "Fare", as used by the rest of the file.
# NOTE(review): the variable name is spelled `fares_by_claass`; it is left unchanged
# because other code may reference it.
fares_by_claass = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
# Total fare and total number of survivors for each port of embarkation.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
# Drop the rows in which "Age" or "Sex" is missing.
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
# Look up one exact cell: row label 83, column "Age".
row_index_83_age = titanic_survival.loc[83, "Age"]
print(row_index_83_age)
# Sort the rows by age (descending), then renumber the index.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_titanic_survival)
# drop=True discards the old index instead of keeping it as a new column.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed)
4,Pandas自定义函数
def hundred_row(column):
    """Return the item of *column* stored under index label 99.

    With a default 0-based integer index this is the hundredth entry.
    Intended to be passed to ``DataFrame.apply`` so it runs once per column.

    Raises:
        KeyError: if *column* has no index label 99.
    """
    return column.loc[99]
# Run the custom function through pandas: DataFrame.apply calls it once per column
# (default axis) and collects the results.
# NOTE(review): the result is bound to the name `hundred_row`, so from here on that
# name no longer refers to the function defined above.
hundred_row = titanic_survival.apply(hundred_row)
def null_count(column):
    """Return the number of missing (null/NaN) entries in *column*.

    Intended to be passed to ``DataFrame.apply`` to count the missing values
    of every column.
    """
    # pd.isnull gives a boolean mask; summing it counts the True entries.
    return int(pd.isnull(column).sum())
# Count the missing values of every column: DataFrame.apply calls the function once
# per column (default axis).
# NOTE(review): the result is bound to the name `null_count`, so from here on that
# name no longer refers to the function defined above.
null_count = titanic_survival.apply(null_count)
5,Series结构
DataFrame 由一系列 Series 构成:DataFrame 的每一列(或每一行)都是一个 Series,而 Series 内部的数据是 NumPy 的 ndarray
Series 的大部分操作与 DataFrame 相同
import pandas as pd
from pandas import Series

# Load the Fandango score comparison table.
fandango = pd.read_csv("fandango_score_comparison.csv")

# Selecting a single column of a DataFrame; its type is printed below.
series_film = fandango["FILM"]
print(type(series_film))
series_rt = fandango["RottenTomatoes"]

# Build a custom Series: the scores are the data, the film names are the index.
film_name, rt_scores = series_film.values, series_rt.values
series_custom = Series(data=rt_scores, index=film_name)

# Label-based lookup of two films; the result is not stored or printed
# (it is only displayed in an interactive session).
series_custom[["Minions (2015)", "Leviathan (2014)"]]

# Integer slice over a string-labelled index.
# NOTE(review): the name suggests items 5 to 10, but the slice starts at 0 - confirm.
fiveten = series_custom[:10]
print(fiveten)