3. pandas学习笔记DataFrame高级

版权声明:转载注明出处 https://blog.csdn.net/deephacking/article/details/82710534
import pandas as pd
import numpy as np
import math
# Print every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Load the Titanic training set; https://www.kaggle.com/shivamp629/traincsv/version/1
data = pd.read_csv("train.csv")
print(data.shape)
print(data.columns)
# print(data["PassengerId"])
age = data["Age"]
# print(age.loc[0:10])
# Boolean mask that is True wherever the age is missing.
age_is_null = age.isnull()
print(age_is_null)
# Select only the entries whose age is missing and count them.
null_age = age[age_is_null]
print(null_age)
print(len(null_age))
# Problematic mean: the built-in sum() does not skip NaN, so the result is
# NaN as soon as a single age is missing.
mean_age = sum(age) / len(age)
print(mean_age)
# Keep only the entries whose age is present.
good_age = age[~age_is_null]
print(good_age)
# Mean computed by hand over the present ages only.
mean_age = sum(good_age) / len(good_age)
print(mean_age)
# Ready-made alternative: Series.mean() ignores missing values by default.
mean_age = age.mean()
print(mean_age)

# Fill the missing ages with the mean age.
# fillna() returns a new DataFrame rather than modifying in place, so the
# result has to be captured. `data` itself is deliberately left untouched so
# the examples further down still see the original missing values.
filled_data = data.fillna({"Age": data["Age"].mean()})
print(filled_data)

# Mean fare for each passenger class, computed by hand: filter the rows of
# one class at a time and average their "Fare" values.
passenger_classes = [1, 2, 3]
fares_by_class = {
    this_class: data.loc[data["Pclass"] == this_class, "Fare"].mean()
    for this_class in passenger_classes
}
print(fares_by_class)
# Same mean fare per class, the quick way, via pivot_table.
# Aggregations are given by name ("mean"/"sum"): passing the NumPy callables
# (np.mean/np.sum) is deprecated in recent pandas and emits a FutureWarning.
fares_by_class = data.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
print(fares_by_class)
# Number of survivors per passenger class (sum of the "Survived" column).
alive_means = data.pivot_table(index="Pclass", values="Survived", aggfunc="sum")
print(alive_means)
# Survival rate per passenger class (mean of the "Survived" column).
passenger_survival = data.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
# Mean age per passenger class; without aggfunc, pivot_table uses the mean.
passenger_age = data.pivot_table(index="Pclass", values="Age")
print(passenger_age)
# Total fare and number of survivors per port of embarkation.
port_starts = data.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_starts)
# Handling missing values.
print(data.columns)
# Drop every column that contains at least one missing value.
drop_na_columns = data.dropna(axis=1)
print(drop_na_columns)
# Drop every row that contains at least one missing value.
drop_na_rows = data.dropna(axis=0)
print(drop_na_rows)
# Restrict the check to the listed columns: a row is dropped only when one
# of "Age" or "Sex" is missing.
drop_rows = data.dropna(axis=0, subset=["Age", "Sex"])
print(drop_rows)
# Look up a single cell by row label and column name.
# The row label is 83 so that it agrees with the variable name (the lookup
# previously read row 0).
row_index_83_age = data.loc[83, "Age"]
print(row_index_83_age)

# Sort by age in descending order and show the first ten rows.
sort_data = data.sort_values(by="Age", ascending=False)
print(sort_data.iloc[:10])
# Replace the shuffled index with a fresh 0..n-1 index; drop=True discards
# the old index instead of keeping it as a column.
data_reindex = sort_data.reset_index(drop=True)
print(data_reindex.iloc[:10])


# Helper for DataFrame.apply: apply() hands the function one column at a
# time, so applying it to a DataFrame yields the 100th row of the table.
def hundredth_row(column):
    """Return the 100th entry (position 99) of ``column``.

    The lookup is positional (``iloc``), so the result is the 100th entry
    regardless of the index labels, e.g. after sorting or filtering.

    Args:
        column: A pandas Series (or DataFrame, in which case the 100th row
            is returned).

    Returns:
        The value stored at position 99.

    Raises:
        IndexError: If ``column`` has fewer than 100 entries.
    """
    return column.iloc[99]


# Run hundredth_row on every column (axis=0) to collect one value per column.
hundredth_data = data.apply(hundredth_row, axis=0)
print(hundredth_data)


# Helper for DataFrame.apply: apply() hands the function one column at a
# time, so applying it to a DataFrame yields one count per column.
def not_null_count(column):
    """Return how many entries of ``column`` are not missing."""
    present = column[pd.notnull(column)]
    return len(present)


# Run not_null_count on every column (axis=0) to get the non-missing count
# of each column.
not_count_null = data.apply(not_null_count, axis=0)
print(not_count_null)

猜你喜欢

转载自blog.csdn.net/deephacking/article/details/82710534