# Python data analysis: Titanic "Survived" prediction

import pandas as pd
import matplotlib.pyplot as plt

# Font setup: per the original note, matplotlib needs an explicit font
# configured before Chinese text can be rendered in figure annotations.
from matplotlib.font_manager import FontProperties
# NOTE(review): the font path is hard-coded to a Windows location, and nothing
# in the visible script passes this object to a plotting call - confirm whether
# it is still needed.
titleYW_font_set = FontProperties(
    size=15,
    fname=r"c:\windows\fonts\Gabriola.ttf",
)

# Load the three CSV files from the current working directory.
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
gender_submission = pd.read_csv("gender_submission.csv")

# print(test.head())
# print(train.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only echoed an extra "None".
train.info()

# ----------------------------数据处理-----------------------------

# 数据可视化

# # -------------- Name handling (disabled) --------------
# train_test_data = [train]
# for dataset in train_test_data:
#     dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
# print(train['Title'].value_counts())
# # Count how often each name prefix (title) occurs
#
# title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2,
#                  "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,"Countess": 3,
#                  "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona" : 3, "Mme": 3,"Capt": 3,"Sir": 3 }
# for dataset in train_test_data:
#     dataset['Title'] = dataset['Title'].map(title_mapping)

# -------------- Pclass --------------
# Compare the number of non-survivors and survivors in each passenger class.
train_pclass_0 = train['Pclass'][train['Survived'] == 0].value_counts()
train_pclass_1 = train['Pclass'][train['Survived'] == 1].value_counts()
# value_counts() orders rows by frequency, not by class, so sort by the class
# value before plotting; otherwise the bar order depends on the data.
train_pclass_01 = pd.concat([train_pclass_0, train_pclass_1], axis=1).sort_index()
train_pclass_01.columns = ['Not_Surived', 'Survived']
train_pclass_01.plot(kind='bar', alpha=0.9)
# Derive the tick labels from the index actually plotted instead of assuming
# a fixed 1/2/3 order, so every bar group carries the right class label.
plt.xticks(
    list(range(len(train_pclass_01.index))),
    ['Pclass_{}'.format(pclass) for pclass in train_pclass_01.index],
    rotation=0,
)
plt.grid(linestyle="--", color="green", alpha=0.5)
plt.title('Survived_Rate in Pclass', size=20)

# -------------- Sex --------------
# Count passengers of each sex separately for non-survivors and survivors.
train_Sex_0 = train.loc[train['Survived'] == 0, 'Sex'].value_counts()
train_Sex_1 = train.loc[train['Survived'] == 1, 'Sex'].value_counts()
# Put the two count series side by side, one column per outcome.
train_Sex_01 = pd.concat(
    [train_Sex_0, train_Sex_1],
    axis=1,
    keys=['Not_Surived', 'Survived'],
)
# Grouped bar chart with horizontal tick labels and a dashed green grid.
train_Sex_01.plot.bar(alpha=0.9)
plt.title('Survived_Rate in Sex', size=20)
plt.grid(alpha=0.5, color="green", linestyle="--")
plt.xticks(rotation=0)

# -------------- Embarked --------------
# Count passengers per embarkation port, split by outcome.
train_Embarked_0 = train.loc[train['Survived'] == 0, 'Embarked'].value_counts()
train_Embarked_1 = train.loc[train['Survived'] == 1, 'Embarked'].value_counts()
# Put the two count series side by side, one column per outcome.
train_Embarked_01 = pd.concat(
    [train_Embarked_0, train_Embarked_1],
    axis=1,
    keys=['Not_Surived', 'Survived'],
)
# Grouped bar chart with horizontal tick labels and a dashed green grid.
train_Embarked_01.plot.bar(alpha=0.9)
plt.title('Survived_Rate in Embarked', size=20)
plt.grid(alpha=0.5, color="green", linestyle="--")
plt.xticks(rotation=0)

# Report how many values are missing in each column.
print(train.isnull().sum())

# Fill missing ages with the median age. The result is assigned back to the
# column: calling fillna(..., inplace=True) on train['Age'] is a chained
# in-place update that may act on a temporary copy and leave `train`
# unchanged under pandas Copy-on-Write.
train['Age'] = train['Age'].fillna(train['Age'].median())

# -------------- Age --------------
# Discretise age into six equal-width bins (pd.cut with an integer bin count)
# and compare the outcome counts in each bin.
train['Age_set'] = pd.cut(
    train['Age'],
    bins=6,
    labels=['child', 'Teenager', 'universe', 'Adults', 'elder', 'old man'],
)
# Count passengers per age bin, split by outcome.
train_Age_set_0 = train.loc[train['Survived'] == 0, 'Age_set'].value_counts()
train_Age_set_1 = train.loc[train['Survived'] == 1, 'Age_set'].value_counts()
# Put the two count series side by side, one column per outcome.
train_Age_set_01 = pd.concat(
    [train_Age_set_0, train_Age_set_1],
    axis=1,
    keys=['Not_Surived', 'Survived'],
)
# Grouped bar chart with horizontal tick labels and a dashed green grid.
train_Age_set_01.plot.bar(alpha=0.9)
plt.title('Survived_Rate in Age_Set', size=20)
plt.grid(alpha=0.5, color="green", linestyle="--")
plt.xticks(rotation=0)


# -------------- SibSp and Parch --------------
# Family size: Parch + SibSp plus one (presumably counting the passenger).
train['Family_N'] = train['Parch'] + train['SibSp'] + 1
# print(train[['Family_N', 'Survived']])
# Bucket the family size into the intervals (0, 1], (1, 2] and (2, 20];
# note that this replaces the numeric column with the interval category.
bins = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 20)])
train['Family_N'] = pd.cut(train['Family_N'], bins)
# Count passengers per family-size bucket, split by outcome.
train_Family_N_0 = train['Family_N'][train['Survived'] == 0].value_counts()
train_Family_N_1 = train['Family_N'][train['Survived'] == 1].value_counts()
# value_counts() orders rows by frequency; sort by interval so the rows are
# always (0, 1], (1, 2], (2, 20] and the fixed tick labels below line up with
# the bars regardless of the data.
train_Family_N_01 = pd.concat([train_Family_N_0, train_Family_N_1], axis=1).sort_index()
train_Family_N_01.columns = ['Not_Surived', 'Survived']
train_Family_N_01.plot(kind='bar', alpha=0.9)
# The last bucket (2, 20] includes a family size of exactly 3, hence
# "three_or_more" rather than "more_than_three".
plt.xticks([0, 1, 2], ['one', 'two', 'three_or_more'], rotation=0)
plt.grid(linestyle="--", color="green", alpha=0.5)
plt.title('Survived_Rate in Faminly_N', size=20)
# plt.show()
train.info()
# train.drop(['SibSp', 'Parch', 'Ticket'], axis=1, inplace=True)

# -------------- Cabin --------------
# For every known cabin, take the most frequent fare paid (the mode).
# Only rows missing Cabin or Fare are excluded; a bare dropna() would also
# discard rows whose only gap is in an unrelated column.
train_notna = train.dropna(subset=['Cabin', 'Fare'])
# Series.mode() can return several values; .iloc[0] keeps the first (smallest)
# one, so every cabin maps to a single scalar fare.
train_C_F = (
    train_notna.groupby('Cabin')['Fare']
    .agg(lambda fares: fares.mode().iloc[0])
    .to_frame()
)
print(train_C_F)

# Order the cabins by their modal fare. A stable sort keeps cabins with equal
# fares in a deterministic order.
train_C_F_sort = train_C_F.sort_values(by=['Fare'], kind='mergesort')
print(train_C_F_sort)

# Fill in the missing cabins.
# First locate the rows whose Cabin is empty.
train_bool = train['Cabin'].isnull()
# print(train_bool)
na_index = train.index[train_bool]

# Give each of those rows the first cabin (in fare order) whose modal fare is
# greater than or equal to the passenger's fare. searchsorted(side='left')
# returns exactly that position; a position equal to the table length means
# no cabin fare is high enough, and the value is left missing.
sorted_fares = train_C_F_sort['Fare'].to_numpy()
sorted_cabins = train_C_F_sort.index.to_numpy()
missing_fares = train.loc[na_index, 'Fare'].to_numpy()
positions = sorted_fares.searchsorted(missing_fares, side='left')
has_match = positions < len(sorted_fares)
# A single .loc assignment writes to `train` itself; chained indexing such as
# train['Cabin'][i] = ... may write to a temporary copy instead.
train.loc[na_index[has_match], 'Cabin'] = sorted_cabins[positions[has_match]]

# print(train['Cabin'])
# -----------------------------------------------------------------

# Show which columns exist at this point.
print(train.columns)

# Build the feature matrix by removing the target and the columns that are
# not used as model inputs; the target is the Survived column.
X_train = train.drop(
    columns=['Survived', 'PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket'],
)
Y_train = train['Survived']

print(X_train.columns)
# One-hot encode the categorical columns. dummy_na=True adds an extra
# indicator column per feature so that missing values are encoded as well.
X_train = pd.get_dummies(
    X_train,
    dummy_na=True,
    columns=['Pclass', 'Sex', 'Cabin', 'Embarked', 'Age_set', 'Family_N'],
)
print(X_train.columns)

# Split the data: hold out 20% of the rows for evaluation (fixed seed).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=123)

# Standardisation: learn the scaling rule from the training split only, then
# apply the same rule to both splits so the held-out rows are treated as
# unseen data.
from sklearn.preprocessing import StandardScaler
Standard = StandardScaler().fit(X_train)

Xtrain = Standard.transform(X_train)  # apply the rule to the training split
Xtest = Standard.transform(X_test)  # apply the rule to the held-out split

# Classification. The model is fitted on the scaled training matrix (Xtrain)
# because prediction below uses the scaled held-out matrix (Xtest); fitting on
# the unscaled X_train would evaluate the model on inputs in a different scale
# from the ones it was trained on.
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier().fit(Xtrain, y_train)
y_pred = clf.predict(Xtest)

# Evaluate the classifier on the held-out split.
from sklearn.metrics import classification_report, auc
print(classification_report(y_test, y_pred))


# Plot the ROC curve.
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = 'SimHei'  # switch font for the Chinese axis labels
# A ROC curve is traced by sweeping a threshold over continuous scores. Hard
# 0/1 predictions give only one operating point, so the predicted probability
# of the positive class (column 1 of predict_proba) is used as the score.
y_score = clf.predict_proba(Xtest)[:, 1]
# Compute the x axis (false positive rate) and y axis (true positive rate).
fpr, tpr, thresholds = roc_curve(y_test, y_score)
print(auc(fpr, tpr))
plt.figure(figsize=(10, 6))
plt.xlim(0, 1)  # x-axis range
plt.ylim(0.0, 1.1)  # y-axis range
plt.xlabel('假正率')
plt.ylabel('真正率')
plt.plot(fpr, tpr, linewidth=2, linestyle="-", color='red')
plt.title('Line Roc of X_train by GradientBoostingClassifier()', size=20)
plt.show()



# (Scraped page footer) You may also like

# Reposted from www.cnblogs.com/hirokuh/p/9335218.html