1. Download the complete Kaggle Titanic dataset (kindly shared by its original uploader):
https://download.csdn.net/download/lansui7312/9936840
2. Missing value handling
# -*- coding: utf-8 -*-
# Missing-value handling walkthrough for the Kaggle Titanic training set.
#
# The script loads train.csv, counts passengers and survivors per sex, then
# fills the gaps in three columns using three different strategies:
#   * Embarked -> most frequent value (mode)
#   * Cabin    -> a dedicated placeholder category, 'U0'
#   * Age      -> prediction from a random-forest regressor
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

df = pd.read_csv('train.csv', header=0)

# Column notes:
#   SibSp    - number of siblings / spouses aboard
#   Parch    - number of parents / children aboard
#   Cabin    - cabin number
#   Embarked - port of embarkation
#
# Handy exploration calls:
#   df.head(20)
#   df.info()
#   print(df.describe())
#   print(df.columns[1])

# --- Survival rate by sex ---------------------------------------------------
is_male = df.Sex == 'male'
is_female = df.Sex == 'female'
survived = df.Survived == 1

# x: passenger counts [male, female]; y: survivor counts [male, female].
x = [int(is_male.sum()), int(is_female.sum())]
y = [int((is_male & survived).sum()), int((is_female & survived).sum())]
# Survival rate in percent (female, male):
#   print(y[1] * 100 / x[1], y[0] * 100 / x[0])

# --- Missing-value handling -------------------------------------------------
# 1. Deleting the rows that contain missing values is an option; it is not
#    used here.

# 2. The port of embarkation is not an important feature, so fill it with the
#    mode. Assign the result back to the column instead of using chained
#    indexing (df.Embarked[mask] = ...), which may write to a temporary copy
#    and leave df unchanged.
embarked_modes = df['Embarked'].mode()  # mode() ignores NaN by default
if not embarked_modes.empty:
    # Use the first mode so a tie between ports cannot cause a
    # length-mismatch error on assignment.
    df['Embarked'] = df['Embarked'].fillna(embarked_modes.iloc[0])
# print(df.Embarked)

# 3. Nominal attribute: give missing entries their own value, because the
#    absence itself may carry information (a missing Cabin may mean the
#    passenger had no cabin).
df['Cabin'] = df['Cabin'].fillna('U0')
# print(df.Cabin)

# 4. Predict the missing values with a model (regression, random forest, ...).
#    Age is a very important feature, so its imputed values should be
#    reasonably accurate.
age_df = df[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_missing = df.Age.isnull()
age_df_notnull = age_df.loc[~age_missing]
age_df_isnull = age_df.loc[age_missing]

# NumPy slicing: arr[:, 0] takes column 0 of every row, arr[:, 1:] takes all
# remaining columns. Column 0 is Age (the target); the rest are the features.
X = age_df_notnull.values[:, 1:]
Y = age_df_notnull.values[:, 0]
# print(X)

# No random_state is set, so the imputed ages differ slightly between runs.
rfr = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
rfr.fit(X, Y)

if age_missing.any():
    predictAges = rfr.predict(age_df_isnull.values[:, 1:])
    df.loc[age_missing, 'Age'] = predictAges
else:
    # Nothing to impute; predict() would reject a zero-row input.
    predictAges = np.array([])

print(df.describe())
3.