# housing_price_code_kaggle

# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
from sklearn.base import BaseEstimator,TransformerMixin,RegressorMixin,clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler,StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline,make_pipeline
from scipy.stats import skew
from sklearn.decomposition import PCA,KernelPCA
from scipy.stats import skew
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score,GridSearchCV,KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.svm import SVR,LinearSVR
from sklearn.linear_model import ElasticNet,SGDRegressor,BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor

# ================= pandas display options =================
# Widen the console output so wide frames are printed without truncation.
pd.set_option('max_colwidth',200)
pd.set_option('display.width',200)
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',1000)

# ================= load data / first look =================
train=pd.read_csv('./data/train.csv')
test=pd.read_csv('./data/test.csv')
plt.figure(figsize=(15,8))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (older releases only emitted a FutureWarning).
sns.boxplot(x=train.YearBuilt, y=train.SalePrice)

# ================= GrLivArea vs SalePrice =================
plt.figure(figsize=(10,8))
plt.scatter(x=train.GrLivArea,y=train.SalePrice)
plt.xlabel('GrLivArea',fontsize=13)
plt.ylabel('SalePrice',fontsize=13)

# ================= outliers / combined frame ==============
# Remove training rows with a very large living area but a low price.
# The index is NOT reset, so `train` keeps gaps in its index afterwards.
train.drop(train[(train.GrLivArea>4000)& (train.SalePrice<300000)].index,inplace=True)
# Stack train on top of test (fresh 0..n-1 index) so that cleaning and
# feature engineering are applied to both at once.
full=pd.concat([train,test],ignore_index=True)
full.drop(['Id'],axis=1,inplace=True)
print(full.shape)


# ======================= Data Clean ===========================
# Columns that still contain missing values, fewest first.
aa=full.isnull().sum()
print(aa[aa>0].sort_values(ascending=True))
print(full.groupby(['Neighborhood'])[['LotFrontage']].agg(['mean','median','count']))
# Bucket LotArea into 10 quantile bins; every row gets the bin its LotArea falls in.
full['LotAreaCut']=pd.qcut(full.LotArea,10)
print(full.groupby(['LotAreaCut'])[['LotFrontage']].agg(['mean','median','count']))

# Impute LotFrontage with the median of its (LotAreaCut, Neighborhood) group,
# then fall back to the LotAreaCut median for anything still missing.
full['LotFrontage']=full.groupby(['LotAreaCut','Neighborhood'])['LotFrontage'].transform(lambda x:x.fillna(x.median()))
full['LotFrontage']=full.groupby(['LotAreaCut'])['LotFrontage'].transform(lambda x:x.fillna(x.median()))

# Then we fill in the other missing values according to data_description.
# The filled column is assigned back explicitly: `full[col].fillna(..., inplace=True)`
# works on a temporary Series, which newer pandas versions warn about and which
# leaves `full` unchanged when Copy-on-Write is enabled.
cols=["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
for col in cols:
    full[col]=full[col].fillna(0)
cols1 = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish", "GarageYrBlt", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
for col in cols1:
    full[col]=full[col].fillna("None")

# Remaining columns: fill with the most frequent value of the column.
cols2 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual", "SaleType","Exterior1st", "Exterior2nd"]
for col in cols2:
    full[col]=full[col].fillna(full[col].mode()[0])

# Report whatever is still missing (count the nulls once, not twice).
still_missing=full.isnull().sum()
print(still_missing[still_missing>0])

"""===========================feature engineering================================"""
# Cast these numerically-stored columns to strings so they are handled as
# categories downstream: map_values() looks MSSubClass up by string key, and
# skew_dummies only log-transforms non-object columns and one-hot encodes the rest.
NumStr = ["MSSubClass","BsmtFullBath","BsmtHalfBath","HalfBath","BedroomAbvGr","KitchenAbvGr","MoSold","YrSold","YearBuilt","YearRemodAdd","LowQualFinSF","GarageYrBlt"]
for col in NumStr:
    full[col]=full[col].astype(str)

# NOTE: the result of this aggregation is neither stored nor printed, so it has
# no visible effect when the file runs as a script (looks like a notebook leftover).
full.groupby(['MSSubClass'])[['SalePrice']].agg(['mean','median','count'])

def map_values():
    """Add ordinal-encoded ``o<Column>`` features to the module-level ``full`` frame.

    For every ``(column, codes)`` pair in the table below, ``full[column]`` is
    mapped through ``codes`` and stored as ``full["o" + column]``. Categories
    that are missing from a table come out as NaN (``Series.map`` semantics).

    Returns:
        The string ``"Done!"``.
    """
    ordinal_tables = (
        ("MSSubClass", {'180':1,
                        '30':2, '45':2,
                        '190':3, '50':3, '90':3,
                        '85':4, '40':4, '160':4,
                        '70':5, '20':5, '75':5, '80':5, '150':5,
                        '120': 6, '60':6}),
        ("MSZoning", {'C (all)':1, 'RH':2, 'RM':2, 'RL':3, 'FV':4}),
        ("Neighborhood", {'MeadowV':1,
                          'IDOTRR':2, 'BrDale':2,
                          'OldTown':3, 'Edwards':3, 'BrkSide':3,
                          'Sawyer':4, 'Blueste':4, 'SWISU':4, 'NAmes':4,
                          'NPkVill':5, 'Mitchel':5,
                          'SawyerW':6, 'Gilbert':6, 'NWAmes':6,
                          'Blmngtn':7, 'CollgCr':7, 'ClearCr':7, 'Crawfor':7,
                          'Veenker':8, 'Somerst':8, 'Timber':8,
                          'StoneBr':9,
                          'NoRidge':10, 'NridgHt':10}),
        ("Condition1", {'Artery':1,
                        'Feedr':2, 'RRAe':2,
                        'Norm':3, 'RRAn':3,
                        'PosN':4, 'RRNe':4,
                        'PosA':5 ,'RRNn':5}),
        ("BldgType", {'2fmCon':1, 'Duplex':1, 'Twnhs':1, '1Fam':2, 'TwnhsE':2}),
        ("HouseStyle", {'1.5Unf':1,
                        '1.5Fin':2, '2.5Unf':2, 'SFoyer':2,
                        '1Story':3, 'SLvl':3,
                        '2Story':4, '2.5Fin':4}),
        ("Exterior1st", {'BrkComm':1,
                         'AsphShn':2, 'CBlock':2, 'AsbShng':2,
                         'WdShing':3, 'Wd Sdng':3, 'MetalSd':3, 'Stucco':3, 'HdBoard':3,
                         'BrkFace':4, 'Plywood':4,
                         'VinylSd':5,
                         'CemntBd':6,
                         'Stone':7, 'ImStucc':7}),
        ("MasVnrType", {'BrkCmn':1, 'None':1, 'BrkFace':2, 'Stone':3}),
        ("ExterQual", {'Fa':1, 'TA':2, 'Gd':3, 'Ex':4}),
        ("Foundation", {'Slab':1,
                        'BrkTil':2, 'CBlock':2, 'Stone':2,
                        'Wood':3, 'PConc':4}),
        ("BsmtQual", {'Fa':2, 'None':1, 'TA':3, 'Gd':4, 'Ex':5}),
        ("BsmtExposure", {'None':1, 'No':2, 'Av':3, 'Mn':3, 'Gd':4}),
        ("Heating", {'Floor':1, 'Grav':1, 'Wall':2, 'OthW':3, 'GasW':4, 'GasA':5}),
        ("HeatingQC", {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}),
        ("KitchenQual", {'Fa':1, 'TA':2, 'Gd':3, 'Ex':4}),
        ("Functional", {'Maj2':1, 'Maj1':2, 'Min1':2, 'Min2':2, 'Mod':2, 'Sev':2, 'Typ':3}),
        ("FireplaceQu", {'None':1, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}),
        ("GarageType", {'CarPort':1, 'None':1,
                        'Detchd':2,
                        '2Types':3, 'Basment':3,
                        'Attchd':4, 'BuiltIn':5}),
        ("GarageFinish", {'None':1, 'Unf':2, 'RFn':3, 'Fin':4}),
        ("PavedDrive", {'N':1, 'P':2, 'Y':3}),
        ("SaleType", {'COD':1, 'ConLD':1, 'ConLI':1, 'ConLw':1, 'Oth':1, 'WD':1,
                      'CWD':2, 'Con':3, 'New':3}),
        ("SaleCondition", {'AdjLand':1, 'Abnorml':2, 'Alloca':2, 'Family':2, 'Normal':3, 'Partial':4}),
    )

    # New columns are appended in table order.
    for source_col, codes in ordinal_tables:
        full["o" + source_col] = full[source_col].map(codes)

    return "Done!"

# Add the ordinal "o*" columns to `full` (the returned status string is ignored).
map_values()

# drop two unwanted columns:
# - LotAreaCut was only a helper bin used while imputing LotFrontage;
# - SalePrice is the target; it is read from `train` later, not from `full`.
full.drop("LotAreaCut",axis=1,inplace=True)
full.drop(['SalePrice'],axis=1,inplace=True)

"""====================Pipeline=============================="""
class labelenc(BaseEstimator,TransformerMixin):
    """Replace the three year columns by integer codes from ``LabelEncoder``.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    re-fits an encoder on whatever frame it is given. The input frame is
    modified in place and returned.
    """

    def __init__(self):
        """No hyper-parameters."""

    def fit(self,X,y=None):
        """Nothing to learn; return ``self`` for pipeline compatibility."""
        return self

    def transform(self,X):
        """Encode YearBuilt, YearRemodAdd and GarageYrBlt in ``X`` and return ``X``."""
        for year_col in ('YearBuilt','YearRemodAdd','GarageYrBlt'):
            # A fresh fit per column, exactly like re-fitting one shared encoder.
            X[year_col]=LabelEncoder().fit_transform(X[year_col])
        return X


#=====================Apply log1p to the skewed features, then get_dummies.=============
class skew_dummies(BaseEstimator,TransformerMixin):
    """Apply ``log1p`` to skewed non-object columns, then one-hot encode.

    Parameters:
        skew: columns whose absolute skewness is at least this value are
            log-transformed.
    """

    def __init__(self,skew=0.5):
        self.skew=skew

    def fit(self,X,y=None):
        """Nothing to learn; return ``self`` for pipeline compatibility."""
        return self

    def transform(self,X):
        """Return the dummy-encoded frame; the log1p step also modifies ``X`` in place."""
        non_object=X.select_dtypes(exclude=["object"])
        # `skew` here is the module-level scipy.stats function, not self.skew.
        per_column=non_object.apply(skew)
        to_log=per_column[per_column.abs()>=self.skew].index
        X[to_log]=np.log1p(X[to_log])
        return pd.get_dummies(X)

# First pass: encode years, log-transform skewed columns and one-hot encode.
# The pipeline runs on a copy because both transformers write into their
# input frame; `full` itself is fed to a second pipeline further down.
pipe=Pipeline([('labenc',labelenc()),('skew_dummies',skew_dummies(skew=1)),])
full2=full.copy()
data_pipe=pipe.fit_transform(full2)
print(data_pipe.shape)
print(data_pipe.head())

# Use RobustScaler since there may be other outliers left in the data.
scaler=RobustScaler()
# `full` was built as train followed by test, so the first n_train rows are
# the training rows and the rest are the test rows.
n_train=train.shape[0]
X=data_pipe[:n_train]
test_X=data_pipe[n_train:]
y=train.SalePrice

# The scaler is fitted on the training rows only and then applied to both parts.
X_scaled=scaler.fit(X).transform(X)
# Models are trained on the log of the sale price.
y_log=np.log(train.SalePrice)
test_X_scaled=scaler.transform(test_X)

"""=================feature selection======================================="""
# Fit a Lasso and inspect its coefficients as a rough feature-importance ranking.
lasso=Lasso(alpha=0.001)
lasso.fit(X_scaled,y_log)
FI_lasso=pd.DataFrame({"Feature Importance":lasso.coef_},index=data_pipe.columns)
print(FI_lasso.sort_values(['Feature Importance'],ascending=False))
# Plot only the features whose coefficient is non-zero.
FI_lasso[FI_lasso["Feature Importance"]!=0].sort_values("Feature Importance").plot(kind='barh',figsize=(15,25))
plt.xticks(rotation=90)
#plt.show()

"""=============================add features================================================"""
class add_feature(BaseEstimator, TransformerMixin):
    """Append engineered area and interaction columns to the frame.

    Parameters:
        additional: ``1`` adds only ``TotalHouse`` and ``TotalArea``; any other
            value additionally adds the interaction and aggregate columns below.

    ``transform`` writes the new columns into ``X`` itself and returns ``X``.
    """

    def __init__(self,additional=1):
        self.additional = additional

    def fit(self,X,y=None):
        """Nothing to learn; return ``self`` for pipeline compatibility."""
        return self

    def transform(self,X):
        """Add the engineered columns to ``X`` and return it.

        Fix: the frame is now returned for every value of ``additional``.
        Previously the ``return`` sat inside the ``else`` branch, so
        ``additional == 1`` returned ``None`` and broke any following
        pipeline step.
        """
        # Shared by both modes (column order is the same as before).
        X["TotalHouse"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"]
        X["TotalArea"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"]

        if self.additional!=1:
            # NOTE(review): some "interaction" columns below are sums and some
            # are products; the operators are kept exactly as in the original.
            X["+_TotalHouse_OverallQual"] = X["TotalHouse"] * X["OverallQual"]
            X["+_GrLivArea_OverallQual"] = X["GrLivArea"] * X["OverallQual"]
            X["+_oMSZoning_TotalHouse"] = X["oMSZoning"] * X["TotalHouse"]
            X["+_oMSZoning_OverallQual"] = X["oMSZoning"] + X["OverallQual"]
            X["+_oMSZoning_YearBuilt"] = X["oMSZoning"] + X["YearBuilt"]
            X["+_oNeighborhood_TotalHouse"] = X["oNeighborhood"] * X["TotalHouse"]
            X["+_oNeighborhood_OverallQual"] = X["oNeighborhood"] + X["OverallQual"]
            X["+_oNeighborhood_YearBuilt"] = X["oNeighborhood"] + X["YearBuilt"]
            X["+_BsmtFinSF1_OverallQual"] = X["BsmtFinSF1"] * X["OverallQual"]

            X["-_oFunctional_TotalHouse"] = X["oFunctional"] * X["TotalHouse"]
            X["-_oFunctional_OverallQual"] = X["oFunctional"] + X["OverallQual"]
            X["-_LotArea_OverallQual"] = X["LotArea"] * X["OverallQual"]
            X["-_TotalHouse_LotArea"] = X["TotalHouse"] + X["LotArea"]
            X["-_oCondition1_TotalHouse"] = X["oCondition1"] * X["TotalHouse"]
            X["-_oCondition1_OverallQual"] = X["oCondition1"] + X["OverallQual"]

            # Aggregates over basement, rooms and porch areas.
            X["Bsmt"] = X["BsmtFinSF1"] + X["BsmtFinSF2"] + X["BsmtUnfSF"]
            X["Rooms"] = X["FullBath"]+X["TotRmsAbvGrd"]
            X["PorchArea"] = X["OpenPorchSF"]+X["EnclosedPorch"]+X["3SsnPorch"]+X["ScreenPorch"]
            X["TotalPlace"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"] + X["OpenPorchSF"]+X["EnclosedPorch"]+X["3SsnPorch"]+X["ScreenPorch"]

        return X

# By using a pipeline, you can quickly experiment with different feature combinations.
pipe=Pipeline([
    ('labenc',labelenc()),
    ('add_feature',add_feature(additional=2)),
    ('skew_dummies',skew_dummies(skew=1)),])


# =================================== PCA ========================================
# Per the original author, PCA is not used to reduce dimensionality here; it
# only applies a linear transformation to the data.
full_pipe=pipe.fit_transform(full)
print(full_pipe.shape)

# The first n_train rows of the combined frame are the training rows.
n_train=train.shape[0]
X=full_pipe[:n_train]
test_X=full_pipe[n_train:]
y=train.SalePrice
X_scaled=scaler.fit(X).transform(X)
y_log=np.log(train.SalePrice)
test_X_scaled=scaler.transform(test_X)
pca=PCA(n_components=410)
X_scaled=pca.fit_transform(X_scaled)
# Fix: project the *scaled* test matrix. The original passed the unscaled
# `test_X`, so test rows went through a different transformation than the
# training rows the PCA (and every later model) was fitted on.
test_X_scaled=pca.transform(test_X_scaled)
print(X_scaled.shape,test_X_scaled.shape)

"""======================Modeling&Evaluation=================="""
def rmse_cv(model,X,y):
    """Return the per-fold RMSE of ``model`` on ``(X, y)`` under 5-fold cross-validation.

    ``cross_val_score`` reports the negated mean squared error, so the sign is
    flipped before the square root is taken. The result holds one value per fold.
    """
    neg_mse_per_fold=cross_val_score(model,X,y,scoring='neg_mean_squared_error',cv=5)
    return np.sqrt(-neg_mse_per_fold)

# Models compared: LinearRegression, Ridge, Lasso, Random Forest, Gradient Boosting Tree,
# Support Vector Regression, Linear Support Vector Regression, ElasticNet,
# Stochastic Gradient Descent, BayesianRidge, KernelRidge, ExtraTreesRegressor, XGBoost.
models = [LinearRegression(),Ridge(),Lasso(alpha=0.01,max_iter=10000),RandomForestRegressor(),GradientBoostingRegressor(),SVR(),LinearSVR(),
          ElasticNet(alpha=0.001,max_iter=10000),SGDRegressor(max_iter=1000,tol=1e-3),BayesianRidge(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
          ExtraTreesRegressor(),XGBRegressor()]

# Short labels, in the same order as `models`.
names=["LR","Ridge","Lasso","RF","GBR","SVR","LinSVR","Ela","SGD","Bay","Ker","Extra","Xgb"]
# Print mean and standard deviation of the 5-fold RMSE (on log prices) per model.
for name,model in zip(names,models):
    score=rmse_cv(model,X_scaled,y_log)
    print("{}: {:.6f},{:.4f}".format(name,score.mean(),score.std()))


#Next we do some hyperparameters tuning. First define a gridsearch method
class grid():
    """Small wrapper that grid-searches one estimator and prints the results as RMSE."""

    def __init__(self,model):
        self.model=model

    def grid_get(self,X,y,param_grid):
        """Run a 5-fold grid search over ``param_grid`` and print a summary.

        Prints the best parameters with their RMSE, followed by a table of
        ``params`` / ``mean_test_score`` / ``std_test_score`` for all candidates.
        ``mean_test_score`` is overwritten with its RMSE equivalent;
        ``std_test_score`` is left as reported by the search. Returns nothing.
        """
        searcher=GridSearchCV(self.model,param_grid,cv=5,scoring='neg_mean_squared_error')
        searcher.fit(X,y)

        # Scores are negated MSE values: flip the sign before the square root.
        best_rmse=np.sqrt(-searcher.best_score_)
        print(searcher.best_params_,best_rmse)

        results=searcher.cv_results_
        results['mean_test_score']=np.sqrt(-results['mean_test_score'])
        summary=pd.DataFrame(results)
        print(summary[['params','mean_test_score','std_test_score']])

# Each call prints the best parameters and a per-candidate RMSE table.
#Lasso
grid(Lasso()).grid_get(X_scaled,y_log,{'alpha':[0.0004,0.0005,0.0007,0.0006,0.0009,0.0008],'max_iter':[10000]})

#Ridge
grid(Ridge()).grid_get(X_scaled,y_log,{'alpha':[35,40,45,50,55,60,65,70,80,90]})

#SVR
grid(SVR()).grid_get(X_scaled,y_log,{'C':[11,12,13,14,15],'kernel':['rbf'],'gamma':[0.0003,0.0004],'epsilon':[0.008,0.009]})

#Kernel Ridge
param_grid={'alpha':[0.2,0.3,0.4,0.5],'kernel':['polynomial'],'degree':[3],'coef0':[0.8,1,1.2]}
grid(KernelRidge()).grid_get(X_scaled,y_log,param_grid)

#ElasticNet
# NOTE(review): 0.0005 appears twice in the alpha list, so that candidate is evaluated twice.
grid(ElasticNet()).grid_get(X_scaled,y_log,{'alpha':[0.0005,0.0008,0.004,0.0005],'l1_ratio':[0.08,0.1,0.3,0.5,0.7],'max_iter':[10000]})

"""===========Ensemble Methods========================"""
class AverageWeight(BaseEstimator,RegressorMixin):
    """Regressor that predicts a weighted sum of several base regressors.

    Parameters:
        mod: list of unfitted estimators; each one is cloned before fitting.
        weight: one weight per estimator, in the same order as ``mod``.
    """

    def __init__(self,mod,weight):
        self.mod=mod
        self.weight=weight

    def fit(self,X,y):
        """Fit a fresh clone of every base estimator on ``(X, y)``."""
        self.models_=list(map(clone,self.mod))
        for fitted in self.models_:
            fitted.fit(X,y)
        return self

    def predict(self,X):
        """Return a list with one weighted-sum prediction per sample."""
        # One row per base model, one column per sample.
        per_model=np.array([est.predict(X) for est in self.models_])
        blended=[]
        for sample in range(per_model.shape[1]):
            # Pair each model's prediction for this sample with its weight.
            contributions=[value*coef for value,coef in zip(per_model[:,sample],self.weight)]
            blended.append(np.sum(contributions))
        return blended

# Base models with their tuned hyper-parameters.
lasso=Lasso(alpha=0.0005,max_iter=10000)
ridge=Ridge(alpha=60)
svr=SVR(gamma=0.0004,kernel='rbf',C=13,epsilon=0.009)
ker=KernelRidge(alpha=0.2,kernel='polynomial',degree=3,coef0=0.8)
# NOTE(review): alpha=0.005 is not one of the values in the ElasticNet grid
# above (0.0005, 0.0008, 0.004) - confirm it is intended.
ela=ElasticNet(alpha=0.005,l1_ratio=0.08,max_iter=10000)
bay=BayesianRidge()
#assign weights based on their gridsearch score
w1=0.02
w2=0.2
w3=0.25
w4=0.3
w5=0.03
w6=0.2
weight_avg=AverageWeight(mod=[lasso,ridge,svr,ker,ela,bay],weight=[w1,w2,w3,w4,w5,w6])
# Cross-validate once and print the result. The original evaluated the score
# twice in a bare expression, which shows nothing when run as a script.
avg_scores=rmse_cv(weight_avg,X_scaled,y_log)
print(avg_scores,avg_scores.mean())

#But if we average only two best models, we gain better cross-validation score
weight_avg=AverageWeight(mod=[svr,ker],weight=[0.5,0.5])
avg_scores=rmse_cv(weight_avg,X_scaled,y_log)
print(avg_scores,avg_scores.mean())

"""========================================stacking=========================="""
#_Aside from normal stacking, I also add the "get_oof" method, because later I'll combine features generated from stacking and original features
class stacking(BaseEstimator,RegressorMixin,TransformerMixin):
    """Two-level stacked regressor built from out-of-fold predictions.

    Parameters:
        mod: list of base estimators; a fresh clone is fitted for every fold.
        meta_model: estimator fitted on the out-of-fold predictions of ``mod``.

    The fold splitter ``kf`` is a fixed, seeded, shuffled 5-fold ``KFold``.
    """

    def __init__(self,mod,meta_model):
        self.mod=mod
        self.meta_model=meta_model
        self.kf=KFold(n_splits=5,random_state=42,shuffle=True)

    @staticmethod
    def _positional(data):
        """Return pandas objects as ndarrays so fold indices select rows by position."""
        if isinstance(data,(pd.Series,pd.DataFrame)):
            return data.values
        return data

    def fit(self,X,y):
        """Fit every base model on each training fold, then the meta model.

        ``saved_model[i]`` keeps the fitted per-fold clones of ``mod[i]``; the
        meta model is trained on the matrix of out-of-fold predictions.
        """
        # KFold yields positions; a pandas Series with a non-contiguous index
        # would otherwise be indexed by label in y[train_index].
        X=self._positional(X)
        y=self._positional(y)
        self.saved_model=[list() for i in self.mod]
        oof_train=np.zeros((X.shape[0],len(self.mod)))
        for i,model in enumerate(self.mod):
            for train_index,val_index in self.kf.split(X,y):
                renew_model=clone(model)
                renew_model.fit(X[train_index],y[train_index])
                self.saved_model[i].append(renew_model)
                oof_train[val_index,i]=renew_model.predict(X[val_index])

        self.meta_model.fit(oof_train,y)
        return self

    def predict(self,X):
        """Average each base model's fold predictions and feed them to the meta model."""
        # column_stack needs a real sequence: passing a generator is deprecated
        # in NumPy and rejected by newer releases.
        whole_test=np.column_stack([
            np.column_stack([model.predict(X) for model in single_model]).mean(axis=1)
            for single_model in self.saved_model])
        return self.meta_model.predict(whole_test)

    def get_oof(self,X,y,test_X):
        """Return ``(oof, test_mean)`` stacking features for train and test.

        ``oof`` has shape ``(len(X), len(mod))`` and holds the out-of-fold
        prediction of every base model; ``test_mean`` has shape
        ``(len(test_X), len(mod))`` and holds each base model's test
        prediction averaged over the folds.

        Fix: the ``return`` used to sit inside the model loop, so only the
        first base model's column was filled and all others stayed zero.
        """
        X=self._positional(X)
        y=self._positional(y)
        oof=np.zeros((X.shape[0],len(self.mod)))
        # One column per fold (was hard-coded to 5).
        test_single=np.zeros((test_X.shape[0],self.kf.get_n_splits()))
        test_mean=np.zeros((test_X.shape[0],len(self.mod)))
        for i,model in enumerate(self.mod):
            for j,(train_index,val_index) in enumerate(self.kf.split(X,y)):
                clone_model=clone(model)
                clone_model.fit(X[train_index],y[train_index])
                oof[val_index,i]=clone_model.predict(X[val_index])
                test_single[:,j]=clone_model.predict(test_X)
            test_mean[:,i]=test_single.mean(axis=1)
        return oof,test_mean

# stacking selects fold rows with X[train_index] / y[train_index], i.e. by
# position, so it needs plain ndarrays. y_log is a pandas Series whose index
# has gaps (outlier rows were dropped from `train` without resetting the
# index); that is presumably why passing it directly did not work for the
# original author. Converting to arrays is all the former Imputer calls
# achieved here, and it avoids the `Imputer` class, which was removed in
# scikit-learn 0.22. Neither input can contain NaN at this point, since the
# estimators fitted on them earlier would have rejected it.
a=np.asarray(X_scaled)
b=y_log.values
stack_model=stacking(mod=[lasso,ridge,svr,ker,ela,bay],meta_model=ker)
# Each cross-validation of the stack is expensive: run it once and reuse the
# scores instead of recomputing them for the second print.
stack_scores=rmse_cv(stack_model,a,b)
print(stack_scores)
print(stack_scores.mean())
#Next we extract the features generated from stacking, then combine them with original features.
X_train_stack,X_test_stack=stack_model.get_oof(a,b,test_X_scaled)
print(X_train_stack.shape,a.shape)
X_train_add=np.hstack((a,X_train_stack))
X_test_add=np.hstack((test_X_scaled,X_test_stack))
print(X_train_add.shape,X_test_add.shape)
stack_scores_add=rmse_cv(stack_model,X_train_add,b)
print(stack_scores_add)
print(stack_scores_add.mean())

#You can even do parameter tuning for your meta model after you get "X_train_stack", or do it after combining with the original features. but that's a lot of work too !

# ================ final model ======================
stack_model=stacking(mod=[lasso,ridge,svr,ker,ela,bay],meta_model=ker)
stack_model.fit(a,b)
# The models were trained on log prices, so exponentiate to get prices back.
pred=np.exp(stack_model.predict(test_X_scaled))
# Fix: name the column "SalePrice" - the same spelling as the target column in
# the training data - instead of "salePrice".
result=pd.DataFrame({'Id':test.Id,"SalePrice":pred})
result.to_csv('submission.csv',index=False)

# The results are output correctly!
#
# (web-page residue: "You may also like")
#
# Reposted from blog.csdn.net/ningyanggege/article/details/82690766