基于Kaggle的经典AI项目:预测房价系统

预测房价系统

Kaggle 项目链接:
    https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
项目步骤:

这里写图片描述

完整代码

一、数据理解–整体探索

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from scipy.stats import norm
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import skew
from scipy.stats.stats import pearsonr

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

 # 0. Data acquisition
 # Download: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
# pd.read_csv() needs a path argument; use the same relative location that the
# data-cleaning section of this write-up reads from.
train = pd.read_csv('../07_数据/train.csv')
test = pd.read_csv('../07_数据/test.csv')
A 数据整体理解
train.info()
B 数据探索
B1 因变量-房价
train['SalePrice'].describe()                # summary statistics of the target
# `fit` is a keyword argument of distplot; assigning it on the returned axes
# (as the original did) never draws the fitted normal curve.
sns.distplot(train['SalePrice'], fit=norm)   # distribution plot with a normal fit
B2 探索—数值型自变量

2.2.1 相关系数矩阵
DataFrame.corr() 给出两列之间的关系。利用该方法检测特征–目标变量之间的相关性

# Correlation matrix of the numeric columns. .iloc[:, 1:] skips the first
# numeric column (presumably Id — confirm against the data).
corr = train.select_dtypes(include=[np.number]).iloc[:, 1:].corr()
plt.figure(figsize=(12, 12))             # the keyword is `figsize`, not `figure`
sns.heatmap(corr, vmax=1, square=True)   # heat map of all pairwise correlations

2.2.2 与saleprice 高相关变量,相关系数矩阵

corr_threshvalue = 0.5  # keep features whose |correlation| with SalePrice exceeds this
# Names of the highly correlated columns, ordered by their correlation with
# SalePrice. This must be an Index of names because it is used to select
# columns of `train` below.
saleprice_corr = corr.loc['SalePrice', :]
corr_cols = saleprice_corr[saleprice_corr.abs() > corr_threshvalue].sort_values(ascending=False).index
corr_thresh = train[corr_cols].corr()
plt.figure(figsize=(12, 12))
sns.set(font_scale=1.25)
sns.heatmap(corr_thresh, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10})

2.2.3 与 SalePrice 高相关变量-散点图

# Pairwise scatter plots of the columns selected in corr_cols.
# NOTE(review): newer seaborn releases renamed pairplot's `size` argument to
# `height` — confirm which keyword the installed version expects.
sns.pairplot(train[corr_cols],size = 2.5)
B3 探索–分类型自变量

DataFrame.corr() 给出两列之间的关系。检测 特征–目标变量之间的相关性

# The `np.object` alias was removed from NumPy; the builtin `object` selects
# the same string-typed columns on every version.
# NOTE(review): .iloc[:, 1:] discards the first categorical column — confirm this is intended.
train_cats = train.select_dtypes(include=[object]).iloc[:, 1:]
train_cats.head()
train_cats.apply(lambda x: x.nunique())   # number of distinct levels per column

2.3.1 方差分析
一元方差分析(类型变量)

def anova(train_cats_y,categorical,y):
    """Run a one-way ANOVA of ``y`` against each categorical column.

    Parameters
    ----------
    train_cats_y : pandas.DataFrame
        Frame holding the categorical columns and the target column.
    categorical : list of str
        Names of the categorical columns to test.
    y : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with columns ``feature`` and ``pval``,
        sorted by ascending p-value.
    """
    anv = pd.DataFrame(index = categorical)
    anv['feature'] = categorical
    pvals = []
    for c in categorical:
        # One array of target values per observed level of the feature
        # (rows where the feature is missing are left out).
        samples = [train_cats_y.loc[train_cats_y[c] == cls, y].values
                   for cls in train_cats_y[c].dropna().unique()]
        # f_oneway returns (F statistic, p-value); only the p-value is kept.
        pvals.append(stats.f_oneway(*samples)[1])
    anv['pval'] = pvals
    # Callers index the result by column name, so return the frame itself;
    # `anv.values['pval']` indexed a NumPy array with a string and raised.
    return anv.sort_values('pval')

# Categorical variables: every column stored with the object dtype.
categorical = [column for column in train.columns if train.dtypes[column] == 'object']
y = 'SalePrice'
core_cate = anova(train, categorical, y)
# Disparity = log base 20 of (20 / p-value): the smaller the p-value, the larger the score.
core_cate['disparity'] = np.log(20 * 1. / core_cate['pval'].values) / np.log(20)

悬殊度-绘图

# plt.subplots returns a (figure, axes) pair and takes `figsize`.
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=core_cate, x='feature', y='disparity')
plt.xticks(rotation=90)
plt.show()

二、数据清洗

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import skew
from scipy.stats.stats import pearsonr

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import warnings
warnings.filterwarnings('ignore')

# Load the data
train = pd.read_csv('../07_数据/train.csv')
test = pd.read_csv('../07_数据/test.csv')
# Restrict to the numeric columns explicitly: recent pandas releases no longer
# drop non-numeric columns silently inside DataFrame.corr().
corr = train.select_dtypes(include=[np.number]).corr()
2.1 数据类型修改
train.dtypes
将数据设置为对应的类型
train['MSSubClass'].dtypes              # MSSubClass: dwelling type, should be categorical
train['YrSold'].dtypes                  # year / month the house was sold
train['MoSold'].dtypes

# The pandas method is `astype`; `astypes` does not exist.
train['MSSubClass'] = train['MSSubClass'].astype(str)   # dwelling type -> categorical
train['YrSold'] = train['YrSold'].astype(str)           # year sold -> categorical
train['MoSold'] = train['MoSold'].astype(str)           # month sold -> categorical
2.2 重复样本处理
train.duplicated().sum()                #查看重复样本个数
train.drop_duplicates(inplace = True)   #删除重复样本
2.3 缺失值处理
2.3.1 行–缺失值处理
# Flag the rows in which more than 40 % of the cells are missing.
row_na_mask = train.isnull().sum(axis=1) / train.shape[1] > 0.4
row_na_mask.sum()                                # how many rows exceed the threshold
nans_del_index = train.index[row_na_mask]
train.drop(index=nans_del_index, inplace=True)   # remove those rows
2.3.2 列–缺失值处理

类别型变量–区分度计算

def anova(train_cats_y,categorical,y):
    """Run a one-way ANOVA of ``y`` against each categorical column.

    Parameters
    ----------
    train_cats_y : pandas.DataFrame
        Frame holding the categorical columns and the target column.
    categorical : list of str
        Names of the categorical columns to test.
    y : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with columns ``feature`` and ``pval``,
        sorted by ascending p-value.
    """
    anv = pd.DataFrame(index = categorical)
    anv['feature'] = categorical
    pvals = []
    for c in categorical:
        # One array of target values per observed level of the feature
        # (rows where the feature is missing are left out).
        samples = [train_cats_y.loc[train_cats_y[c] == cls, y].values
                   for cls in train_cats_y[c].dropna().unique()]
        # f_oneway returns (F, p); a smaller p means the feature separates
        # the target more strongly.
        pvals.append(stats.f_oneway(*samples)[1])
    anv['pval'] = pvals
    # Callers index the result by column name, so return the frame itself.
    return anv.sort_values('pval')
# Categorical variables: every column stored with the object dtype.
categorical = [column for column in train.columns if train.dtypes[column] == 'object']
y = 'SalePrice'
core_cate = anova(train, categorical, y)
# Disparity = log base 20 of (20 / p-value): the smaller the p-value, the larger the score.
core_cate['disparity'] = np.log(20 * 1. / core_cate['pval'].values) / np.log(20)

 # Percentage of missing values per column, for train and test
# Each set is divided by its own row count; the original divided the test
# counts by the number of training rows and never called `sum`.
NAs = pd.concat([100 * train.isnull().sum(axis=0) / train.shape[0],
                 100 * test.isnull().sum(axis=0) / test.shape[0]],
                axis=1, keys=['Train', 'Test'])
NAs['type'] = train.dtypes[NAs.index]

 # Continuous variables: correlation with SalePrice
NAs['corr_conti_y'] = train.select_dtypes(include=[np.number]).corr()['SalePrice']

 # Categorical variables: disparity score
NAs['core_cate_y'] = core_cate['disparity']

 # Keep only the columns that actually have missing values
NAs = NAs[NAs[['Train', 'Test']].sum(axis=1) > 0].sort_values(by='Train', ascending=False)

# Drop the columns whose missing rate exceeds 40 % in train or in test

# Labels of the NAs rows where at least one of the two percentages is above 40.
nans_del_cols = NAs.index[((NAs[['Train','Test']] > 40).sum(axis=1) > 0).values]
nans_del_cols
train.drop(labels = nans_del_cols,axis=1,inplace = True)


# Fill the columns whose missing rate is below 1 % in both sets
# (numeric: median; categorical: mode)

# Stored under its own name so the >40 % drop list in nans_del_cols is not overwritten.
nans_less1_cols = NAs.index[((NAs[['Train', 'Test']] < 1).sum(axis=1) == 2).values].values
nans_less1_cols

nans_del_cols_num = train[nans_less1_cols].select_dtypes(include=[np.number]).columns
nans_del_cols_class = train[nans_less1_cols].select_dtypes(include=[object]).columns
train[nans_del_cols_num] = train[nans_del_cols_num].fillna(train[nans_del_cols_num].median())
if len(nans_del_cols_class) > 0:
    # mode() returns a DataFrame (one row per tied mode). Passing it to fillna
    # aligns on the row index and fills almost nothing, so take the first row
    # to get a single fill value per column.
    train[nans_del_cols_class] = train[nans_del_cols_class].fillna(train[nans_del_cols_class].mode().iloc[0])


# Domain-driven fills for the continuous variables highly correlated with SalePrice
#   GarageYrBlt: year the garage was built -> fall back to the house's YearBuilt
#   MasVnrArea:  masonry veneer area       -> 0
#   LotFrontage: street frontage           -> 0
# Column names follow the Kaggle data description; the original spellings
# ('GarageTrBlt', 'KasVarArea') do not match any column of the dataset.
garage_missing = train['GarageYrBlt'].isnull()
train.loc[garage_missing, 'GarageYrBlt'] = train.loc[garage_missing, 'YearBuilt']
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
train['LotFrontage'] = train['LotFrontage'].fillna(0)

# Domain-driven fills for the categorical variables with high disparity
#   BsmtQual:    basement quality -> the string 'None'
#   KitchenQual: kitchen quality  -> most frequent level
# Assign the result back instead of an inplace fill on a column selection,
# which is not guaranteed to modify `train`.
train['BsmtQual'] = train['BsmtQual'].fillna('None')
# mode() returns a Series; take its first entry so fillna receives a scalar
# rather than an index-aligned Series that only reaches row 0.
train['KitchenQual'] = train['KitchenQual'].fillna(train['KitchenQual'].mode()[0])


# Fill whatever is still missing (numeric: 0; categorical: 'None')

 # Inspect the columns that still contain missing values
remain_col = train.columns[train.isnull().sum(axis=0) > 0]
NAs.loc[remain_col, :]

nans_remain_cols_num = train[remain_col].select_dtypes(include=[np.number]).columns
nans_remain_cols_class = train[remain_col].select_dtypes(include=[object]).columns
train[nans_remain_cols_num] = train[nans_remain_cols_num].fillna(0)
# fillna(None) raises because no fill value is given; the intended value is
# the string 'None', as used for the test set later in this write-up.
train[nans_remain_cols_class] = train[nans_remain_cols_class].fillna('None')


#查看缺失值是否处理完毕

train.isnull().sum(axis=0)
2.4 连续型变量奇异值处理
train.SalePrice.plot(kind='box', sym='b+')      # box plot of the target
# Numeric features without the identifier column (spelled 'Id' everywhere else in this write-up).
number_para = train.select_dtypes(include=[np.number]).drop('Id', axis=1)
corr = number_para.corr()
corr_threshvalue = 0.5  # |correlation| threshold for "highly correlated"
corr_cols = corr.loc[:, corr.loc['SalePrice', :].abs() > corr_threshvalue].columns
# Box plots of the variables highly correlated with the target
number_para[corr_cols].plot(sym='b+', kind='box', subplots=True, figsize=(20, 8))


# Find the lower / upper cut-off points for outliers

number_para_q = number_para.quantile(q=[0, 0.05, 0.25, 0.5, 0.75, 0.95, 1], axis=0)
number_para_q

# outlier_expand scales the inter-quartile range when placing the fences
# (1.5 is the textbook value)

outlier_expand = 2
# Compute the IQR before the string-labelled rows are appended to the table.
iqr = number_para_q.loc[0.75, :] - number_para_q.loc[0.25, :]
number_para_q.loc['lower_outlier', :] = number_para_q.loc[0.25, :] - iqr * outlier_expand
number_para_q.loc['upper_outlier', :] = number_para_q.loc[0.75, :] + iqr * outlier_expand


# Count, per column, the values above / below the fences

upper_cnt = (number_para > number_para_q.loc['upper_outlier', :]).sum()
upper_cnt
lower_cnt = (number_para < number_para_q.loc['lower_outlier', :]).sum()
lower_cnt
2.4.1 重点变量处理
 # Bivariate analysis: SalePrice against GrLivArea
var = 'GrLivArea'
# pd.concat takes a list of objects as its first argument.
data = pd.concat([train['SalePrice'], train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));


# Deleting points

# Show the two largest living areas, then drop those rows by their Id.
train.sort_values(by='GrLivArea', ascending=False)[:2]
train = train.drop(train[train['Id'] == 1299].index)
train = train.drop(train[train['Id'] == 524].index)
2.4.2 其余变量统一处理
# Cap every numeric column at its own fences. The original lower-bound step
# kept only the values *below* the fence and overwrote everything else with
# the fence value; clip applies both bounds column by column.
number_para = number_para.clip(lower=number_para_q.loc['lower_outlier', :],
                               upper=number_para_q.loc['upper_outlier', :],
                               axis=1)
number_para[corr_cols].plot(sym='b+', kind='box', subplots=True, figsize=(20, 8))

三、特征转换

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from scipy.stats import norm
from scipy import stats

import matplotlib.pyplot as plt
#matplotlib inline

from scipy.stats import skew
from scipy.stats.stats import pearsonr

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
准备工作–引入‘数据清洗’处理
%run '数据清洗.ipynb'
类别型变量–区分度计算
def anova(train_cats_y,categorical,y):
    """Run a one-way ANOVA of ``y`` against each categorical column.

    Parameters
    ----------
    train_cats_y : pandas.DataFrame
        Frame holding the categorical columns and the target column.
    categorical : list of str
        Names of the categorical columns to test.
    y : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with columns ``feature``, ``pval`` and
        ``disparity`` (log base 20 of ``20 / pval``), sorted by ascending
        p-value.
    """
    anv = pd.DataFrame(index = categorical)
    anv['feature'] = categorical
    pvals = []
    for c in categorical:
        # One array of target values per observed level of the feature
        # (rows where the feature is missing are left out).
        samples = [train_cats_y.loc[train_cats_y[c] == cls, y].values
                   for cls in train_cats_y[c].dropna().unique()]
        # f_oneway returns (F, p); a smaller p means the feature separates
        # the target more strongly.
        pvals.append(stats.f_oneway(*samples)[1])
    anv['pval'] = pvals
    # Use the p-values computed here, not the module-level `core_cate`, which
    # does not exist yet when the function is first called.
    anv['disparity'] = np.log(20*1./anv['pval'].values)/np.log(20)
    return anv.sort_values('pval')
# Categorical variables: every column stored with the object dtype.
categorical = [column for column in train.columns if train.dtypes[column] == 'object']
y = 'SalePrice'
core_cate = anova(train, categorical, y)
core_cate   # display the ANOVA table (`cate_feature` was never defined)

 # Summarise the key facts about every variable
train_ana = pd.DataFrame()
train_ana['feature_type'] = train.dtypes                        # dtype of each variable
train_ana['cate_cnt'] = train.apply(lambda x: x.nunique())      # number of distinct values
# Correlation with the target, numeric variables only
train_ana['conti_corr'] = train.select_dtypes(include=[np.number]).corr()['SalePrice']
# Disparity comes from the ANOVA table, not from a correlation matrix
train_ana['cate_coor'] = core_cate['disparity']
 #train_ana.sort_values(by = ['feature_type','cate_cnt','conti_corr','cate_coor'],ascending = False
3.1 分类型变量
train_ana.loc[train_ana.feature_type == 'object',].sort_values('cate_cnt',ascending = False)
3.1.1 转换-分类型变量–重分组
# Neighbourhoods ordered by median sale price. Only SalePrice is aggregated so
# the group-wise median never touches the non-numeric columns.
neighborhood_order = train.groupby('Neighborhood')['SalePrice'].median().sort_values().index

plt.figure(figsize = (24,6))
sns.boxplot(x = 'Neighborhood',y = 'SalePrice',data = train,order = neighborhood_order)
plt.xticks(rotation = 45)   # rotate the tick labels by 45 degrees

plt.figure(figsize = (24,6))
# A box plot needs a numeric variable; with only the category given, the
# per-neighbourhood sample count is presumably what was meant.
sns.countplot(x = 'Neighborhood',data = train,order = neighborhood_order)
plt.xticks(rotation = 45)

neighborhood_order

# Merge pairs of neighbourhoods into combined levels; neighbourhoods that are
# not listed in the mapping keep their original label.
train['SimpleNeighborhood'] = train.Neighborhood.replace({'IDOTRR':'IDOTRR-BrDale','BrDale':'IDOTRR-BrDale',
                                                            'Blueste':'Blueste-SWISU','SWISU':'Blueste-SWISU',
                                                            'NPkVill':'NPkVill-Mitchel','Mitchel':'NPkVill-Mitchel'
                                                        })
# Run the ANOVA on the original and the regrouped variable for comparison.
anova(train,['Neighborhood','SimpleNeighborhood'],y)
3.1.2 转换-分类型变量–onehot编码
 # Create dummy (one-hot) features for the categorical values
# The `np.object` alias was removed from NumPy; use the builtin `object`.
train_cat = train.select_dtypes(include = [object])
train_onehot = pd.get_dummies(train_cat)
3.2 连续型变量
train_ana.loc[train_ana.feature_type != 'object',].sort_values('conti_corr',ascending = False)
3.2.1 衍生-连续型变量—非线性衍生(平/立方后再求平方根)
# Square, cube and square root of the two strongest continuous predictors.
train['OverallQual-s2'] = train['OverallQual']**2
train['OverallQual-s3'] = train['OverallQual']**3
train['OverallQual-Sq'] = np.sqrt(train['OverallQual'])
# The source column is 'GrLivArea'; the derived column names are kept as in
# the original write-up.
train['GrLivAreas-2'] = train['GrLivArea']**2
train['GrLivAreas-3'] = train['GrLivArea']**3
train['GrLivAreas-Sq'] = np.sqrt(train['GrLivArea'])
# Selecting several columns requires a list (a tuple is treated as one key).
train[['SalePrice','OverallQual','OverallQual-s2','OverallQual-s3','OverallQual-Sq',
       'GrLivArea','GrLivAreas-2','GrLivAreas-3','GrLivAreas-Sq']].corr()['SalePrice']
3.2.2 衍生-连续型变量—简单组合(自变量相加)
# Total number of bathrooms; half baths count 0.5. The original added
# BsmtFullBath twice and misspelled HalfBath.
train['TotalBath'] = train['BsmtFullBath'] + (0.5*train['BsmtHalfBath']) + \
                     train['FullBath'] + (0.5*train['HalfBath'])
train['AllSF'] = train['GrLivArea'] + train['TotalBsmtSF']              # total area incl. basement
train['AllFlrsSF'] = train['1stFlrSF'] + train['2ndFlrSF']              # 1st + 2nd floor
train['AllPorchSF'] = train['OpenPorchSF'] + train['EnclosedPorch'] + \
                      train['3SsnPorch'] + train['ScreenPorch']         # all porch areas
# SalePrice has to be part of the frame for its correlation column to exist.
train[['SalePrice','TotalBath','AllSF','AllFlrsSF','AllPorchSF']].corr()['SalePrice']
3.2.3 转换-连续型变量-正态转换
y 值正态变换
sns.distplot(train['SalePrice'],fit = norm)             # distribution of the raw target
stats.probplot(train['SalePrice'],plot = plt)           # Q-Q plot: how far from normal is it?
train['SalePrice_log'] = np.log1p(train['SalePrice'])   # log(1 + x) transform of the target
stats.probplot(train['SalePrice_log'],plot = plt)       # Q-Q plot of the transformed target
# Correlations are computed on the numeric columns only.
train.select_dtypes(include = [np.number]).corr().sort_values('SalePrice_log',ascending = False)[['SalePrice','SalePrice_log']]
连续型特征-正态分布
# Transform skewed numeric features to reduce the influence of extreme values.
# Rule of thumb: an absolute skewness above 0.75 counts as heavily skewed.
# Both target columns go into a single list; the second positional argument of
# drop() is the axis, not another label list.
train_num = train.select_dtypes(include = [np.number]).drop(['SalePrice','SalePrice_log'],axis = 1)
skewness = train_num.apply(lambda x : skew(x.dropna()))
skewness = skewness[abs(skewness) > 0.75]
skewness
skew_features = skewness.index
train[skew_features] = np.log1p(train[skew_features])
3.2.4 转换-连续型变量 无量纲转换
from sklearn.preprocessing import MinMaxScaler
train_num = train.select_dtypes(include = [np.number]).drop(['SalePrice','SalePrice_log'],axis = 1)
min_max_scaler = MinMaxScaler()
X_trian_minmax = min_max_scaler.fit_transform(train_num)
train_num_minmax = np.round(min_max_scaler.transform(train_num),2)

# Scaled copies get a '_minmax' suffix so they can sit next to the raw columns.
train_num_minmax = pd.DataFrame(train_num_minmax,columns = train_num.columns + '_minmax',index = train.index)
train = pd.concat([train,train_onehot,train_num_minmax],axis = 1)
# `inplace` must be a real boolean; pandas rejects the string 'True'.
# NOTE(review): later sections read train['SalePrice_log'] — confirm this drop is intended.
train.drop(['SalePrice_log'],axis = 1,inplace = True)

四、特征筛选

#Embedded 嵌入法
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from scipy.stats import norm
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import skew
from scipy.stats.stats import pearsonr

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

%run'特征转换、衍生.ipynb'
train_num = train.select_dtypes(include = [np.number]).drop(labels = ['Id','SalePrice'],axis = 1)
train_num.shape

4.1 方差筛选法

去除低方差没有波动的变量
from sklearn.feature_selection import VarianceThreshold
varthreshold = 0.01                         # variance threshold
sel_varthres = VarianceThreshold(threshold = varthreshold)
sel_varthres.fit(train_num)
sel_not_varthrea_var = train_num.columns[np.logical_not(sel_varthres.get_support())]
sel_varthres_var = train_num.columns[sel_varthres.get_support()]
sel_not_varthrea_var
# Look at one rejected feature. The original plotted train_num['Street'],
# which is presumably not a column of the numeric frame; plot the first
# low-variance column instead.
if len(sel_not_varthrea_var) > 0:
    plt.hist(train_num[sel_not_varthrea_var[0]])
train_num = train_num[sel_varthres_var]
train_num.shape
4.2 Filter 过滤法
相关系数法 pearsonr ,或直接用corr 阈值法
from sklearn.feature_selection import SelectPercentile
from scipy.stats import pearsonr
import numpy as np


def _abs_pearson_scores(X, Y):
    """Score each feature column by its absolute Pearson correlation with Y."""
    X = np.asarray(X)
    Y = np.asarray(Y)
    # The absolute value keeps strongly negative correlations from being
    # ranked last. A list comprehension is needed because map() is lazy in
    # Python 3 and np.array(map(...)) does not produce a numeric array.
    return np.array([abs(pearsonr(column, Y)[0]) for column in X.T])


# The keyword is `percentile`; keep the top 80 % of the features.
sel_percentbest = SelectPercentile(_abs_pearson_scores, percentile = 80)
sel_percentbest.fit(train_num,train['SalePrice'])
sel_percentbest_var = train_num.columns[sel_percentbest.get_support()]
sel_percentbest_var

train_num = train_num[sel_percentbest_var]
train_num.shape
4.3 Wrapper 包装法
from sklearn.feature_selection import RFE   # recursive feature elimination
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

rf = RandomForestRegressor(n_estimators = 400)
# The estimator is the first argument; keep 80 % of the current features.
sel_rfe = RFE(rf, n_features_to_select = int(train_num.shape[1]*0.8))
sel_rfe.fit(train_num,train['SalePrice_log'])
sel_rfe_var = train_num.columns[sel_rfe.get_support()]
sel_rfe_var
train_num = train_num[sel_rfe_var]
train_num.shape
4.4 Embedded 嵌入法
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 400)
# Keep the features whose importance is at least half the median importance.
sel_frommodel = SelectFromModel(rf,threshold = '0.5*median')
sel_frommodel.fit(train_num,train['SalePrice_log'])
sel_frommodel_var = train_num.columns[sel_frommodel.get_support()]
sel_frommodel_var
train_num = train_num[sel_frommodel_var]
train_num.shape

五、模型训练

---------------------集成算法-----------------------
 #引入‘特征筛选’处理
%run '特征筛选.ipynb'
# NOTE(review): assumes train_num still contains SalePrice_log at this point — confirm.
train_y = train_num['SalePrice_log']
train_X = train_num.drop('SalePrice_log',axis = 1)
# The code below uses both spellings; bind them to the same frame.
train_x = train_X
model_column = train_X.columns
train_x.shape
model_column
----------------决策回归树算法---------------------------
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
 # Hyper-parameter search with cross-validation
# The criterion is left at the estimator's default (mean squared error),
# because the spelling of that option differs between scikit-learn releases.
tuned_parameters = {
                    'min_samples_split':[2,10,20],
                    'max_depth':[2,10,20,40],
                    'min_samples_leaf':[1,5,10],
                    'max_leaf_nodes':[2,10,20,40],
                    }
clf = DecisionTreeRegressor()
clf = GridSearchCV(clf,tuned_parameters,cv=5)
clf.fit(train_x,train_y)
clf.best_params_
# cv_results_ replaces the grid_scores_ attribute of early scikit-learn releases.
cv_results = clf.cv_results_
for mean_score,std_score,params in zip(cv_results['mean_test_score'],
                                       cv_results['std_test_score'],
                                       cv_results['params']):
    print('%0.3f(+/-%0.03f) for %r'
          %(mean_score,std_score*2,params))
 # Visualisation: feature importances of the best tree
# Indexed by train_x, the frame the grid search was fitted on.
important_features = pd.Series(data = clf.best_estimator_.feature_importances_,index = train_x.columns).sort_values(ascending = False)
plt.figure(figsize = (20,10))
important_features.plot(kind = 'bar')

 # Evaluation
from sklearn.metrics import mean_squared_error
pred_y = clf.predict(train_x)
# np.expm1 inverts the log1p transform, so the RMSE is on the price scale.
np.sqrt(mean_squared_error(np.expm1(train_y),np.expm1(pred_y)))

plt.figure(figsize = (20,10))
plt.scatter(x = np.expm1(train_y),y = np.expm1(pred_y))
 ********岭回归 Ridge************
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

 # 参数优化--交叉检验
n_folds = 5
def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` on (train_X, train_y).

    Uses ``n_folds`` shuffled folds with a fixed seed, matching the second
    definition of this helper later in the write-up.
    """
    folds = KFold(n_folds,shuffle = True,random_state = 42)
    # The scorer returns negated MSE, hence the minus sign before the root.
    rmse = np.sqrt(-cross_val_score(model,train_X,train_y,scoring = 'neg_mean_squared_error',cv = folds))
    return rmse

alphas = [0.05,0.1,0.3,1,3,5]
# Mean cross-validated RMSE for each regularisation strength.
cv_ridge = [rmse_cv(Ridge(alpha = alpha)).mean()
            for alpha in alphas]
 #score.std()
cv_ridge = pd.Series(cv_ridge,index = alphas)
cv_ridge.plot(title = 'Validation - Ridge')   # this sweep is over Ridge, not Lasso
plt.xlabel('alpha')
plt.ylabel('rmse')

ridge = Ridge(alpha = 1)
ridge.fit(train_X,train_y)

 # Visualisation: coefficients with an absolute value above 0.01
important_features = pd.Series(data = ridge.coef_,index = train_X.columns).sort_values(ascending = False)
important_features = important_features[np.abs(important_features) > 0.01]
plt.figure(figsize = (20,10))
important_features.plot(kind = 'bar')

 # Evaluation
from sklearn.metrics import mean_squared_error
pred_y = ridge.predict(train_X)
# np.expm1 inverts the log1p transform, so the RMSE is on the price scale.
np.sqrt(mean_squared_error(np.expm1(train_y),np.expm1(pred_y)))

plt.figure(figsize = (20,10))
plt.scatter(x = np.expm1(train_y),y = np.expm1(pred_y))
3 ********弹性网回归 ElasticNet ************
from sklearn.linear_model import ElasticNetCV

 # Hyper-parameter search with cross-validation (coarse grid)
# The parameter and attribute are spelled l1_ratio / l1_ratio_, and the grid
# of penalties is passed as `alphas`.
elasticNet = ElasticNetCV(l1_ratio = [0.1,0.3,0.5,0.6,0.7,0.8,0.85,0.9,0.95,1],
                          alphas = [0.0001,0.0003,0.0006,0.001,0.003,0.006,
                                    0.01,0.03,0.06,0.1,0.3,0.6,1,3,6],
                          max_iter = 50000,cv = 10)
elasticNet.fit(train_X,train_y)
alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print('Best 1l_ratio:',ratio)
print('Best alpha:',alpha)

print('Try again for more precision with 1l_ratio centered around' + str(ratio))
# Candidates are multiples of the best ratio found so far (adding the factors,
# as the original did, yields values above 1). They are capped at 1, the
# largest valid l1_ratio, so no clamping is needed after the fit.
l1_candidates = sorted({min(ratio * factor, 1.0)
                        for factor in (.85, .9, .95, 1, 1.05, 1.1, 1.15)})
elasticNet = ElasticNetCV(l1_ratio = l1_candidates,
                          alphas = [0.0001,0.0003,0.0006,0.001,0.003,0.006,0.01,0.03,0.06,0.1,0.3,0.6,1,3],
                          max_iter = 50000,cv = 10)
elasticNet.fit(train_X,train_y)
alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print('Best 1l_ratio:',ratio)
print('Best alpha:',alpha)

print('Now try again for more precision on alpha,with 1l_ratio fixed at' + str(ratio)+'and alpha centered around' + str(alpha))
# Candidates are multiples of the best alpha found so far, from 0.6x to 1.4x.
alpha_factors = (.6, .65, .7, .75, .8, .85, .9, .95, 1,
                 1.05, 1.1, 1.15, 1.25, 1.3, 1.35, 1.4)
elasticNet = ElasticNetCV(l1_ratio = ratio,
                          alphas = [alpha * factor for factor in alpha_factors],
                          max_iter = 50000,cv = 10)
elasticNet.fit(train_X,train_y)
alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print('Best 1l_ratio:',ratio)
print('Best alpha:',alpha)

 # Model stability: cross-validated RMSE (the helper is named rmse_cv)
score = rmse_cv(elasticNet)
print('Averaged base models score:{: .4f}({: .4f})\n'.format(score.mean(),score.std()))
 #Averaged base model score:0.1199(0.0098)

 # Visualisation: coefficients with an absolute value above 0.01
important_features = pd.Series(data = elasticNet.coef_,index = train_X.columns).sort_values(ascending = False)
important_features = important_features[np.abs(important_features) > 0.01]
plt.figure(figsize = (20,10))
important_features.plot(kind = 'bar')

 # Evaluation
from sklearn.metrics import mean_squared_error
pred_y = elasticNet.predict(train_X)
# np.expm1 inverts the log1p transform, so the RMSE is on the price scale.
np.sqrt(mean_squared_error(np.expm1(train_y),np.expm1(pred_y)))

# Residuals against predictions (log scale)
plt.scatter(pred_y,pred_y - train_y, c = 'blue',marker = 's',label = 'Training data')
plt.hlines(y = 0,xmin = 10.5,xmax = 13.5,color = 'red')
 #Plot predictions
plt.scatter(pred_y,train_y, c = 'blue',marker = 's',label = 'Training data')
plt.plot([10.5,13.5],[10.5,13.5], c = 'red')
 #4***********算法融合*************
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
from sklearn.base import TransformerMixin
from sklearn.base import clone
class AveragedModel(BaseEstimator,RegressorMixin,TransformerMixin):
    """Ensemble regressor that averages the predictions of several base models.

    Parameters
    ----------
    models : sequence of estimators
        Base regressors. They are cloned at fit time, so the objects passed in
        are never refitted.
    """

    def __init__(self,models):
        self.models = models

    def fit(self,X,y):
        """Fit a fresh clone of every base model on (X, y) and return self."""
        self.models_ = [clone(model) for model in self.models]
        for model in self.models_:
            model.fit(X,y)
        return self

    def predict(self,X):
        """Return the row-wise mean of the fitted clones' predictions."""
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions,axis = 1)


# The training code further down instantiates the class under the plural name.
AveragedModels = AveragedModel

# Correlation between the in-sample predictions of the fitted models
from scipy.stats import pearsonr

pred_y_ridge, pred_y_elasticNet, pred_y_clf = (
    fitted.predict(train_X) for fitted in (ridge, elasticNet, clf)
)

pearsonr(pred_y_ridge.T, pred_y_elasticNet.T)
pearsonr(pred_y_clf.T, pred_y_elasticNet.T)

 # Train the averaged ensemble
# The class is defined as AveragedModel (singular).
average_models = AveragedModel(models = (ridge,elasticNet))
average_models.fit(train_X,train_y)

 # Cross-validation
n_folds = 5
def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` on (train_X, train_y).

    Uses ``n_folds`` shuffled folds with a fixed seed.
    """
    folds = KFold(n_folds,shuffle = True,random_state = 42)
    # The scorer returns negated MSE, hence the minus sign before the root.
    rmse = np.sqrt(-cross_val_score(model,train_X,train_y,scoring = 'neg_mean_squared_error',
                    cv = folds))
    return rmse
score = rmse_cv(average_models)
print('Averaged base models score:{: .4f}({: .4f})\n'.format(score.mean(),score.std()))

 # Evaluation
from sklearn.metrics import mean_squared_error
pred_y = average_models.predict(train_X)
# np.expm1 inverts the log1p transform, so the RMSE is on the price scale.
np.sqrt(mean_squared_error(np.expm1(train_y),np.expm1(pred_y)))

# Residuals against predictions (log scale)
plt.scatter(pred_y,pred_y - train_y, c = 'blue',marker = 's',label = 'Training data')
plt.hlines(y = 0,xmin = 10.5,xmax = 13.5,color = 'red')
 #Plot predictions
plt.scatter(pred_y,train_y, c = 'blue',marker = 's',label = 'Training data')
plt.plot([10.5,13.5],[10.5,13.5], c = 'red')
5*************最终模型**************
# Names under which the prediction step refers to the fitted models.
model_ridge = ridge
model_elasticNet = elasticNet
# The fitted ensemble is bound to `average_models` above.
model_averaged_model = average_models

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import skew
from scipy.stats.stats import pearsonr

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import warnings
warnings.filterwarnings('ignore')
 #1 读取数据
test = pd.read_csv('../07_数据/test.csv')
test.shape
 #2 数据清洗
 #run'2-数据清洗.ipynb'

# Same type conversions as applied to the training set.
test['MSSubClass'] = test['MSSubClass'].astype(str)
 # Year / month sold
# The original also wrote a stringified YearBuilt into a misspelled
# 'TearBuilt' column. The training set never converts YearBuilt, and the
# GarageYrBlt fill below needs it numeric, so that line is dropped.
test['YrSold'] = test['YrSold'].astype(str)
test['MoSold'] = test['MoSold'].astype(str)

 #重复值处理--删除重复值
test.drop_duplicates(inplace = True)
 #3.2.1 Drop the columns whose missing rate exceeds 40 %
 # NOTE(review): clear_nans_del_cols is not defined in the visible code — it
 # presumably comes from the data-cleaning notebook; confirm.
test.drop(labels = clear_nans_del_cols,axis = 1,inplace = True)
 #3.2.3 Domain-driven fills for the highly correlated continuous variables
# Column names follow the Kaggle data description (GarageYrBlt, MasVnrArea).
test_garage_missing = test['GarageYrBlt'].isnull()
test.loc[test_garage_missing,'GarageYrBlt'] = test.loc[test_garage_missing,'YearBuilt']
test['MasVnrArea'] = test['MasVnrArea'].fillna(0)
test['LotFrontage'] = test['LotFrontage'].fillna(0)

 #3.2.4 Domain-driven fills for the high-disparity categorical variables
# Assign the result back instead of an inplace fill on a column selection,
# which is not guaranteed to modify `test`.
test['BsmtQual'] = test['BsmtQual'].fillna('None')
 # NOTE(review): clear_KitchenQual_mode is not defined in the visible code —
 # presumably the training-set mode of KitchenQual; confirm.
test['KitchenQual'] = test['KitchenQual'].fillna(clear_KitchenQual_mode)

 #3.2.5 Fill whatever is still missing (numeric: 0; categorical: 'None')
remain_col = test.columns[test.isnull().sum(axis = 0) > 0]
clear_nans_remain_cols_sum = test[remain_col].select_dtypes(include = [np.number]).columns
# The `np.object` alias was removed from NumPy; use the builtin `object`.
clear_nans_remain_cols_class = test[remain_col].select_dtypes(include = [object]).columns

test[clear_nans_remain_cols_sum] = test[clear_nans_remain_cols_sum].fillna(0)
test[clear_nans_remain_cols_class] = test[clear_nans_remain_cols_class].fillna('None')

 # Check that no missing values are left
test.isnull().sum(axis = 0).sum()
****************对新数据进行预测****************
%run'5-模型训练。ipynb'
# NOTE(review): test_num is not defined in the visible code — the test set
# presumably has to go through the same feature transformation as the training set.
test_X = test_num[model_column]
test_ID = test['Id']

# np.expm1 inverts the log1p transform applied to the target during training.
pred_y_ridge = np.expm1(model_ridge.predict(test_X))
pred_y_elasticNet = np.expm1(model_elasticNet.predict(test_X))
# Predict with the fitted ensemble bound to model_averaged_model.
pred_y_averaged_models = np.expm1(model_averaged_model.predict(test_X))

def _write_submission(ids, prices, path):
    """Build an (Id, SalePrice) frame, write it to ``path`` and return it."""
    frame = pd.DataFrame()
    frame['Id'] = ids
    frame['SalePrice'] = prices
    frame.to_csv(path, index=False)
    return frame


# One submission file per model, written in the same order as before.
sub1 = _write_submission(test_ID, pred_y_ridge, 'sub_ridge.csv')
sub2 = _write_submission(test_ID, pred_y_elasticNet, 'sub_elasticNet.csv')
sub3 = _write_submission(test_ID, pred_y_averaged_models, 'sub_averaged_models.csv')

猜你喜欢

转载自blog.csdn.net/wsp_1138886114/article/details/80555200