减少数据在内存中占用的空间

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

sample_feature = reduce_mem_usage(pd.read_csv('data_for_tree.csv'))

//上一期制作好的csv

Memory usage of dataframe is 62099672.00 MB
Memory usage after optimization is: 16719236.00 MB
Decreased by 73.1%

continuous_feature_names = [x for x in sample_feature.columns if x not in ['price','brand','model','brand']]

线性回归 & 五折交叉验证 & 模拟真实业务情况

sample_feature = sample_feature.dropna().replace('-', 0).reset_index(drop=True)
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)
train = sample_feature[continuous_feature_names + ['price']]

train_X = train[continuous_feature_names]
train_y = train['price']

简单建模

from sklearn.linear_model import LinearRegression
model = LinearRegression(normalize=True)
model = model.fit(train_X, train_y)

绘制特征v_9的值与标签的散点图，图片发现模型的预测结果（蓝色点）与真实标签（黑色点）的分布差异较大，且部分预测值出现了小于0的情况，说明我们的模型存在一些问题
存在问题的模型

plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='black')
plt.scatter(train_X['v_9'][subsample_index], model.predict(train_X.loc[subsample_index]), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
print('The predicted price is obvious different from true price')
plt.show()

通过作图我们发现数据的标签（price）呈现长尾分布，不利于我们的建模预测。原因是很多模型都假设数据误差项符合正态分布，而长尾分布的数据违背了这一假设。

import seaborn as sns
print('It is clear to see the price shows a typical exponential distribution')
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.distplot(train_y)
plt.subplot(1,2,2)
'''
np.quantile(train_y, 0.9) - 求train_y 的90%的分位数
下面这个代码是把价格大于90%分位数的部分截断了,就是长尾分布截断
'''
sns.distplot(train_y[train_y < np.quantile(train_y, 0.9)])

在这里插入图片描述

在这里我们对标签进行了
log(x+1)变换，使标签贴近于正态分布

train_y_ln = np.log(train_y + 1)

import seaborn as sns
print('The transformed price seems like normal distribution')
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.distplot(train_y_ln)
plt.subplot(1,2,2)
sns.distplot(train_y_ln[train_y_ln < np.quantile(train_y_ln, 0.9)])

在这里插入图片描述

再次进行可视化，发现预测结果与真实值较为接近，且未出现异常状况

plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='black')
plt.scatter(train_X['v_9'][subsample_index], np.exp(model.predict(train_X.loc[subsample_index])), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
print('The predicted price seems normal after np.log transforming')
plt.show()

在这里插入图片描述

五折交叉验证

在使用训练集对参数进行训练的时候，经常会发现人们通常会将一整个训练集分为三个部分（比如mnist手写训练集）。一般分为：训练集（train_set），评估集（valid_set），测试集（test_set）这三个部分。这其实是为了保证训练效果而特意设置的。其中测试集很好理解，其实就是完全不参与训练的数据，仅仅用来观测测试效果的数据。而训练集和评估集则牵涉到下面的知识了。
因为在实际的训练中，训练的结果对于训练集的拟合程度通常还是挺好的（初始条件敏感），但是对于训练集之外的数据的拟合程度通常就不那么令人满意了。因此我们通常并不会把所有的数据集都拿来训练，而是分出一部分来（这一部分不参加训练）对训练集生成的参数进行测试，相对客观的判断这些参数对训练集之外的数据的符合程度。这种思想就称为交叉验证（Cross
Validation）

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,  make_scorer

定义一个函数,用来处理预测值和真实值的log变换

def log_transfer(func):
    def wrapper(y, yhat):
        result = func(np.log(y), np.nan_to_num(np.log(yhat)))
        return result
    return wrapper

scores = cross_val_score(model, X=train_X, y=train_y, verbose=1, cv = 5, scoring=make_scorer(log_transfer(mean_absolute_error)))

使用线性回归模型，对未处理标签的特征数据进行五折交叉验证

print('AVG:', np.mean(scores))

AVG: 1.3641908155886227（5次的MAE平均值）

MAE（Mean Absolute Error）平均绝对误差是绝对误差的平均值。可以更好地反映预测值误差的实际情况。

使用线性回归模型，对处理过标签的特征数据进行五折交叉验证（Error 0.19）

scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=1, cv = 5, scoring=make_scorer(mean_absolute_error))

AVG: 0.19382863663604424
MAE从1.365降低到0.193,误差缩小了很多

事实上,五折交叉验证在某些与时间相关的数据集上反而反映了不真实的情况

用2018年的二手车价格预测2017年是不合理的,所以我们可以用时间靠前的4/5样本当作训练集,靠后的1/5当验证集

import datetime     # 这里我没看到datetime的作用,只能认为数据集是按照时间排列的
sample_feature = sample_feature.reset_index(drop=True)      # 重置索引
split_point = len(sample_feature) // 5 * 4      # 设置分割点

train = sample_feature.loc[:split_point].dropna()
val = sample_feature.loc[split_point:].dropna()

train_X = train[continuous_feature_names]
train_y_In = np.log(train['price'] + 1)
val_X = val[continuous_feature_names]
val_y_In = np.log(val['price'] + 1)

model = model.fit(train_X, train_y_In)
mean_absolute_error(val_y_In, model.predict(val_X))

— MAE为0.196,和五折交叉验证差别不大

绘制学习率曲线与验证曲线

from sklearn.model_selection import learning_curve, validation_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)     # 如果规定了ylim的值,那么ylim就用规定的值
    plt.xlabel('Training example')
    plt.ylabel('score')
    train_size, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                                                           train_sizes=train_sizes,
                                                           scoring=make_scorer(mean_absolute_error))
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    '''
    fill_between()
            train_sizes - 第一个参数表示覆盖的区域
            train_scores_mean - train_scores_std - 第二个参数表示覆盖的下限
            train_scores_mean + train_scores_std - 第三个参数表示覆盖的上限
            color - 表示覆盖区域的颜色
            alpha - 覆盖区域的透明度,越大越不透明 [0,1]
    '''
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std)
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    plt.legend(loc='best')
    return plt

plot_learning_curve(LinearRegression(), 'Liner_model', train_X[:1000], train_y_In[:1000],
                    ylim=(0.0, 0.5), cv=5, n_jobs=1)

嵌入式特征选择 - 大部分情况下都是用嵌入式做特征选择

1.L1正则化 - Lasso回归 -
模型被限制在正方形区域(二维区域下),损失函数的最小值往往在正方形(约束)的角上,很多权值为0(多维),所以可以实现模型的稀疏性(生成稀疏权值矩阵,进而用于特征选择
2.L2正则化 - 岭回归 -
模型被限制在圆形区域(二维区域下),损失函数的最小值因为圆形约束没有角,所以不会使得权重为0,但是可以使得权重都尽可能的小,最后得到一个所有参数都比较小的模型,这样模型比较简单,能适应不同数据集,一定程度上避免了过拟合

# 我们看下三种模型的效果对比:线性回归; 加入了L1的Lasso回归; 加入了L2的岭回归

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
models = [LinearRegression(), Ridge(), Lasso()]
result = dict()     # 创建一个用来装结果的字典
for model in models:
    model_name = str(model).split('(')[0]   # 把括号去掉,只保留名字
    scores = cross_val_score(model, X=train_X, y=train_y_In, verbose=0, cv=5,       # 五折交叉验证
                             scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    print(model_name + ' is finished')

result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result

model_Lr = LinearRegression().fit(train_X, train_y_In)
print('intercept:' + str(model_Lr.intercept_))
sns.barplot(abs(model_Lr.coef_), continuous_feature_names)

在这里插入图片描述

发现v6\v8\v9的权重大
L2正则化在拟合过程中通常都倾向于让权值尽可能小，最后构造一个所有参数都比较小的模型。因为一般认为参数值小的模型比较简单，能适应不同的数据集，也在一定程度上避免了过拟合现象。可以设想一下对于一个线性回归方程，若参数很大，那么只要数据偏移一点点，就会对结果造成很大的影响；但如果参数足够小，数据偏移得多一点也不会对结果造成什么影响，专业一点的说法是『抗扰动能力强』

— 岭回归:发现有更多的参数对模型起到影响,而且参数都比较小,一定程度上避免了过拟合现象,抗扰动能力强

model_Lasso = Lasso().fit(train_X, train_y_In)
print('intercept:' + str(model_Lasso.intercept_))
sns.barplot(abs(model_Lasso.coef_), continuous_feature_names)