「这是我参与11月更文挑战的第28天,活动详情查看:2021最后一次更文挑战」
关于数据集的EDA:juejin.cn/post/703488…
下面对数据进行分析和挖掘,主要完成对于用户购买商品的销售额预测
数据集分割设置
训练集和验证集按 7:3 的比例分割并保存,同时可视化数据分布
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split  # data splitting
import warnings  # suppress warnings

warnings.filterwarnings('ignore')
plt.style.use('ggplot')
# Allow Chinese characters in matplotlib labels/titles
plt.rcParams['font.sans-serif'] = 'Microsoft YaHei'

df = pd.read_csv("product_data.csv")
X = df.drop(["Purchase"], axis=1)
y = df.Purchase
# 70/30 split; fixed random_state so the saved split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Sizes of the train / test partitions
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Check the two partitions have consistent Purchase distributions (hist)
colors = ['salmon','lightskyblue','#FFF68F','palegreen','lightpink','silver','burlywood','plum','rosybrown']
plt.figure(figsize=(12, 5))
ax1 = plt.subplot(1, 3, 1)
ax1.hist(y_train, color=colors[0])
ax1.set_title("训练集分布图")
ax2 = plt.subplot(1, 3, 2)
ax2.hist(y_test, color=colors[1])
ax2.set_title("测试集分布图")
ax3 = plt.subplot(1, 3, 3)
# BUG FIX: the comparison panel previously plotted y_test twice; it is
# meant to overlay the TRAIN and TEST distributions.
ax3.hist(y_train, color=colors[0])
ax3.hist(y_test, color=colors[1])
ax3.set_title("训练集-测试集比较分布图")
plt.show()

# Persist the split so later notebooks work on identical data
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)
df_train.to_csv("./data/train.csv", index=False)
df_test.to_csv("./data/test.csv", index=False)
复制代码
输出:(376303, 11) (161274, 11) (376303,) (161274,)
数据清洗
# 导入基础包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as ppf
import warnings##忽略警告
warnings.filterwarnings('ignore')
# 设置画图的可显示中文字体
plt.rcParams['font.sans-serif'] = 'Microsoft YaHei'
plt.style.use('ggplot')
# 导入包做特征工程
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder#标签编码
from sklearn.preprocessing import RobustScaler, StandardScaler#去除异常值与数据标准化
from sklearn.pipeline import Pipeline, make_pipeline#构建管道
from scipy.stats import skew#偏度
from sklearn import impute
# Concatenate train and test so cleaning/encoding is applied consistently to both
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
# ignore_index rebuilds a 0..n-1 index; the first len(train) rows are the train set
full = pd.concat([train,test],ignore_index=True)
full.head()
复制代码
数据清洗主要分为三个部分: 1.填补缺失值 2.解决错误值 3.处理数字不一致问题
缺失值处理
先删除ID列和销售额
# IDs carry no direct signal here and Purchase is the target, so remove them
full.drop(columns=["User_ID", "Product_ID", "Purchase"], inplace=True)
full["Product_Category_2"].value_counts()
复制代码
Product_Category_2 和 Product_Category_3 均为分类变量,先采取 众数填充 策略
# Impute missing category columns with the most frequent value;
# Series.mode()[0] is the modal value.
for column in ["Product_Category_2", "Product_Category_3"]:
    full[column] = full[column].fillna(full[column].mode()[0])
复制代码
字符串变量编码为数值变量
针对性别、年龄、城市等级、定居年份等字符串类型的变量转变为类别编码 性别:'男':0,"女":1 年龄:'0-17':0,'18-25':1,'26-35':2,'36-45':3,'46-50':4,'51-55':5,'55+':6 城市等级:'A':0,"B":1,"C":2 定居年份:'0':0,'1':1,"2":2,"3":3,'4+':4
# Integer-encode the categorical string columns.
gender_codes = {'F': 0, "M": 1}
age_codes = {'0-17': 0, '18-25': 1, '26-35': 2, '36-45': 3, '46-50': 4, '51-55': 5, '55+': 6}
city_codes = {'A': 0, "B": 1, "C": 2}
stay_codes = {'0': 0, '1': 1, "2": 2, "3": 3, '4+': 4}
full["Gender"] = full["Gender"].map(gender_codes).astype(int)
full["Age"] = full["Age"].map(age_codes).astype(int)
full["City_Category"] = full["City_Category"].map(city_codes).astype(int)
full["Stay_In_Current_City_Years"] = full["Stay_In_Current_City_Years"].map(stay_codes).astype(int)
复制代码
转换数据类型
将数值列转换成对应的数据类型
# Cast every encoded numeric column to 64-bit integers explicitly
for column_name in ["Gender", "Age", "Occupation", "City_Category",
                    "Stay_In_Current_City_Years", "Marital_Status",
                    "Product_Category_1"]:
    full[column_name] = full[column_name].astype(np.int64)
复制代码
再拆分和标准化
标准化
n_train = train.shape[0]   # number of training rows (full = train rows then test rows)
X = full[:n_train]         # cleaned training features
test_X = full[n_train:]    # cleaned test features
y = train.Purchase
# BUG FIX: fit the scaler on the TRAINING data only and apply the same
# fitted scaler to the test set. The original fitted a second
# StandardScaler on test_X, which uses different means/stds and leaks
# test-set statistics into preprocessing.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
test_X_scaled = scaler.transform(test_X)
# log-transform the target so its distribution is closer to normal
y_log = np.log(train.Purchase)
复制代码
特征选择
使用Lasso套索回归做特征选择
# L1-regularised regression: sparse coefficients act as a wrapper-style
# feature selector.
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.001)
lasso.fit(X_scaled, y_log)
# Pair each feature name with its learned coefficient
FI_lasso = pd.DataFrame({"Feature Importance": lasso.coef_}, index=full.columns)
# BUG FIX: sort_values returns a NEW frame; the original discarded the
# result, so the frame was never actually sorted.
FI_lasso = FI_lasso.sort_values("Feature Importance", ascending=False)
复制代码
# Horizontal bar chart of the non-zero Lasso coefficients
nonzero = FI_lasso[FI_lasso["Feature Importance"] != 0]
nonzero.sort_values("Feature Importance").plot(kind="barh", color="salmon", figsize=(10, 8))
plt.title("特征选择重要性表示图")
plt.xticks(rotation=90)
plt.show()
复制代码
特征选择差异不大,还是使用全部的特征
特征构造
后续预测的效果很差,重新分析数据,构造特征
# NOTE(review): this cell reads User_ID, Product_ID and Purchase, which an
# earlier cell dropped from `full` — it must run on a frame that still has
# those columns; verify the notebook's cell execution order.
# Mean purchase amount per product
full['avg_purchase_per_product'] = full.groupby('Product_ID')['Purchase'].transform('mean')
# Number of purchase records per product.
# BUG FIX: the old value_counts()/reset_index()/rename({'index': ...})
# sequence relies on pandas<2.0 column naming and breaks on pandas>=2.0;
# transform('count') is equivalent and version-independent.
full['product_count'] = full.groupby('Product_ID')['Product_ID'].transform('count')
# Mean purchase amount per user
full['avg_purchase_per_user'] = full.groupby('User_ID')['Purchase'].transform('mean')
# Treat "missing" category 2/3 as signal: count how many category slots are
# actually filled (default 0 when no pattern below matches)
conditions = [
    (full['Product_Category_1'] != 0) & (full['Product_Category_2'] == 0) & (full['Product_Category_3'] == 0),
    (full['Product_Category_1'] != 0) & (full['Product_Category_2'] != 0) & (full['Product_Category_3'] == 0),
    (full['Product_Category_1'] != 0) & (full['Product_Category_2'] != 0) & (full['Product_Category_3'] != 0)]
choices = [1, 2, 3]
full['Category_Count'] = np.select(conditions, choices, default=0)
复制代码
仅仅通过上述数据预处理后,通过单模型测试和多模型融合等多方法的尝试,发现预测用户消费金额的模型效果不理想。算法模型已经很难有改进,故我们希望从数据层面进行特征工程,原数据集的特征维度并不多,特征重要性也很弱,故我们采取的策略就是通过特征创造生成相关性系数较高的新特征。共新增了4个维度的特征,特征维度的创造步骤具体如下:
1.商品类别计数Category_counts
不管是数据可视化的热力图还是特征选择中的重要系数,商品类别的相关性都非常高,都显示出商品类别具有一定有挖掘的价值。此前,我们一直把商品类别2和商品类别3当做缺失值,但也很有可能是消费者没有购买商品类别2和商品类别3的商品,那么每个用户有多少不同种类的商品就是一个计数指标。
2.每个Product_ID的购买次数 Product_count
3.每个User_ID的均消费金额avg_purchase_per_user
4.每个Product_ID的均消费金额avg_purchase_per_product
我们希望能挖掘出更多关于商品的隐藏信息,Product_ID是未被利用的特征,该数据集总量很大有50余万条,但实际用户和商品都只有几千条,故针对User_ID和Product_ID进行的统计计量应该能获得更多的隐藏信息,故对每个Product_ID的总消费额度进行统计,其次计算每个User_ID的均消费金额,每个Product_ID的均消费金额。
预测销售额 - 回归问题
# 导入基础包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings##忽略警告
warnings.filterwarnings('ignore')
# 设置画图的可显示中文字体
plt.rcParams['font.sans-serif'] = 'Microsoft YaHei'
plt.style.use('ggplot')
# 数据预处理函数
def Data_Clearing(full):
    """Clean the raw purchase frame and add engineered features, in place.

    Steps: make Product_ID numeric, zero-fill missing category 2/3,
    integer-encode the categorical string columns, then add per-product /
    per-user aggregate features and a filled-category count.

    Parameters
    ----------
    full : pd.DataFrame
        Raw data with User_ID, Product_ID, Gender, Age, City_Category,
        Stay_In_Current_City_Years, Product_Category_1/2/3 and Purchase.

    Returns
    -------
    pd.DataFrame
        The same frame (mutated in place) with the new feature columns.
    """
    # Product_ID like 'P00069042': drop the 2-character prefix, keep digits
    full['Product_ID'] = full['Product_ID'].str.slice(2).astype(int)
    # Missing category 2/3 is treated as "not in that category" -> fill 0
    # (assignment instead of chained inplace fillna avoids the pandas
    # chained-assignment FutureWarning)
    for col in ["Product_Category_2", "Product_Category_3"]:
        full[col] = full[col].fillna(0)
    # Integer-encode categorical strings
    full['Gender'] = full['Gender'].map({'F': 0, "M": 1}).astype(int)
    # Age bands are mapped to (roughly) their upper bound
    full['Age'] = full['Age'].map({'0-17': 17, '18-25': 25, '26-35': 35, '36-45': 45,
                                   '46-50': 50, '51-55': 55, '55+': 60}).astype(int)
    full['City_Category'] = full['City_Category'].map({'A': 0, "B": 1, "C": 2}).astype(int)
    full['Stay_In_Current_City_Years'] = full['Stay_In_Current_City_Years'].map(
        {'0': 0, '1': 1, "2": 2, "3": 3, '4+': 4}).astype(int)
    # ---- feature engineering -------------------------------------------
    # Mean purchase amount per product
    full['avg_purchase_per_product'] = full.groupby('Product_ID')['Purchase'].transform('mean')
    # Number of purchase records per product.
    # BUG FIX: the old value_counts()/reset_index()/rename({'index': ...})
    # sequence only works with pandas<2.0 column naming; transform('count')
    # is equivalent on every pandas version.
    full['product_count'] = full.groupby('Product_ID')['Product_ID'].transform('count')
    # Mean purchase amount per user
    full['avg_purchase_per_user'] = full.groupby('User_ID')['Purchase'].transform('mean')
    # Count how many of the three category slots are filled (0 = no match)
    conditions = [
        (full['Product_Category_1'] != 0) & (full['Product_Category_2'] == 0) & (full['Product_Category_3'] == 0),
        (full['Product_Category_1'] != 0) & (full['Product_Category_2'] != 0) & (full['Product_Category_3'] == 0),
        (full['Product_Category_1'] != 0) & (full['Product_Category_2'] != 0) & (full['Product_Category_3'] != 0)]
    choices = [1, 2, 3]
    full['Category_Count'] = np.select(conditions, choices, default=0)
    return full
# Load the full dataset (one CSV; the split happens later in this notebook)
train = pd.read_csv("./product_data.csv")
# Apply the cleaning / feature-engineering pipeline defined above
data = Data_Clearing(train)
复制代码
模型选择
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold,train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
import time
# 先准备好评价函数RMSE和R2,RMSE越小越好,R2越接近1越好
from sklearn.metrics import mean_squared_error as MSE ,r2_score as R2
# 评价函数
def print_metrics(predict):
    """Print MSE / RMSE / R2 of `predict` against the module-level y_test.

    Lower RMSE is better; R2 closer to 1 is better.
    """
    # NOTE(review): relies on the global y_test produced by the split below
    mse = MSE(y_true=y_test, y_pred=predict)
    # reuse mse instead of recomputing MSE a second time
    rmse = np.sqrt(mse)
    r2 = R2(y_true=y_test, y_pred=predict)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R2:", r2)
# Train/test split (fixed seed for reproducibility)
X = train.drop('Purchase', axis=1)
y = train['Purchase']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
# Candidate regressors to compare
models = [LinearRegression(),
          Ridge(),
          Lasso(alpha=0.01, max_iter=10000),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          ElasticNet(),
          SGDRegressor(),
          BayesianRidge(),
          ExtraTreesRegressor(),
          XGBRegressor()]
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "Ela", "SGD", "Bay", "Extra", "XGB"]
# Per-model predictions and scores, kept for the plots below
pred_df = pd.DataFrame()
pred_df["predict"] = y_test
rmses = []
r2s = []
# BUG FIX: time.clock() was removed in Python 3.8 -> time.perf_counter().
# Also renamed the timer variable so the builtin `all` is not shadowed.
total_start = time.perf_counter()
for name, model in zip(names, models):
    pre_start = time.perf_counter()
    print("%s" % name)
    # BUG FIX: fit on the TRAINING split only. The original fit on the
    # full (X, y), so the test rows leaked into training and every score
    # was over-optimistic.
    now_model = model.fit(X_train, y_train)
    model_predict = now_model.predict(X_test)
    # RMSE and R2 on the held-out test set
    rmse = np.sqrt(MSE(y_true=y_test, y_pred=model_predict))
    r2 = R2(y_true=y_test, y_pred=model_predict)
    pred_df["predict_" + name] = model_predict
    rmses.append(rmse)
    r2s.append(r2)
    print_metrics(model_predict)
    print("Time used:", time.perf_counter() - pre_start)
    print("-" * 20)
print("-" * 20)
print("Time used:", time.perf_counter() - total_start)
复制代码
SGD模型得到的rmse和r2太大,严重干扰画图,我们将SGD的数值修改一下能体现出SGD效果差即可
# Clamp the worst (SGD) score so it does not dwarf the rest of the chart
worst = rmses.index(max(rmses))
rmses[worst] = 4999
r2s[worst] = -2.0
# Horizontal bar chart of per-model RMSE (bars drawn bottom-up)
plt.figure(figsize=(10, 8), dpi=600)
positions = range(len(rmses))
plt.barh(positions, rmses, height=0.7, color='salmon', alpha=0.8)
plt.yticks(positions, names)
plt.xlim(0, 5500)
plt.xlabel(u"RMSE")
plt.ylabel(u'Models')
plt.title(u"RMSE")
# Annotate each bar with its RMSE value
for idx, val in enumerate(rmses):
    plt.text(val, idx - 0.1, '%s' % val)
plt.show()
复制代码
模型融合
from sklearn import ensemble, linear_model
from mlxtend.regressor import StackingCVRegressor, StackingRegressor
# Stacking ensemble: two tree ensembles on the first level feed an
# XGBoost meta-regressor on the second level.
clf1_1 = ensemble.RandomForestRegressor()
clf1_2 = ensemble.ExtraTreesRegressor()
clf2 = XGBRegressor()
np.random.seed(42)
stack = StackingRegressor(regressors=[clf1_1, clf1_2], meta_regressor=clf2)
# BUG FIX: time.clock() was removed in Python 3.8 -> time.perf_counter()
start = time.perf_counter()
print("%s" % "stack")
names.append("stack")
# BUG FIX: fit on the training split only; the original used the full
# (X, y), leaking the test rows into training.
now_model = stack.fit(X_train, y_train)
model_predict = now_model.predict(X_test)
rmse = np.sqrt(MSE(y_true=y_test, y_pred=model_predict))
r2 = R2(y_true=y_test, y_pred=model_predict)
pred_df["predict_stack"] = model_predict
rmses.append(rmse)
r2s.append(r2)
print_metrics(model_predict)
print("-" * 20)
print("Time used:", time.perf_counter() - start)
复制代码
# RMSE comparison chart including the stacked model (11 bars, bottom-up)
plt.figure(figsize=(12, 8), dpi=600)
bar_positions = range(11)
plt.barh(bar_positions, rmses, height=0.7, color='salmon', alpha=0.8)
plt.yticks(bar_positions, names)
plt.xlim(0, 5500)
plt.xlabel(u"RMSE")
plt.ylabel(u'Models')
plt.title(u"RMSE")
# Annotate each bar with its RMSE value
for idx, val in enumerate(rmses):
    plt.text(val + 0.025, idx - 0.1, '%s' % val)
plt.show()
复制代码
# R2 comparison chart for all 11 models (bars drawn bottom-up)
plt.figure(figsize=(12, 8), dpi=600)
bar_positions = range(11)
plt.barh(bar_positions, r2s, height=0.7, color='salmon', alpha=0.8)
plt.yticks(bar_positions, names)
plt.xlim(-2.5, 2.5)
plt.xlabel(u"R2")
plt.ylabel(u'Models')
plt.title(u"R2")
# Annotate each bar with its R2 value
for idx, val in enumerate(r2s):
    plt.text(val + 0.025, idx - 0.1, '%s' % val)
plt.show()
复制代码
可以看到预测的销售额还是很接近真实值的