# Data-mining project to build a talent (user) attrition model

# coding=utf-8
# @Time    : 2019/12/3 14:48
# @Author  : Z
# @Email   : S
# @File    : 2.6ML_SMOTO_talentFeatures.py

#数据挖掘项目构建人才(用户)流失模型

import pandas as pd

# Load the raw talent/attrition training data (IBM HR-style columns expected).
talent_data = pd.read_csv("./train.csv")
# 1. Organize the columns by data type.

# Numerical features
num_cols = ["Age", "MonthlyIncome", "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating",
            "StandardHours", "TotalWorkingYears", "YearsAtCompany",
            "YearsInCurrentRole", "YearsSinceLastPromotion"]
# Categorical features (string-valued; label/one-hot encoded later)
cat_cols=["Gender","MaritalStatus","OverTime"]
# Ordinal features (already integer-coded levels)
ord_cols=["DistanceFromHome","Education","EnvironmentSatisfaction","JobInvolvement",
          "JobLevel","JobSatisfaction","RelationshipSatisfaction","StockOptionLevel",
          "TrainingTimesLastYear","WorkLifeBalance"]
# Class label column ("Attrition": 1 = left the company, 0 = stayed; see below)
target_col=["Attrition"]
total_data=num_cols+ord_cols+cat_cols
# Keep only the selected feature columns plus the label in one working frame.
use_data=talent_data[total_data+target_col]

# 2. Ratio of positive to negative samples.
# The classes are imbalanced: far fewer leavers (1) than stayers (0),
# which motivates the SMOTE oversampling applied later.
neg_data = use_data[use_data["Attrition"] == 0] # stayed (negative class)
pos_data = use_data[use_data["Attrition"] == 1] # left (positive class)
print("正负样本比例:", len(pos_data)/len(neg_data))
print("离职:",len(pos_data))
print("未离职:",len(neg_data))

# 3. Split into train/test sets.
# Each class is split 80/20 separately so the class ratio is preserved in
# both sets (a manual stratified split). Equivalent sklearn approach:
# X=talent_data[total_data]
# y=talent_data["Attrition"]
# from sklearn.model_selection import train_test_split
# X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22,stratify=y)
neg_cut = int(len(neg_data) * 0.8)  # number of negatives that go to train
print(len(neg_data)*0.8)
train_neg_data=neg_data.iloc[:neg_cut].copy()
test_neg_data=neg_data.iloc[neg_cut:].copy()
print("train_neg_data:",len(train_neg_data))
print("test_neg_data:",len(test_neg_data))
pos_cut = int(len(pos_data) * 0.8)  # number of positives that go to train
print(len(pos_data)*0.8)
train_pos_data = pos_data.iloc[:pos_cut].copy()
test_pos_data = pos_data.iloc[pos_cut:].copy()
print("train_pos_data:",len(train_pos_data))
print("test_pos_data:",len(test_pos_data))
# Recombine the per-class splits.
train_data=pd.concat([train_neg_data,train_pos_data])
# BUG FIX: the original concatenated test_neg_data with itself, so the test
# set contained no positive (attrition) samples at all, making every test
# metric below meaningless. It must combine the negative AND positive parts.
test_data=pd.concat([test_neg_data,test_pos_data])
print("正负样本比例:",len(pos_data)/len(neg_data))
print("训练集的个数:",len(train_data))
print("训练集中正负样本比例",len(train_pos_data)/len(train_neg_data))
print("测试集中正负样本比例",len(test_pos_data)/len(test_neg_data))

# 4. Encode the categorical features.
# LabelEncoder: maps string classes to integers 0..(n_classes-1).
# OneHotEncoder: expands those integer codes into one-hot/dummy columns.
from sklearn.preprocessing import LabelEncoder
# Columns to encode: "Gender","MaritalStatus","OverTime"
gender_label_enc=LabelEncoder()
train_data["Gender_enc"]=gender_label_enc.fit_transform(train_data["Gender"])
marital_label_enc=LabelEncoder()
train_data["MaritalStatus_enc"]=marital_label_enc.fit_transform(train_data["MaritalStatus"])
OT_label_enc=LabelEncoder()
train_data["OT_enc"]=OT_label_enc.fit_transform(train_data["OverTime"])
print("=="*100)
print(train_data.groupby("Gender_enc").size())
print(train_data.groupby("MaritalStatus_enc").size())
print(train_data.groupby("OT_enc").size())
print("=="*100)
# One-hot encode the integer-coded columns; fit on the TRAINING data only.
from sklearn.preprocessing import OneHotEncoder
ohe_enc=OneHotEncoder()
train_cat_feats=ohe_enc.fit_transform(train_data[["Gender_enc","MaritalStatus_enc","OT_enc"]]).toarray()
print(type(train_data[["Gender_enc","MaritalStatus_enc","OT_enc"]]))
print(type(train_cat_feats))
print(train_cat_feats[:5,:])
# Test set: reuse the encoders already fitted on the training data.
test_data["Gender_enc"]=gender_label_enc.transform(test_data["Gender"])
test_data["MaritalStatus_enc"]=marital_label_enc.transform(test_data["MaritalStatus"])
test_data["OT_enc"]=OT_label_enc.transform(test_data["OverTime"])
# BUG FIX: the original called fit_transform here, RE-fitting the one-hot
# encoder on the test set. If any category is absent from the test split,
# that silently changes the column layout and misaligns train/test features.
# transform() keeps the train-fitted category layout.
test_cat_feats=ohe_enc.transform(test_data[["Gender_enc","MaritalStatus_enc","OT_enc"]]).toarray()

# Assemble the final feature matrices: numeric + ordinal + one-hot categorical.
print(type(train_data[num_cols]))
print(type(train_data[num_cols].values))
train_num_feats=train_data[num_cols].values
# NOTE: despite the name, these are the ORDINAL columns (ord_cols).
train_col_feats=train_data[ord_cols].values
import numpy as np
# Horizontally stack the three feature groups into one 2-D array.
train_feats=np.hstack([train_num_feats,train_col_feats,train_cat_feats])
# Labels come out as an (n, 1) column array (sliced with a 1-element list).
train_target=train_data[target_col].values
print(len(train_feats))
print(len(train_target))
# 879
# 879
# Test data: must use the same stacking order as the training features.
test_num_feats=test_data[num_cols].values
test_ord_feats=test_data[ord_cols].values
test_feats=np.hstack([test_num_feats,test_ord_feats,test_cat_feats])
test_target=test_data[target_col].values
print(len(test_feats))
print(len(test_target))
# 370
# 370


#仅需要对训练数据进行采样
from imblearn.over_sampling import SMOTE
smoto=SMOTE()
train_feats,train_target=smoto.fit_sample(train_feats,train_target)
from collections import Counter
# print(sorted(Counter(train_target).items()))

# Model 1: random-forest classifier on the SMOTE-balanced training set.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

rf = RandomForestClassifier(n_estimators=100, criterion="gini")
rf.fit(train_feats, train_target)

# Predict on the held-out test split and report train/test accuracy.
y_pred = rf.predict(test_feats)
print("model in trainset score is:", rf.score(train_feats, train_target))
print("model in testsize score is:", rf.score(test_feats, test_target))
# observed on one run: trainset 1.0, testset ~0.973

# Confusion matrix and per-class precision/recall/F1 report.
print("混淆矩阵:\n", confusion_matrix(test_target, y_pred))
print(classification_report(test_target, y_pred))
# Model 2: logistic regression (linear classifier) for comparison.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# FIX: the default max_iter=100 routinely fails to converge on this unscaled
# feature matrix (ConvergenceWarning), leaving a half-trained model. A larger
# iteration budget lets the solver finish; penalty="l2" is kept (the default).
lr = LogisticRegression(penalty="l2", max_iter=1000)
lr.fit(train_feats, train_target)

# Predict on the held-out test split and report train/test accuracy.
y_pred1 = lr.predict(test_feats)
print("model in trainset score is:", lr.score(train_feats, train_target))
print("model in testsize score is:", lr.score(test_feats, test_target))
# (the original comments here repeated the random-forest scores verbatim —
# they were stale copy/paste output, not logistic-regression results)

# Confusion matrix and per-class precision/recall/F1 report.
print("混淆矩阵:\n", confusion_matrix(test_target, y_pred1))
print(classification_report(test_target, y_pred1))
# Origin: blog.csdn.net/NewBeeMu/article/details/103516388
# (scraped blog footer: 189 original articles · 13 likes · 10000+ views)