E-commerce (online retail) data mining

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the retail transactions workbook into a DataFrame.
data = pd.read_excel('Online Retail.xlsx')
# Parse the invoice timestamps and derive the per-row turnover
# (unit price multiplied by quantity) in one assignment step.
data = data.assign(
    InvoiceDate=pd.to_datetime(data['InvoiceDate']),
    Turnover=data['UnitPrice'] * data['Quantity'],
)
def get_canceled_status(InvoiceNo, canceled=1, normal=0):
    """Classify an invoice number as canceled or normal.

    An invoice is treated as canceled when the letter "c" (upper or lower
    case) appears anywhere in the string form of its number.

    Args:
        InvoiceNo: Invoice identifier; any value accepted by ``str()``.
        canceled: Value returned for a canceled invoice (default 1).
        normal: Value returned for any other invoice (default 0).

    Returns:
        ``canceled`` if the invoice number contains "c"/"C", else ``normal``.
    """
    return canceled if 'c' in str(InvoiceNo).lower() else normal

# Flag every row: 1 when its invoice number marks a cancellation, 0 otherwise.
data['is_canceled'] = data['InvoiceNo'].map(
    lambda invoice_no: get_canceled_status(invoice_no, 1, 0)
)
# 观察数据
import missingno as msno
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
msno.bar(data,figsize=(11,3))
#
dx_invalid = [get_canceled_status(i,-1,1)*q<=0 for (i,q) in
zip(data.InvoiceNo,data.Quantity)]
data.drop(data.index[idx_invalid],inplace=True)
# 删除重复数据
data = data.drop_duplicates() # data.duplicated().sum()
# 删除 CustomerID 为空的数据
data = data.dropna(subset=['CustomerID']) # data.isnull().sum().sum()
# 删除单价 小于等于 0 的数据
idx_invalid = [ int(i)<=0 for i in data.UnitPrice]
data.drop(data.index[idx_invalid],inplace=True)
# 重设 index
data = data.reset_index(drop=True)
# Module 1: historical sales overview

import datetime as dt
import matplotlib.pyplot as plt

# Month number of each invoice, used as the grouping key below.
data['Month'] = data['InvoiceDate'].dt.month

# Sum turnover and quantity per month in a single grouping pass.
monthly_totals = data.groupby('Month')[['Turnover', 'Quantity']].sum()
turnover_month = monthly_totals['Turnover']
quantity_month = monthly_totals['Quantity']

# Draw one line chart per metric: monthly turnover, then monthly quantity.
for monthly_series, y_label, chart_title in (
    (turnover_month, '成交额', '每月成交额'),
    (quantity_month, '成交数量', '每月成交数量'),
):
    plt.figure(figsize=(10, 5))
    plt.rc('font', family='SimHei', size=13)  # font that can display the Chinese labels
    plt.plot(monthly_series)
    plt.xlabel('月份')
    plt.ylabel(y_label)
    plt.title(chart_title)
# Monthly canceled-order amount, reported as a positive magnitude via abs().
canceled_month = (
    data[data.is_canceled == 1].groupby('Month')['Turnover'].sum().abs()
)
# Plot it.
plt.figure(figsize=(10, 5))
plt.rc('font', family='SimHei', size=13)  # font that can display the Chinese labels
# Plot the canceled amounts themselves (not the overall monthly turnover),
# so the curve matches the axis label and title below.
plt.plot(canceled_month)
plt.xlabel('月份')
plt.ylabel('取消订单金额')
plt.title('每月取消订单金额')

# Monthly canceled-order quantity, reported as a positive magnitude via abs().
canceled_quantity_month = (
    data[data.is_canceled == 1].groupby('Month')['Quantity'].sum().abs()
)
# Plot it.
plt.figure(figsize=(10, 5))
plt.rc('font', family='SimHei', size=13)  # font that can display the Chinese labels
plt.plot(canceled_quantity_month)
plt.xlabel('月份')
plt.ylabel('取消订单数量')
plt.title('每月取消订单数量')

# Turnover per country, computed from non-canceled rows only and shown as
# a horizontal bar chart sorted in ascending order.
plt.figure(figsize=(10, 8))
plt.rc('font', family='SimHei', size=13)  # font that can display the Chinese labels
completed_rows = data[data.is_canceled == 0]
turnover_by_country = completed_rows.groupby('Country')['Turnover'].sum()
turnover_by_country.sort_values(ascending=True).plot.barh()
plt.xlabel('成交金额')
plt.ylabel('国家')
plt.title('不同国家成交金额')
# Module 2: investigating why orders are canceled
# Print the cancellation rate: the share of rows flagged is_canceled == 1.
count = int((data.is_canceled == 1).sum())
# `precision` keeps its original name but holds the cancellation rate.
# An empty table yields NaN instead of raising ZeroDivisionError.
precision = count / len(data.is_canceled) if len(data.is_canceled) else float('nan')
print("canceled_rate is {}".format(precision))
# The resampling step cannot handle non-numeric features, so InvoiceNo,
# StockCode and InvoiceDate are each mapped to integer codes.
# Unique values are sorted by their string form so the codes are the same on
# every run (plain set iteration order is not guaranteed to be stable).
# The reverse lookups are keyed by str(value).
unique_stock_code = sorted(set(data.StockCode), key=str)
mapping_idx2stock_code = dict(enumerate(unique_stock_code))
mapping_stock_code2idx = {str(v): k for k, v in mapping_idx2stock_code.items()}

unique_invoice_number = sorted(set(data.InvoiceNo), key=str)
mapping_idx2invoice_number = dict(enumerate(unique_invoice_number))
mapping_invoice_number2idx = {
    str(v): k for k, v in mapping_idx2invoice_number.items()
}

unique_invoice_date = sorted(set(data.InvoiceDate), key=str)
mapping_idx2invoice_date = dict(enumerate(unique_invoice_date))
mapping_invoice_date2idx = {str(v): k for k, v in mapping_idx2invoice_date.items()}

# Work on a deep copy so the original table keeps its raw values.
mapping_data = data.copy(deep=True)
mapping_data.StockCode = [mapping_stock_code2idx[str(v)] for v in data.StockCode]
mapping_data.InvoiceNo = [mapping_invoice_number2idx[str(v)] for v in data.InvoiceNo]
mapping_data.InvoiceDate = [mapping_invoice_date2idx[str(v)] for v in data.InvoiceDate]

# Feature matrix and target used by the later resampling/training steps.
x = mapping_data[['InvoiceNo','StockCode','InvoiceDate','UnitPrice','CustomerID']]
y = mapping_data['is_canceled']

# Row counts per is_canceled class in the original data set.
groupby_data_orgianl = mapping_data.groupby('is_canceled').count()
groupby_data_orgianl  # notebook display of the original class distribution
# Positive and negative samples are imbalanced, so oversample with SMOTE.
from imblearn.over_sampling import SMOTE

# Oversample using the SMOTE method.
model_smote = SMOTE()  # build the SMOTE model object
# fit_resample is the supported entry point; the older fit_sample alias has
# been removed from imbalanced-learn.
x_smote_resampled, y_smote_resampled = model_smote.fit_resample(x, y)
# Wrap the results in DataFrames with explicit column names.
x_smote_resampled = pd.DataFrame(
    x_smote_resampled,
    columns=['InvoiceNo','StockCode','InvoiceDate','UnitPrice','CustomerID'])
y_smote_resampled = pd.DataFrame(y_smote_resampled, columns=['is_canceled'])
# Join features and target column-wise.
smote_resampled = pd.concat([x_smote_resampled, y_smote_resampled], axis=1)
# Row counts per is_canceled class after SMOTE.
groupby_data_smote = smote_resampled.groupby('is_canceled').count()
groupby_data_smote  # notebook display of the resampled class distribution
# Split into training and validation sets, then train the model.
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
X_train, X_valid, y_train, y_valid = train_test_split(
    x_smote_resampled, y_smote_resampled)
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# estimator constructor rather than fit() - confirm against the installed
# version.
# NOTE(review): the target is a 0/1 flag but a regressor is fitted - confirm
# this is intended.
my_model.fit(X_train, y_train,
             early_stopping_rounds=5,
             eval_set=[(X_valid, y_valid)],
             verbose=False)
# Print the mean absolute error on the validation set.
from sklearn.metrics import mean_absolute_error
predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))
from matplotlib import pyplot as plt
# Print the feature importances.
print(my_model.feature_importances_)
# Bar chart of the importances, labelled with the training column names so
# the ticks always match the features actually used.
feature_names = list(X_train.columns)
plt.figure(figsize=(10,5))
plt.bar(range(len(my_model.feature_importances_)), my_model.feature_importances_)
plt.xticks(range(len(feature_names)), feature_names)
plt.show()
# Module 3: RFM model
# Remove canceled-order rows, identified here by a non-positive quantity.
idx_invalid = data['Quantity'] <= 0
data = data.loc[~idx_invalid].reset_index(drop=True)

# last_date = latest purchase timestamp + one day.
last_date = data['InvoiceDate'].max() + dt.timedelta(days=1)
last_date  # notebook display

# Per-customer aggregates:
#   Recency       - whole days between last_date and the customer's latest invoice
#   Frequency     - number of rows (InvoiceNo count, not distinct invoices)
#   MonetaryValue - summed turnover
rfm = data.groupby(['CustomerID']).agg({
    'InvoiceDate': lambda x: (last_date - x.max()).days,
    'InvoiceNo': 'count',
    'Turnover': 'sum',
})
rfm.rename(
    columns={
        'InvoiceDate': 'Recency',
        'InvoiceNo': 'Frequency',
        'Turnover': 'MonetaryValue',
    },
    inplace=True,
)

# z-score standardisation; the result keeps rfm's index and column names.
from sklearn import preprocessing
rfm_copy = pd.DataFrame(
    preprocessing.scale(rfm), index=rfm.index, columns=rfm.columns
)
# Elbow method for choosing the number of clusters.
# Reference: https://bl.ocks.org/rpgove/0060ff3b656618e9136b
from sklearn.cluster import KMeans
import seaborn as sns
Max = 15  # largest number of clusters tried
my_range = list(range(1, Max + 1))
# Fit one KMeans model per candidate k and record its score on the same data.
my_score = [
    KMeans(n_clusters=n_clusters).fit(rfm_copy).score(rfm_copy)
    for n_clusters in my_range
]
plt.figure(figsize=(5, 3))
plt.plot(my_range, my_score)
plt.rcParams['axes.unicode_minus'] = False  # so minus signs display correctly
plt.xlabel('Number of Clusters(k)')
plt.ylabel('SSE')
plt.title('Elbow Curve')
plt.show()
# K-means with three clusters.
# Cluster on the feature columns only, so re-running this block after the
# 'cluster' column has been added does not feed old labels back in.
rfm_features = rfm_copy.drop(columns='cluster', errors='ignore')
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=3).fit(rfm_features)
centroids = pd.DataFrame(kmeans.cluster_centers_)
list1 = ["用户群 0","用户群 1","用户群 2"]
list2 = ["R","F","M"]
centroids.index = list1
centroids.columns = list2
# Show the cluster centres.
centroids
rfm_copy['cluster'] = kmeans.labels_
rfm_copy.head()
from sklearn import metrics
labels = kmeans.labels_
# Score the clustering on the features alone: including the 'cluster' column
# would leak the labels into the data being scored.
# calinski_harabasz_score is the current scikit-learn name of this metric.
Calinski_Harabaz_Index = metrics.calinski_harabasz_score(rfm_features, labels)
print("Calinski-Harabaz Index is %f" % (Calinski_Harabaz_Index))
# One box plot per RFM dimension, each on its own figure so they do not
# overdraw each other; x/y are passed by keyword as seaborn requires.
for rfm_column in ('Recency', 'Frequency', 'MonetaryValue'):
    plt.figure()
    sns.boxplot(x=rfm_copy['cluster'], y=rfm_copy[rfm_column])

 

Released four original articles · won praise 0 · Views 803

Guess you like

Origin blog.csdn.net/qq_36234688/article/details/104338421