欢迎关注,敬请点赞!
相关附件请从码云https://gitee.com/wenlong850606/Airlines_customer_value_analysis下载。
流程:
业务系统(挖掘目标)——>数据抽取 ——>数据探索与预处理——>建模&应用——>结果&反馈
参考R(消费时间间隔)F(频率)M(金额)模型,将客户关系长度L、消费时间间隔R、消费频率F、飞行里程M和折扣系数的均值C五个指标作为航空公司客户价值指标,LRFMC模型
导入数据
import pandas as pd
df = pd.read_csv('./air_data.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62988 entries, 0 to 62987
Data columns (total 44 columns):
MEMBER_NO 62988 non-null int64
FFP_DATE 62988 non-null object
FIRST_FLIGHT_DATE 62988 non-null object
GENDER 62985 non-null object
FFP_TIER 62988 non-null int64
WORK_CITY 60719 non-null object
WORK_PROVINCE 59740 non-null object
WORK_COUNTRY 62962 non-null object
AGE 62568 non-null float64
LOAD_TIME 62988 non-null object
FLIGHT_COUNT 62988 non-null int64
BP_SUM 62988 non-null int64
EP_SUM_YR_1 62988 non-null int64
EP_SUM_YR_2 62988 non-null int64
SUM_YR_1 62437 non-null float64
SUM_YR_2 62850 non-null float64
SEG_KM_SUM 62988 non-null int64
WEIGHTED_SEG_KM 62988 non-null float64
LAST_FLIGHT_DATE 62988 non-null object
AVG_FLIGHT_COUNT 62988 non-null float64
AVG_BP_SUM 62988 non-null float64
BEGIN_TO_FIRST 62988 non-null int64
LAST_TO_END 62988 non-null int64
AVG_INTERVAL 62988 non-null float64
MAX_INTERVAL 62988 non-null int64
ADD_POINTS_SUM_YR_1 62988 non-null int64
ADD_POINTS_SUM_YR_2 62988 non-null int64
EXCHANGE_COUNT 62988 non-null int64
avg_discount 62988 non-null float64
P1Y_Flight_Count 62988 non-null int64
L1Y_Flight_Count 62988 non-null int64
P1Y_BP_SUM 62988 non-null int64
L1Y_BP_SUM 62988 non-null int64
EP_SUM 62988 non-null int64
ADD_Point_SUM 62988 non-null int64
Eli_Add_Point_Sum 62988 non-null int64
L1Y_ELi_Add_Points 62988 non-null int64
Points_Sum 62988 non-null int64
L1Y_Points_Sum 62988 non-null int64
Ration_L1Y_Flight_Count 62988 non-null float64
Ration_P1Y_Flight_Count 62988 non-null float64
Ration_P1Y_BPS 62988 non-null float64
Ration_L1Y_BPS 62988 non-null float64
Point_NotFlight 62988 non-null int64
dtypes: float64(12), int64(24), object(8)
memory usage: 19.2+ MB
数据探索
explore_file = './explore_result.csv' # 数据探索结果文件
explore = df.describe(percentiles=[], include='all').T # T转置后方便查阅,转置后可以取'count'字段
explore['null'] = len(df) - explore['count']
explore = explore[['null', 'max', 'min']] # 空值、最大值、最小值
explore.columns = ['空值记录数', '最大值', '最小值'] # 属性列名重命名
explore.index.name = '属性名称' # 对索引命名
explore.to_csv(explore_file)
explore
空值记录数 | 最大值 | 最小值 | |
---|---|---|---|
属性名称 | |||
MEMBER_NO | 0 | 62988 | 1 |
FFP_DATE | 0 | NaN | NaN |
FIRST_FLIGHT_DATE | 0 | NaN | NaN |
GENDER | 3 | NaN | NaN |
FFP_TIER | 0 | 6 | 4 |
WORK_CITY | 2269 | NaN | NaN |
WORK_PROVINCE | 3248 | NaN | NaN |
WORK_COUNTRY | 26 | NaN | NaN |
AGE | 420 | 110 | 6 |
LOAD_TIME | 0 | NaN | NaN |
FLIGHT_COUNT | 0 | 213 | 2 |
BP_SUM | 0 | 505308 | 0 |
EP_SUM_YR_1 | 0 | 0 | 0 |
EP_SUM_YR_2 | 0 | 74460 | 0 |
SUM_YR_1 | 551 | 239560 | 0 |
SUM_YR_2 | 138 | 234188 | 0 |
SEG_KM_SUM | 0 | 580717 | 368 |
WEIGHTED_SEG_KM | 0 | 558440 | 0 |
LAST_FLIGHT_DATE | 0 | NaN | NaN |
AVG_FLIGHT_COUNT | 0 | 26.625 | 0.25 |
AVG_BP_SUM | 0 | 63163.5 | 0 |
BEGIN_TO_FIRST | 0 | 729 | 0 |
LAST_TO_END | 0 | 731 | 1 |
AVG_INTERVAL | 0 | 728 | 0 |
MAX_INTERVAL | 0 | 728 | 0 |
ADD_POINTS_SUM_YR_1 | 0 | 600000 | 0 |
ADD_POINTS_SUM_YR_2 | 0 | 728282 | 0 |
EXCHANGE_COUNT | 0 | 46 | 0 |
avg_discount | 0 | 1.5 | 0 |
P1Y_Flight_Count | 0 | 118 | 0 |
L1Y_Flight_Count | 0 | 111 | 0 |
P1Y_BP_SUM | 0 | 246197 | 0 |
L1Y_BP_SUM | 0 | 259111 | 0 |
EP_SUM | 0 | 74460 | 0 |
ADD_Point_SUM | 0 | 984938 | 0 |
Eli_Add_Point_Sum | 0 | 984938 | 0 |
L1Y_ELi_Add_Points | 0 | 728282 | 0 |
Points_Sum | 0 | 985572 | 0 |
L1Y_Points_Sum | 0 | 728282 | 0 |
Ration_L1Y_Flight_Count | 0 | 1 | 0 |
Ration_P1Y_Flight_Count | 0 | 1 | 0 |
Ration_P1Y_BPS | 0 | 0.999989 | 0 |
Ration_L1Y_BPS | 0 | 0.999993 | 0 |
Point_NotFlight | 0 | 140 | 0 |
数据清洗
cleaned_file = './cleaned_result.csv'
df = df[(df['SUM_YR_1'].notnull()) & (df['SUM_YR_2'].notnull())] # 删除票价为空的记录
index1 = (df['SUM_YR_1'] != 0) | (df['SUM_YR_2'] != 0) # 保留票价非0
index2 = (df['SEG_KM_SUM'] == 0) & (df['avg_discount'] == 0) # 保留平均折扣率和总飞行公里数同时为0的记录,有(极少)可能买过票,但观测窗口没有活跃度
df = df[index1 | index2]
df.to_csv(cleaned_file) # 导出数据清洗结果
数据规约
选择的相关指标: LOAD_TIME(观测窗口截至日期)、FFP_DATE(入会时间)、LAST_TO_END(最后一次乘机时间至观测窗口结束时长)、FLIGHT_COUNT(飞行次数)、SEG_KM_SUM(观测窗口飞行总公里数)、avg_discount(平均折扣率)
import numpy as np
scale_file = './scale_result.csv'
def scale_data(data): # 属性规约,数据变换函数
data = data[['LOAD_TIME', 'FFP_DATE', 'LAST_TO_END', 'FLIGHT_COUNT', 'SEG_KM_SUM', 'avg_discount']]
d_loadtime = pd.to_datetime(data['LOAD_TIME']) # 转换为时间类型
d_ffptime = pd.to_datetime(data['FFP_DATE'])
time_delta = d_loadtime - d_ffptime
temp_data = data.copy()
temp_data['L'] = time_delta.map(lambda x:x / np.timedelta64(30, 'D')) # 按一个月30天来转换
temp_data['R'] = data['LAST_TO_END']
temp_data['F'] = data['FLIGHT_COUNT']
temp_data['M'] = data['SEG_KM_SUM']
temp_data['C'] = data['avg_discount']
new_data = temp_data[['L', 'R', 'F', 'M', 'C']]
return new_data
new_data = scale_data(df)
new_data.describe() # 判断是否需要标准化
L | R | F | M | C | |
---|---|---|---|---|---|
count | 62044.000000 | 62044.000000 | 62044.000000 | 62044.000000 | 62044.000000 |
mean | 49.623036 | 172.532703 | 11.971359 | 17321.694749 | 0.722180 |
std | 28.262697 | 181.526164 | 14.110619 | 21052.728111 | 0.184833 |
min | 12.166667 | 1.000000 | 2.000000 | 368.000000 | 0.136017 |
25% | 24.500000 | 29.000000 | 3.000000 | 4874.000000 | 0.613085 |
50% | 42.600000 | 105.000000 | 7.000000 | 10200.000000 | 0.712162 |
75% | 72.733333 | 260.000000 | 15.000000 | 21522.500000 | 0.809293 |
max | 114.566667 | 731.000000 | 213.000000 | 580717.000000 | 1.500000 |
规范化
ascore_data = (new_data - new_data.mean(axis = 0)) / (new_data.std(axis = 0))
ascore_data.columns = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC'] # 属性列名重命名
ascore_data.to_csv(scale_file)
ascore_data
ZL | ZR | ZF | ZM | ZC | |
---|---|---|---|---|---|
0 | 1.435707 | -0.944948 | 14.034016 | 26.761154 | 1.295540 |
1 | 1.307152 | -0.911894 | 9.073213 | 13.126864 | 2.868176 |
2 | 1.328381 | -0.889859 | 8.718869 | 12.653481 | 2.880950 |
3 | 0.658476 | -0.416098 | 0.781585 | 12.540622 | 1.994714 |
4 | 0.386032 | -0.922912 | 9.923636 | 13.898736 | 1.344335 |
... | ... | ... | ... | ... | ... |
62974 | 2.076128 | -0.460169 | -0.706656 | -0.805297 | -0.065898 |
62975 | 0.557046 | -0.283886 | -0.706656 | -0.805297 | -0.282309 |
62976 | -0.149421 | -0.735611 | -0.706656 | -0.772332 | -2.689885 |
62977 | -1.206173 | 1.605649 | -0.706656 | -0.779837 | -2.554628 |
62978 | -0.479656 | 0.603039 | -0.706656 | -0.786677 | -2.392319 |
62044 rows × 5 columns
建模
扫描二维码关注公众号,回复:
11157737 查看本文章
from sklearn.cluster import KMeans # 导入K均值聚类算法
from sklearn.externals import joblib
km = KMeans(n_clusters=5)
km.fit(ascore_data)
joblib.dump(km, './km.kpl')
c:\users\13721\appdata\local\programs\python\python37-32\lib\site-packages\sklearn\externals\joblib\__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
warnings.warn(msg, category=FutureWarning)
['./km.kpl']
km = joblib.load('./km.kpl')
r1 = pd.Series(km.labels_)
r1 = r1.value_counts() # 每个聚类的样本个数
r2 = pd.DataFrame(km.cluster_centers_) # 聚类的中心
r = pd.concat([r2, r1], axis=1) # 进行列合并
r.columns = list(ascore_data.columns) + ['聚类个数'] # 列名重命名
r.index.name = '聚类类别' # 索引名称
r.index = ['客户群体1', '客户群体2', '客户群体3', '客户群体4', '客户群体5']
r.to_csv('./cluster_result.csv')
r
ZL | ZR | ZF | ZM | ZC | 聚类个数 | |
---|---|---|---|---|---|---|
客户群体1 | 1.160667 | -0.377221 | -0.086919 | -0.094844 | -0.155905 | 15740 |
客户群体2 | 0.051843 | -0.002668 | -0.226803 | -0.231254 | 2.191347 | 4184 |
客户群体3 | 0.483328 | -0.799383 | 2.483202 | 2.424724 | 0.308630 | 5336 |
客户群体4 | -0.313678 | 1.686258 | -0.574016 | -0.536820 | -0.173326 | 12125 |
客户群体5 | -0.700206 | -0.414888 | -0.161143 | -0.160958 | -0.255132 | 24659 |
r.describe()
ZL | ZR | ZF | ZM | ZC | 聚类个数 | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 0.136391 | 0.018420 | 0.286864 | 0.280170 | 0.383123 | 12408.800000 |
std | 0.720956 | 0.974066 | 1.241903 | 1.210720 | 1.034744 | 8350.399254 |
min | -0.700206 | -0.799383 | -0.574016 | -0.536820 | -0.255132 | 4184.000000 |
25% | -0.313678 | -0.414888 | -0.226803 | -0.231254 | -0.173326 | 5336.000000 |
50% | 0.051843 | -0.377221 | -0.161143 | -0.160958 | -0.155905 | 12125.000000 |
75% | 0.483328 | -0.002668 | -0.086919 | -0.094844 | 0.308630 | 15740.000000 |
max | 1.160667 | 1.686258 | 2.483202 | 2.424724 | 2.191347 | 24659.000000 |
from pyecharts import options as opts
from pyecharts.charts import Radar
def radar_base(array) -> Radar:
c = (
Radar()
.add_schema(
schema=[
opts.RadarIndicatorItem(name=array.columns[0] + '入会时长', max_=1.5, min_=-1.0),
opts.RadarIndicatorItem(name=array.columns[1] + '最近间距', max_=2.0, min_=-1.0),
opts.RadarIndicatorItem(name=array.columns[2] + '消费频率', max_=2.5, min_=-1.0),
opts.RadarIndicatorItem(name=array.columns[3] + '飞行里程', max_=2.5, min_=-1.0),
opts.RadarIndicatorItem(name=array.columns[4] + '折扣均值', max_=2.5, min_=-1.0),
]
)
.add(array.index[0], [list(array.values[0, :5])], color='yellow')
.add(array.index[1], [list(array.values[1, :5])], color='black')
.add(array.index[2], [list(array.values[2, :5])], color='red')
.add(array.index[3], [list(array.values[3, :5])], color='blue')
.add(array.index[4], [list(array.values[4, :5])], color='green')
.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
.set_global_opts(title_opts=opts.TitleOpts(title="航空客户价值分析雷达图"))
)
return c
radar_base(r).render_notebook()
分析结论:
(此处折扣定义:0.5折,0.7折,越高机票越贵)
客户群体1:[近期乘机],频率低,里程短,[等级低],折扣小,[老客户],评价:[价值不高],[维持]
客户群体2:[近期乘机不多],频率低,里程短,[等级低],折扣高,[中等客户],评论:[潜在客户],[开发]
客户群体3:[近期乘机],频率高,里程长,[等级高],折扣较高,[老客户],评价:[高价值客户],[重点维护]
客户群体4:[长时间没乘机],频率低,里程短,[等级低],折扣小,[新客户],评价:[价值不高]
客户群体5:[近期乘机],频率低,里程短,[等级低],折扣小,[新客户],评价:[潜在客户],[开发]
欢迎关注,敬请点赞!
返回顶部