“上帝”的价值——客户价值分析

欢迎关注，敬请点赞！

“上帝”的价值——航空公司客户价值分析

流程：

导入数据
数据探索
数据清洗
数据规约
规范化
建模

分析结论：

相关附件请从码云下载： https://gitee.com/wenlong850606/Airlines_customer_value_analysis

流程：

业务系统（挖掘目标）——>数据抽取 ——>数据探索与预处理——>建模&应用——>结果&反馈

参考RFM模型（R：最近消费时间间隔，F：消费频率，M：消费金额），将客户关系长度L、消费时间间隔R、消费频率F、飞行里程M和折扣系数的均值C五个指标作为航空公司客户价值指标，构成LRFMC模型。

导入数据

import pandas as pd


# Load the raw customer records; the relative path is resolved against the
# current working directory.
data_file = './air_data.csv'
df = pd.read_csv(data_file)
# Print each column's dtype and non-null count to spot missing values early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62988 entries, 0 to 62987
Data columns (total 44 columns):
MEMBER_NO                  62988 non-null int64
FFP_DATE                   62988 non-null object
FIRST_FLIGHT_DATE          62988 non-null object
GENDER                     62985 non-null object
FFP_TIER                   62988 non-null int64
WORK_CITY                  60719 non-null object
WORK_PROVINCE              59740 non-null object
WORK_COUNTRY               62962 non-null object
AGE                        62568 non-null float64
LOAD_TIME                  62988 non-null object
FLIGHT_COUNT               62988 non-null int64
BP_SUM                     62988 non-null int64
EP_SUM_YR_1                62988 non-null int64
EP_SUM_YR_2                62988 non-null int64
SUM_YR_1                   62437 non-null float64
SUM_YR_2                   62850 non-null float64
SEG_KM_SUM                 62988 non-null int64
WEIGHTED_SEG_KM            62988 non-null float64
LAST_FLIGHT_DATE           62988 non-null object
AVG_FLIGHT_COUNT           62988 non-null float64
AVG_BP_SUM                 62988 non-null float64
BEGIN_TO_FIRST             62988 non-null int64
LAST_TO_END                62988 non-null int64
AVG_INTERVAL               62988 non-null float64
MAX_INTERVAL               62988 non-null int64
ADD_POINTS_SUM_YR_1        62988 non-null int64
ADD_POINTS_SUM_YR_2        62988 non-null int64
EXCHANGE_COUNT             62988 non-null int64
avg_discount               62988 non-null float64
P1Y_Flight_Count           62988 non-null int64
L1Y_Flight_Count           62988 non-null int64
P1Y_BP_SUM                 62988 non-null int64
L1Y_BP_SUM                 62988 non-null int64
EP_SUM                     62988 non-null int64
ADD_Point_SUM              62988 non-null int64
Eli_Add_Point_Sum          62988 non-null int64
L1Y_ELi_Add_Points         62988 non-null int64
Points_Sum                 62988 non-null int64
L1Y_Points_Sum             62988 non-null int64
Ration_L1Y_Flight_Count    62988 non-null float64
Ration_P1Y_Flight_Count    62988 non-null float64
Ration_P1Y_BPS             62988 non-null float64
Ration_L1Y_BPS             62988 non-null float64
Point_NotFlight            62988 non-null int64
dtypes: float64(12), int64(24), object(8)
memory usage: 19.2+ MB

数据探索

返回顶部

# Destination of the per-column exploration summary.
explore_file = './explore_result.csv'

# describe() is transposed so each source column becomes one row; its 'count'
# field (non-null tally) then lets us derive the number of missing values.
explore = df.describe(percentiles=[], include='all').T
explore['null'] = len(df) - explore['count']

# Keep only missing-value count, maximum and minimum, then relabel the columns
# and name the index in one chained expression.
explore = (
    explore[['null', 'max', 'min']]
    .rename(columns={'null': '空值记录数', 'max': '最大值', 'min': '最小值'})
    .rename_axis('属性名称')
)
explore.to_csv(explore_file)

explore

	空值记录数	最大值	最小值
属性名称
MEMBER_NO	0	62988	1
FFP_DATE	0	NaN	NaN
FIRST_FLIGHT_DATE	0	NaN	NaN
GENDER	3	NaN	NaN
FFP_TIER	0	6	4
WORK_CITY	2269	NaN	NaN
WORK_PROVINCE	3248	NaN	NaN
WORK_COUNTRY	26	NaN	NaN
AGE	420	110	6
LOAD_TIME	0	NaN	NaN
FLIGHT_COUNT	0	213	2
BP_SUM	0	505308	0
EP_SUM_YR_1	0	0	0
EP_SUM_YR_2	0	74460	0
SUM_YR_1	551	239560	0
SUM_YR_2	138	234188	0
SEG_KM_SUM	0	580717	368
WEIGHTED_SEG_KM	0	558440	0
LAST_FLIGHT_DATE	0	NaN	NaN
AVG_FLIGHT_COUNT	0	26.625	0.25
AVG_BP_SUM	0	63163.5	0
BEGIN_TO_FIRST	0	729	0
LAST_TO_END	0	731	1
AVG_INTERVAL	0	728	0
MAX_INTERVAL	0	728	0
ADD_POINTS_SUM_YR_1	0	600000	0
ADD_POINTS_SUM_YR_2	0	728282	0
EXCHANGE_COUNT	0	46	0
avg_discount	0	1.5	0
P1Y_Flight_Count	0	118	0
L1Y_Flight_Count	0	111	0
P1Y_BP_SUM	0	246197	0
L1Y_BP_SUM	0	259111	0
EP_SUM	0	74460	0
ADD_Point_SUM	0	984938	0
Eli_Add_Point_Sum	0	984938	0
L1Y_ELi_Add_Points	0	728282	0
Points_Sum	0	985572	0
L1Y_Points_Sum	0	728282	0
Ration_L1Y_Flight_Count	0	1	0
Ration_P1Y_Flight_Count	0	1	0
Ration_P1Y_BPS	0	0.999989	0
Ration_L1Y_BPS	0	0.999993	0
Point_NotFlight	0	140	0

数据清洗

返回顶部

cleaned_file = './cleaned_result.csv'

# Discard rows where either yearly fare total is missing.
df = df.dropna(subset=['SUM_YR_1', 'SUM_YR_2'])

# index1: at least one of the two yearly fare totals is non-zero.
index1 = df[['SUM_YR_1', 'SUM_YR_2']].ne(0).any(axis=1)
# index2: flown distance and average discount are both zero. Author's
# rationale: such a customer may (rarely) have bought a ticket yet shown no
# activity inside the observation window, so the record is kept.
index2 = df['SEG_KM_SUM'].eq(0) & df['avg_discount'].eq(0)

df = df.loc[index1 | index2]
df.to_csv(cleaned_file)  # persist the cleaned records

数据规约

选择的相关指标：LOAD_TIME（观测窗口截止日期）、FFP_DATE（入会时间）、LAST_TO_END（最后一次乘机时间至观测窗口结束的时长）、FLIGHT_COUNT（飞行次数）、SEG_KM_SUM（观测窗口内飞行总公里数）、avg_discount（平均折扣率）

import numpy as np


scale_file = './scale_result.csv'  # output path for the standardised features


def scale_data(data):
    """Reduce the customer table to the five LRFMC indicators.

    Args:
        data: DataFrame holding at least the columns ``LOAD_TIME``,
            ``FFP_DATE``, ``LAST_TO_END``, ``FLIGHT_COUNT``, ``SEG_KM_SUM``
            and ``avg_discount``. The two date columns must be parseable by
            ``pd.to_datetime``.

    Returns:
        A new DataFrame on the same index with the columns, in order,
        ``L`` (time from ``FFP_DATE`` to ``LOAD_TIME`` expressed in 30-day
        months), ``R`` (``LAST_TO_END``), ``F`` (``FLIGHT_COUNT``),
        ``M`` (``SEG_KM_SUM``) and ``C`` (``avg_discount``). The input
        frame is left untouched.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    renames = {
        'LAST_TO_END': 'R',
        'FLIGHT_COUNT': 'F',
        'SEG_KM_SUM': 'M',
        'avg_discount': 'C',
    }
    membership = pd.to_datetime(data['LOAD_TIME']) - pd.to_datetime(data['FFP_DATE'])

    # Selecting + renaming yields an independent frame, so inserting 'L'
    # below cannot write back into the caller's data.
    new_data = data[list(renames)].rename(columns=renames)
    # Vectorised timedelta division (one month counted as 30 days) replaces
    # a per-row Python lambda.
    new_data.insert(0, 'L', membership / np.timedelta64(30, 'D'))
    return new_data

# Build the LRFMC feature table from the cleaned records.
new_data = scale_data(df)
new_data.describe()  # inspect the value ranges to decide whether standardisation is needed

	L	R	F	M	C
count	62044.000000	62044.000000	62044.000000	62044.000000	62044.000000
mean	49.623036	172.532703	11.971359	17321.694749	0.722180
std	28.262697	181.526164	14.110619	21052.728111	0.184833
min	12.166667	1.000000	2.000000	368.000000	0.136017
25%	24.500000	29.000000	3.000000	4874.000000	0.613085
50%	42.600000	105.000000	7.000000	10200.000000	0.712162
75%	72.733333	260.000000	15.000000	21522.500000	0.809293
max	114.566667	731.000000	213.000000	580717.000000	1.500000

规范化

返回顶部

# Z-score each indicator: subtract the column mean, then divide by the column
# standard deviation (std() called with its default arguments).
column_mean = new_data.mean(axis=0)
column_std = new_data.std(axis=0)
ascore_data = new_data.sub(column_mean, axis='columns').div(column_std, axis='columns')
ascore_data.columns = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']  # 'Z' prefix marks standardised columns
ascore_data.to_csv(scale_file)

ascore_data

	ZL	ZR	ZF	ZM	ZC
0	1.435707	-0.944948	14.034016	26.761154	1.295540
1	1.307152	-0.911894	9.073213	13.126864	2.868176
2	1.328381	-0.889859	8.718869	12.653481	2.880950
3	0.658476	-0.416098	0.781585	12.540622	1.994714
4	0.386032	-0.922912	9.923636	13.898736	1.344335
...	...	...	...	...	...
62974	2.076128	-0.460169	-0.706656	-0.805297	-0.065898
62975	0.557046	-0.283886	-0.706656	-0.805297	-0.282309
62976	-0.149421	-0.735611	-0.706656	-0.772332	-2.689885
62977	-1.206173	1.605649	-0.706656	-0.779837	-2.554628
62978	-0.479656	0.603039	-0.706656	-0.786677	-2.392319

62044 rows × 5 columns

建模

返回顶部

扫描二维码关注公众号，回复： 11157737 查看本文章

from sklearn.cluster import KMeans  # K-means clustering estimator

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone joblib package and fall back to the vendored
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): no random_state is passed, so cluster numbering (and possibly
# the clusters themselves) can differ between runs; consider fixing a seed if
# the written conclusions must stay tied to specific group numbers.
km = KMeans(n_clusters=5)
km.fit(ascore_data)
joblib.dump(km, './km.kpl')  # persist the fitted model for the next step

c:\users\13721\appdata\local\programs\python\python37-32\lib\site-packages\sklearn\externals\joblib\__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
  warnings.warn(msg, category=FutureWarning)

['./km.kpl']

# Reload the fitted model saved by the clustering step.
km = joblib.load('./km.kpl')

# Samples per cluster. value_counts() orders by frequency, so sort by cluster
# label to line the counts up explicitly with the rows of cluster_centers_.
r1 = pd.Series(km.labels_).value_counts().sort_index()
r2 = pd.DataFrame(km.cluster_centers_)  # one row of centre coordinates per cluster
r = pd.concat([r2, r1], axis=1)  # centres and sizes side by side
r.columns = list(ascore_data.columns) + ['聚类个数']

# Attach the index name together with the labels: naming the index first and
# replacing it afterwards (as was done before) silently drops the name, which
# left the first CSV header cell blank.
r.index = pd.Index(
    ['客户群体%d' % (label + 1) for label in range(len(r))],
    name='聚类类别',
)

r.to_csv('./cluster_result.csv')

	ZL	ZR	ZF	ZM	ZC	聚类个数
客户群体1	1.160667	-0.377221	-0.086919	-0.094844	-0.155905	15740
客户群体2	0.051843	-0.002668	-0.226803	-0.231254	2.191347	4184
客户群体3	0.483328	-0.799383	2.483202	2.424724	0.308630	5336
客户群体4	-0.313678	1.686258	-0.574016	-0.536820	-0.173326	12125
客户群体5	-0.700206	-0.414888	-0.161143	-0.160958	-0.255132	24659

r.describe()

	ZL	ZR	ZF	ZM	ZC	聚类个数
count	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000
mean	0.136391	0.018420	0.286864	0.280170	0.383123	12408.800000
std	0.720956	0.974066	1.241903	1.210720	1.034744	8350.399254
min	-0.700206	-0.799383	-0.574016	-0.536820	-0.255132	4184.000000
25%	-0.313678	-0.414888	-0.226803	-0.231254	-0.173326	5336.000000
50%	0.051843	-0.377221	-0.161143	-0.160958	-0.155905	12125.000000
75%	0.483328	-0.002668	-0.086919	-0.094844	0.308630	15740.000000
max	1.160667	1.686258	2.483202	2.424724	2.191347	24659.000000

返回顶部

from pyecharts import options as opts
from pyecharts.charts import Radar


def radar_base(array) -> Radar:
    """Build the radar chart comparing five customer groups on five axes.

    Args:
        array: Table whose first five column labels (strings) name the radar
            axes and whose first five rows supply one series each; the row
            labels are used as series names. It is accessed through
            ``.columns``, ``.index`` and ``.values``.

    Returns:
        The configured ``Radar`` chart (not yet rendered).
    """
    # (axis caption suffix, axis maximum); every axis starts at -1.0.
    axis_specs = [
        ('入会时长', 1.5),
        ('最近间距', 2.0),
        ('消费频率', 2.5),
        ('飞行里程', 2.5),
        ('折扣均值', 2.5),
    ]
    series_colors = ['yellow', 'black', 'red', 'blue', 'green']

    indicators = [
        opts.RadarIndicatorItem(name=array.columns[pos] + caption, max_=upper, min_=-1.0)
        for pos, (caption, upper) in enumerate(axis_specs)
    ]

    chart = Radar()
    chart.add_schema(schema=indicators)
    # One series per row, in row order, each with its fixed colour.
    for row, color in enumerate(series_colors):
        chart.add(array.index[row], [list(array.values[row, :5])], color=color)
    chart.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    chart.set_global_opts(title_opts=opts.TitleOpts(title="航空客户价值分析雷达图"))
    return chart

# Draw the cluster summary table as a radar chart inside the notebook.
radar_base(r).render_notebook()

客户价值雷达图

分析结论：

（此处“折扣”指折扣系数：0.5即5折，0.7即7折，系数越高机票越贵）
客户群体1：[近期乘机]，频率低，里程短，[等级低]，折扣小，[老客户]，评价：[价值不高]，[维持]

客户群体2：[近期乘机不多]，频率低，里程短，[等级低]，折扣高，[中等客户]，评价：[潜在客户]，[开发]

客户群体3：[近期乘机]，频率高，里程长，[等级高]，折扣较高，[老客户]，评价：[高价值客户]，[重点维护]

客户群体4：[长时间没乘机]，频率低，里程短，[等级低]，折扣小，[新客户]，评价：[价值不高]

客户群体5：[近期乘机]，频率低，里程短，[等级低]，折扣小，[新客户]，评价：[潜在客户]，[开发]

欢迎关注，敬请点赞！
返回顶部

文龙问路

原创文章 43 获赞 14 访问量 2868

关注私信