“上帝”的价值——客户价值分析

欢迎关注,敬请点赞!

“上帝”的价值——航空公司客户价值分析

相关附件请从码云https://gitee.com/wenlong850606/Airlines_customer_value_analysis下载。

流程:

业务系统(挖掘目标)——>数据抽取 ——>数据探索与预处理——>建模&应用——>结果&反馈

参考RFM模型(R:消费时间间隔,F:消费频率,M:消费金额),将客户关系长度L、消费时间间隔R、消费频率F、飞行里程M和折扣系数的均值C五个指标作为航空公司客户价值指标,构成LRFMC模型。

导入数据

import pandas as pd


# Load the raw member records; the path is relative to the working directory.
df = pd.read_csv('./air_data.csv')
df.info()  # print column names, non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62988 entries, 0 to 62987
Data columns (total 44 columns):
MEMBER_NO                  62988 non-null int64
FFP_DATE                   62988 non-null object
FIRST_FLIGHT_DATE          62988 non-null object
GENDER                     62985 non-null object
FFP_TIER                   62988 non-null int64
WORK_CITY                  60719 non-null object
WORK_PROVINCE              59740 non-null object
WORK_COUNTRY               62962 non-null object
AGE                        62568 non-null float64
LOAD_TIME                  62988 non-null object
FLIGHT_COUNT               62988 non-null int64
BP_SUM                     62988 non-null int64
EP_SUM_YR_1                62988 non-null int64
EP_SUM_YR_2                62988 non-null int64
SUM_YR_1                   62437 non-null float64
SUM_YR_2                   62850 non-null float64
SEG_KM_SUM                 62988 non-null int64
WEIGHTED_SEG_KM            62988 non-null float64
LAST_FLIGHT_DATE           62988 non-null object
AVG_FLIGHT_COUNT           62988 non-null float64
AVG_BP_SUM                 62988 non-null float64
BEGIN_TO_FIRST             62988 non-null int64
LAST_TO_END                62988 non-null int64
AVG_INTERVAL               62988 non-null float64
MAX_INTERVAL               62988 non-null int64
ADD_POINTS_SUM_YR_1        62988 non-null int64
ADD_POINTS_SUM_YR_2        62988 non-null int64
EXCHANGE_COUNT             62988 non-null int64
avg_discount               62988 non-null float64
P1Y_Flight_Count           62988 non-null int64
L1Y_Flight_Count           62988 non-null int64
P1Y_BP_SUM                 62988 non-null int64
L1Y_BP_SUM                 62988 non-null int64
EP_SUM                     62988 non-null int64
ADD_Point_SUM              62988 non-null int64
Eli_Add_Point_Sum          62988 non-null int64
L1Y_ELi_Add_Points         62988 non-null int64
Points_Sum                 62988 non-null int64
L1Y_Points_Sum             62988 non-null int64
Ration_L1Y_Flight_Count    62988 non-null float64
Ration_P1Y_Flight_Count    62988 non-null float64
Ration_P1Y_BPS             62988 non-null float64
Ration_L1Y_BPS             62988 non-null float64
Point_NotFlight            62988 non-null int64
dtypes: float64(12), int64(24), object(8)
memory usage: 19.2+ MB

数据探索

返回顶部

explore_file = './explore_result.csv'  # destination of the data-exploration summary
# Build the summary in one chain. Transposing describe() puts one source
# column per row, so 'count', 'max' and 'min' become selectable columns.
explore = (
    df.describe(percentiles=[], include='all')
    .T
    .assign(null=lambda stats: len(df) - stats['count'])  # total rows minus non-null count
    .loc[:, ['null', 'max', 'min']]  # keep null count, maximum and minimum only
    .rename(columns={'null': '空值记录数', 'max': '最大值', 'min': '最小值'})
    .rename_axis('属性名称')  # name the index
)
explore.to_csv(explore_file)
explore
空值记录数 最大值 最小值
属性名称
MEMBER_NO 0 62988 1
FFP_DATE 0 NaN NaN
FIRST_FLIGHT_DATE 0 NaN NaN
GENDER 3 NaN NaN
FFP_TIER 0 6 4
WORK_CITY 2269 NaN NaN
WORK_PROVINCE 3248 NaN NaN
WORK_COUNTRY 26 NaN NaN
AGE 420 110 6
LOAD_TIME 0 NaN NaN
FLIGHT_COUNT 0 213 2
BP_SUM 0 505308 0
EP_SUM_YR_1 0 0 0
EP_SUM_YR_2 0 74460 0
SUM_YR_1 551 239560 0
SUM_YR_2 138 234188 0
SEG_KM_SUM 0 580717 368
WEIGHTED_SEG_KM 0 558440 0
LAST_FLIGHT_DATE 0 NaN NaN
AVG_FLIGHT_COUNT 0 26.625 0.25
AVG_BP_SUM 0 63163.5 0
BEGIN_TO_FIRST 0 729 0
LAST_TO_END 0 731 1
AVG_INTERVAL 0 728 0
MAX_INTERVAL 0 728 0
ADD_POINTS_SUM_YR_1 0 600000 0
ADD_POINTS_SUM_YR_2 0 728282 0
EXCHANGE_COUNT 0 46 0
avg_discount 0 1.5 0
P1Y_Flight_Count 0 118 0
L1Y_Flight_Count 0 111 0
P1Y_BP_SUM 0 246197 0
L1Y_BP_SUM 0 259111 0
EP_SUM 0 74460 0
ADD_Point_SUM 0 984938 0
Eli_Add_Point_Sum 0 984938 0
L1Y_ELi_Add_Points 0 728282 0
Points_Sum 0 985572 0
L1Y_Points_Sum 0 728282 0
Ration_L1Y_Flight_Count 0 1 0
Ration_P1Y_Flight_Count 0 1 0
Ration_P1Y_BPS 0 0.999989 0
Ration_L1Y_BPS 0 0.999993 0
Point_NotFlight 0 140 0

数据清洗

返回顶部

cleaned_file = './cleaned_result.csv'
# Drop every record whose fare total is missing for either year.
df = df.dropna(subset=['SUM_YR_1', 'SUM_YR_2'])
# index1: at least one of the two yearly fare totals is non-zero.
index1 = df['SUM_YR_1'].ne(0) | df['SUM_YR_2'].ne(0)
# index2: average discount and total flown kilometres are both zero. The
# author keeps these because the member may (rarely) have bought a ticket
# without showing any activity inside the observation window.
index2 = df['SEG_KM_SUM'].eq(0) & df['avg_discount'].eq(0)
df = df.loc[index1 | index2]
df.to_csv(cleaned_file)  # export the cleaned records

数据规约

选择的相关指标: LOAD_TIME(观测窗口截止日期)、FFP_DATE(入会时间)、LAST_TO_END(最后一次乘机时间至观测窗口结束时长)、FLIGHT_COUNT(飞行次数)、SEG_KM_SUM(观测窗口飞行总公里数)、avg_discount(平均折扣率)

import numpy as np


# Output path for the standardized table that is written with to_csv further below.
scale_file = './scale_result.csv'
def scale_data(data):
    """Reduce the member records to the five LRFMC indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``LOAD_TIME``, ``FFP_DATE``, ``LAST_TO_END``,
        ``FLIGHT_COUNT``, ``SEG_KM_SUM`` and ``avg_discount``. Other columns
        are ignored and ``data`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``data`` and the columns
        ``L`` (``LOAD_TIME - FFP_DATE`` expressed in 30-day months),
        ``R`` (``LAST_TO_END``), ``F`` (``FLIGHT_COUNT``),
        ``M`` (``SEG_KM_SUM``) and ``C`` (``avg_discount``), in that order.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    load_time = pd.to_datetime(data['LOAD_TIME'])  # parse to datetime
    ffp_time = pd.to_datetime(data['FFP_DATE'])
    # Vectorized division of the whole timedelta column by a 30-day unit,
    # instead of calling a Python lambda once per row.
    membership_months = (load_time - ffp_time) / np.timedelta64(30, 'D')
    # Build the result directly so the caller gets an independent frame
    # rather than a slice of a temporary copy.
    return pd.DataFrame({
        'L': membership_months,
        'R': data['LAST_TO_END'],
        'F': data['FLIGHT_COUNT'],
        'M': data['SEG_KM_SUM'],
        'C': data['avg_discount'],
    })

# Derive the L/R/F/M/C indicator table from the cleaned records.
new_data = scale_data(df)
new_data.describe()  # inspect the value ranges to decide whether standardization is needed
L R F M C
count 62044.000000 62044.000000 62044.000000 62044.000000 62044.000000
mean 49.623036 172.532703 11.971359 17321.694749 0.722180
std 28.262697 181.526164 14.110619 21052.728111 0.184833
min 12.166667 1.000000 2.000000 368.000000 0.136017
25% 24.500000 29.000000 3.000000 4874.000000 0.613085
50% 42.600000 105.000000 7.000000 10200.000000 0.712162
75% 72.733333 260.000000 15.000000 21522.500000 0.809293
max 114.566667 731.000000 213.000000 580717.000000 1.500000

规范化

返回顶部

# Z-score standardization: subtract each column's mean, then divide by that
# column's standard deviation.
ascore_data = new_data.sub(new_data.mean(axis=0), axis='columns').div(
    new_data.std(axis=0), axis='columns'
)
ascore_data.columns = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']  # rename the standardized columns
ascore_data.to_csv(scale_file)
ascore_data
ZL ZR ZF ZM ZC
0 1.435707 -0.944948 14.034016 26.761154 1.295540
1 1.307152 -0.911894 9.073213 13.126864 2.868176
2 1.328381 -0.889859 8.718869 12.653481 2.880950
3 0.658476 -0.416098 0.781585 12.540622 1.994714
4 0.386032 -0.922912 9.923636 13.898736 1.344335
... ... ... ... ... ...
62974 2.076128 -0.460169 -0.706656 -0.805297 -0.065898
62975 0.557046 -0.283886 -0.706656 -0.805297 -0.282309
62976 -0.149421 -0.735611 -0.706656 -0.772332 -2.689885
62977 -1.206173 1.605649 -0.706656 -0.779837 -2.554628
62978 -0.479656 0.603039 -0.706656 -0.786677 -2.392319

62044 rows × 5 columns

建模

返回顶部

扫描二维码关注公众号,回复: 11157737 查看本文章
# sklearn.externals.joblib is deprecated since scikit-learn 0.21 and removed
# in 0.23; import the standalone joblib package instead.
import joblib
from sklearn.cluster import KMeans  # K-means clustering estimator

# Partition the standardized table into five clusters.
# NOTE(review): no random_state is passed, so the cluster numbering may differ
# between runs; confirm whether the written conclusions need a fixed seed.
km = KMeans(n_clusters=5)
km.fit(ascore_data)
joblib.dump(km, './km.kpl')  # persist the fitted model
c:\users\13721\appdata\local\programs\python\python37-32\lib\site-packages\sklearn\externals\joblib\__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
  warnings.warn(msg, category=FutureWarning)

['./km.kpl']
km = joblib.load('./km.kpl')  # reload the persisted model
r1 = pd.Series(km.labels_)
r1 = r1.value_counts()  # number of samples in each cluster, indexed by cluster label
r2 = pd.DataFrame(km.cluster_centers_)  # cluster centres, one row per cluster
r = pd.concat([r2, r1], axis=1)  # column-wise merge, aligned on the cluster label
r.columns = list(ascore_data.columns) + ['聚类个数']  # rename the columns
# Assign the display labels first and name the index afterwards: replacing
# the index discards any name that was set on the previous one. The labels are
# generated from the number of rows instead of being hard-coded for five.
r.index = [f'客户群体{i + 1}' for i in range(len(r))]
r.index.name = '聚类类别'  # index name

r.to_csv('./cluster_result.csv')
r
ZL ZR ZF ZM ZC 聚类个数
客户群体1 1.160667 -0.377221 -0.086919 -0.094844 -0.155905 15740
客户群体2 0.051843 -0.002668 -0.226803 -0.231254 2.191347 4184
客户群体3 0.483328 -0.799383 2.483202 2.424724 0.308630 5336
客户群体4 -0.313678 1.686258 -0.574016 -0.536820 -0.173326 12125
客户群体5 -0.700206 -0.414888 -0.161143 -0.160958 -0.255132 24659
r.describe()  # summary statistics computed over the rows of r
ZL ZR ZF ZM ZC 聚类个数
count 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000
mean 0.136391 0.018420 0.286864 0.280170 0.383123 12408.800000
std 0.720956 0.974066 1.241903 1.210720 1.034744 8350.399254
min -0.700206 -0.799383 -0.574016 -0.536820 -0.255132 4184.000000
25% -0.313678 -0.414888 -0.226803 -0.231254 -0.173326 5336.000000
50% 0.051843 -0.377221 -0.161143 -0.160958 -0.155905 12125.000000
75% 0.483328 -0.002668 -0.086919 -0.094844 0.308630 15740.000000
max 1.160667 1.686258 2.483202 2.424724 2.191347 24659.000000

返回顶部

from pyecharts import options as opts
from pyecharts.charts import Radar


def radar_base(array) -> Radar:
    """Build a radar chart with one series per row of ``array``.

    Parameters
    ----------
    array
        Table-like object exposing ``columns``, ``index`` and ``values``
        (called with the cluster-summary DataFrame in this file). The first
        five columns are plotted; row labels become the series names.

    Returns
    -------
    Radar
        The configured chart. At most five rows are drawn, one per entry of
        the fixed colour list.
    """
    # (suffix appended to the column name, axis maximum); every axis starts at -1.0.
    axis_specs = [
        ('入会时长', 1.5),
        ('最近间距', 2.0),
        ('消费频率', 2.5),
        ('飞行里程', 2.5),
        ('折扣均值', 2.5),
    ]
    series_colors = ['yellow', 'black', 'red', 'blue', 'green']

    chart = Radar().add_schema(
        schema=[
            opts.RadarIndicatorItem(name=column + suffix, max_=upper, min_=-1.0)
            for column, (suffix, upper) in zip(array.columns, axis_specs)
        ]
    )
    # zip() stops at the shortest input, so fewer than five rows no longer
    # raise IndexError, while five or more rows draw the same five series
    # as the previous hard-coded calls.
    for label, row, color in zip(array.index, array.values, series_colors):
        chart.add(label, [list(row[:5])], color=color)
    chart.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    chart.set_global_opts(title_opts=opts.TitleOpts(title="航空客户价值分析雷达图"))
    return chart
radar_base(r).render_notebook()  # draw the chart for the cluster summary r and render it in the notebook

客户价值雷达图

分析结论:

(此处折扣系数的含义:0.5即五折,0.7即七折,系数越高机票越贵)
客户群体1:[近期乘机],频率低,里程短,[等级低],折扣小,[老客户],评价:[价值不高],[维持]

客户群体2:[近期乘机不多],频率低,里程短,[等级低],折扣高,[中等客户],评价:[潜在客户],[开发]

客户群体3:[近期乘机],频率高,里程长,[等级高],折扣较高,[老客户],评价:[高价值客户],[重点维护]

客户群体4:[长时间没乘机],频率低,里程短,[等级低],折扣小,[新客户],评价:[价值不高]

客户群体5:[近期乘机],频率低,里程短,[等级低],折扣小,[新客户],评价:[潜在客户],[开发]

欢迎关注,敬请点赞!
返回顶部

原创文章 43 获赞 14 访问量 2868

猜你喜欢

转载自blog.csdn.net/weixin_45221012/article/details/103818018
今日推荐