数据分析之两种用户分群方法(RFM和聚类)

本文由于没有现成的数据,就自己生成了一些商品订单数据,基于该数据进行了RFM和聚类的构建

1.数据的生成

数据库表操作

 1 use my_work;
 2 
 3 -- 创建商品订单表
 4 CREATE table goods_orders_ful(
 5     user_id varchar(100),  -- 用户id
 6     order_id varchar(100), -- 订单id
 7     is_paid bool, -- 用户是否实际支付,1支付;0未支付
 8     amount double, -- 订单金额
 9     created_date date, -- 订单生成日期 yyyy-mm-dd
10     created_time timestamp, -- 订单生成时间 yyyy-mm-dd hh:mm:ss
11     business_type varchar(10), -- 业务类型
12     region_name varchar(10), -- 所属区域:如 东部地区
13     order_source_name varchar(10), -- 订单渠道:Web、H5、App 
14     is_done bool -- 订单是否完成
15     );
16                                             
17 -- 创建用户订单行为中间表                 
18 drop table if exists user_info_frm_01;
19 CREATE table user_info_frm_01
20 as
21 select gof.user_id,
22        sum(gof.amount) all_of_money,
23        max(gof.created_date) latest_date, 
24        count(gof.order_id) all_of_orders
25 FROM goods_orders_ful gof 
26 where gof.is_paid = 1
27 and gof.is_done = 1
28 and gof.created_date >= '2020-01-01'
29 and gof.created_date < '2020-07-01'
30 group by gof.user_id;
31 
32 SELECT count(*) from user_info_frm_01 uif;
33 SELECT * from user_info_frm_01 uif limit 10
34 
35 -- 创建行为指标均值表
36 create table if not exists user_info_frm_02
37 as
38 select avg(uif.all_of_money) all_of_money_avg,
39        avg(datediff('2020-07-22', uif.latest_date)) latest_days_avg,
40        avg(uif.all_of_orders) orders_avg
41 from user_info_frm_01 uif;
42 
43 SELECT * from user_info_frm_02;
44 -- 消费均值1107.10,最小天数均值86.9,订单数量均值2.1
45 
46 -- 将用户进行rfm一级打标
47 create table if not exists user_info_frm_03
48 as
49 SELECT uif.user_id,
50        case when uif.all_of_money >= 1107.10
51             then ''
52             else ''
53             end money,
54        case when datediff('2020-07-22', uif.latest_date) >= 86.9
55             then ''
56             else ''
57             end recency,
58        case when uif.all_of_orders >= 2.1
59             then ''
60             else ''
61             end frequency
62 from user_info_frm_01 uif;
63 
64 -- 将用户进行二级打标
65 create table if not exists user_info_frm_04
66 as
67 select uif.user_id,
68        uif.recency,
69        uif.frequency,
70        uif.money,
71        case when uif.recency = '' and uif.frequency = '' and uif.money = ''
72             then '重要价值用户'
73             when uif.recency = '' and uif.frequency = '' and uif.money = ''
74             then '重要保持用户'
75             when uif.recency = '' and uif.frequency = '' and uif.money = ''
76             then '重要发展用户'
77             when uif.recency = '' and uif.frequency = '' and uif.money = ''
78             then '重要挽留用户'
79             when uif.recency = '' and uif.frequency = '' and uif.money = ''
80             then '一般价值用户'
81             when uif.recency = '' and uif.frequency = '' and uif.money = ''
82             then '一般保持用户'
83             when uif.recency = '' and uif.frequency = '' and uif.money = ''
84             then '一般发展用户'
85             when uif.recency = '' and uif.frequency = '' and uif.money = ''
86             then '一般挽留用户'
87             else NULL 
88             end type
89             
90 from user_info_frm_03 uif;

python 程序生成数据

 1 # _*_ coding: utf-8 _*_ #
 2 # @Time     :2020/7/25 7:30 下午
 3 # @Author   :Zhx
 4 
 5 
 6 import pymysql
 7 import uuid
 8 import random
 9 import time
10 
11 
12 class CreateData(object):
13 
14     def __init__(self):
15         pass
16 
17     @staticmethod
18     def create():
19         user_id_ = random.randint(1, 5000)
20         order_id_ = uuid.uuid1()
21         is_paid_ = random.choice([1, 0, 1, 1, 1, 1, 1, 1, 1, 1])
22         amount_ = random.uniform(10, 1000)
23         a1 = (2020, 1, 1, 0, 0, 0, 0, 0, 0)
24         a2 = (2020, 6, 31, 23, 59, 59, 0, 0, 0)
25 
26         start = time.mktime(a1)  # 生成开始时间戳
27         end = time.mktime(a2)  # 生成结束时间戳
28 
29         # 随机生成10个日期字符串
30         t = random.randint(start, end)  # 在开始和结束时间戳中随机取出一个
31         date_tuple = time.localtime(t)  # 将时间戳生成时间元组
32         created_date_ = time.strftime("%Y-%m-%d", date_tuple)  # 将时间元组转成格式化字符串
33         created_time_ = time.strftime("%Y-%m-%d %H:%M:%S", date_tuple)
34         business_type_ = random.randint(0, 20)
35         region_name_ = random.choice(['', '西', '', ''])
36         order_source_name_ = random.choice(['Web', 'app', 'H5'])
37         is_done_ = is_paid_
38         return user_id_, order_id_, is_paid_, amount_, created_date_, created_time_, \
39             business_type_, region_name_, order_source_name_, is_done_
40 
41 
42 if __name__ == '__main__':
43     database = 'my_work'
44     table = 'goods_orders_ful'
45     counts = 10000
46     create_data = CreateData()
47     con = pymysql.connect(database=database, host='localhost',
48                           user='root', port=3306, password='199498zhx@')
49     cur = con.cursor()
50     for i in range(counts):
51         user_id, order_id, is_paid, amount, created_date, created_time, \
52              business_type, region_name, order_source_name, is_done = create_data.create()
53         sql = """insert into %s.%s values('%s', '%s', %d, %f, '%s', '%s', '%s', '%s', '%s', %d)""" % \
54               (database, table, user_id, order_id, is_paid, amount, created_date, created_time, business_type,
55                region_name, order_source_name, is_done)
56         try:
57             cur.execute(sql)
58             print(i, i % 1000)
59             con.commit()
60         except Exception as e:
61             print(e)
62             con.rollback()
63     con.close()
64     cur.close()

源数据字段有:

  user_id varchar(100),  -- 用户id

  order_id varchar(100), -- 订单id

  is_paid bool, -- 用户是否实际支付,1支付;0未支付

      amount double, -- 订单金额

扫描二维码关注公众号,回复: 11447944 查看本文章

  created_date date, -- 订单生成日期 yyyy-mm-dd

  created_time timestamp, -- 订单生成时间 yyyy-mm-dd hh:mm:ss

  business_type varchar(10), -- 业务类型

  region_name varchar(10), -- 所属区域:如 东部地区

  order_source_name varchar(10), -- 订单渠道:Web、H5、App 

  is_done bool -- 订单是否完成

 

RFM 模型最终表数据

最终的可视化分析使用jupyter完成

 

 

 

 

 

 

 

 

 

 

猜你喜欢

转载自www.cnblogs.com/tttzqf/p/13382546.html