准备工作
import os
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import matplotlib. dates as mdates
import seaborn as sns
from datetime import date
% matplotlib inline
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
pd. set_option( 'expand_frame_repr' , False )
pd. set_option( 'display.max_rows' , 200 )
pd. set_option( 'display.max_columns' , 200 )
from jupyterthemes import jtplot
jtplot. style( theme= 'onedork' )
from IPython. core. interactiveshell import InteractiveShell
InteractiveShell. ast_node_interactivity = "all"
# Load the three competition tables, replacing the header row with
# snake_case column names (header=0 + names= is equivalent to reading
# then overwriting .columns).
off_train = pd.read_csv('ccf_offline_stage1_train.csv', header=0,
                        names=['user_id', 'merchant_id', 'coupon_id',
                               'discount_rate', 'distance',
                               'date_received', 'date'])
off_test = pd.read_csv('ccf_offline_stage1_test_revised.csv', header=0,
                       names=['user_id', 'merchant_id', 'coupon_id',
                              'discount_rate', 'distance', 'date_received'])
on_train = pd.read_csv('ccf_online_stage1_train.csv', header=0,
                       names=['user_id', 'merchant_id', 'action',
                              'coupon_id', 'discount_rate',
                              'date_received', 'date'])
term
来源
内容
dataset3
table3,off_test
off_test数据
dataset2
table2,off_train
领券日期在20160515-20160615之间的
dataset1
table2,off_train
领券日期在20160414-20160514的
feature3
table2,off_train
消费日期在20160315-20160630的,或领券日期在20160315-20160630但没有消费的
feature2
table2,off_train
消费日期在20160201-20160514的,或领券日期在20160201-20160514但没有消费的
feature1
table2,off_train
消费日期在20160101-20160413的,或领券日期在20160101-20160413但没有消费的
项目
预测区间(提取label)
特征区间(提取feature)
-
领券了的
消费了的+领券了没消费的
测试集
dataset3
feature3
训练集1
dataset2
feature2
训练集2
dataset1
feature1
这里找NaN空值用isnull(),不能用=='null'或者==np.nan之类的。读取数据集时没有把两个时间列的所有元素转换为datetime格式,所以不等号右边直接用20160315这样的整数即可;如果转换了,不等号右边就得用pd.to_datetime转换后的值。
# Split into label windows (dataset*) and feature windows (feature*).
# A feature-window row is either a consumption inside the window, or a
# coupon received inside the window that was never used (date is NaN).
# Series.between(lo, hi) is inclusive on both ends, matching >= / <=.
dataset3 = off_test
feature3 = off_train[off_train.date.between(20160315, 20160630) |
                     (off_train.date.isnull() &
                      off_train.date_received.between(20160315, 20160630))]

dataset2 = off_train[off_train.date_received.between(20160515, 20160615)]
feature2 = off_train[off_train.date.between(20160201, 20160514) |
                     (off_train.date.isnull() &
                      off_train.date_received.between(20160201, 20160514))]

dataset1 = off_train[off_train.date_received.between(20160414, 20160514)]
feature1 = off_train[off_train.date.between(20160101, 20160413) |
                     (off_train.date.isnull() &
                      off_train.date_received.between(20160101, 20160413))]
其他特征
column
含义
this_month_user_receive_all_coupon_count
用户领取优惠券的总次数
this_month_user_receive_same_coupon_count
用户领取该行中的优惠券的次数
this_month_user_receive_same_coupon_firstone
用户领取某张优惠券(领取次数不止1次)的首次领取时间
this_month_user_receive_same_coupon_lastone
用户领取某张优惠券(领取次数不止1次)的末次领取时间
this_day_user_receive_all_coupon_count
用户领券当天领券总数量
this_day_user_receive_same_coupon_count
用户领券当天领取该优惠券的数量
dataset3
# Per-user total number of coupons received in the dataset3 window.
t = (dataset3[['user_id']]
     .assign(this_month_user_receive_all_coupon_count=1)
     .groupby('user_id', as_index=False)
     .count())
t.head(2)
user_id
this_month_user_receive_all_coupon_count
0
209
2
1
215
1
# Per-(user, coupon) number of times that exact coupon was received.
t1 = (dataset3[['user_id', 'coupon_id']]
      .assign(this_month_user_receive_same_coupon_count=1)
      .groupby(['user_id', 'coupon_id'], as_index=False)
      .count())
t1.head(2)
user_id
coupon_id
this_month_user_receive_same_coupon_count
0
209
825
1
1
209
7557
1
# For (user, coupon) pairs received more than once: first and last
# receive dates. Dates are joined into a ':'-separated string first.
t2 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t2.date_received = t2.date_received.astype('str')
t2 = t2.groupby(['user_id', 'coupon_id'], as_index=False)['date_received'] \
       .agg(lambda dates: ':'.join(dates))
t2.iloc[5:7]
# number of ':' separators + 1 == number of joined dates
t2['receive_number'] = t2.date_received.apply(lambda s: s.count(':') + 1)
t2 = t2[t2.receive_number > 1]
t2['max_date_received'] = t2.date_received.apply(
    lambda s: max(int(d) for d in s.split(':')))
t2['min_date_received'] = t2.date_received.apply(
    lambda s: min(int(d) for d in s.split(':')))
t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]
t2.head(2)
user_id
coupon_id
date_received
5
432
10438
20160706
6
448
10927
20160710:20160707:20160706:20160704
user_id
coupon_id
max_date_received
min_date_received
6
448
10927
20160710
20160704
13
736
3686
20160724
20160713
# Day-number offsets of each receive date from the coupon's last/first
# receive date (NaN where the coupon was received only once).
t3 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t3 = pd.merge(t3, t2, how='left', on=['user_id', 'coupon_id'])
t3['this_month_user_receive_same_coupon_lastone'] = (
    t3.max_date_received - t3.date_received)
t3['this_month_user_receive_same_coupon_firstone'] = (
    t3.date_received - t3.min_date_received)
def is_firstlastone(x):
    """Encode a day gap: 1 if this receive is the boundary one (gap == 0),
    0 if strictly inside the span (gap > 0), otherwise -1 (negative gap,
    or NaN for coupons received only once)."""
    if x > 0:
        return 0
    return 1 if x == 0 else -1
# Map the raw day gaps to the {-1, 0, 1} encoding and keep only the
# key columns plus the two flags.
t3.this_month_user_receive_same_coupon_lastone = \
    t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = \
    t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
t3 = t3[['user_id', 'coupon_id', 'date_received',
         'this_month_user_receive_same_coupon_lastone',
         'this_month_user_receive_same_coupon_firstone']]
t3.head(2)
user_id
coupon_id
date_received
this_month_user_receive_same_coupon_lastone
this_month_user_receive_same_coupon_firstone
0
4129537
9983
20160712
-1
-1
1
6949378
3429
20160706
-1
-1
# Coupons received per user per day.
t4 = (dataset3[['user_id', 'date_received']]
      .assign(this_day_user_receive_all_coupon_count=1)
      .groupby(['user_id', 'date_received'], as_index=False)
      .count())
t4.head(2)
user_id
date_received
this_day_user_receive_all_coupon_count
0
209
20160721
2
1
215
20160703
1
# Copies of one specific coupon received per user per day.
t5 = (dataset3[['user_id', 'coupon_id', 'date_received']]
      .assign(this_day_user_receive_same_coupon_count=1)
      .groupby(['user_id', 'coupon_id', 'date_received'], as_index=False)
      .count())
t5.head(2)
user_id
coupon_id
date_received
this_day_user_receive_same_coupon_count
0
209
825
20160721
1
1
209
7557
20160721
1
'''t6 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t6.date_received = t6.date_received.astype('str')
t6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[
'date_received'].agg(lambda x: ':'.join(x))
t6.rename(columns={'date_received': 'dates'}, inplace=True)
t6.head(2)'''
"t6 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()\nt6.date_received = t6.date_received.astype('str')\nt6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[\n 'date_received'].agg(lambda x: ':'.join(x))\nt6.rename(columns={'date_received': 'dates'}, inplace=True)\nt6.head(2)"
'''# 前、后一张券的时间间隔
def get_day_gap_before(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(
date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
def get_day_gap_after(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(
int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
t7 = dataset3[['user_id', 'coupon_id', 'date_received']]
t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id', 'coupon_id', 'date_received',
'day_gap_before', 'day_gap_after']]
t7.head()'''
"# 前、后一张券的时间间隔\ndef get_day_gap_before(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(\n date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\ndef get_day_gap_after(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(\n int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\nt7 = dataset3[['user_id', 'coupon_id', 'date_received']]\nt7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')\nt7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates\nt7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)\nt7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)\nt7 = t7[['user_id', 'coupon_id', 'date_received',\n 'day_gap_before', 'day_gap_after']]\nt7.head()"
# Assemble all dataset3 "other" features (DataFrame.merge defaults to
# how='inner', same as pd.merge) and persist them.
other_feature3 = (t1.merge(t, on='user_id')
                    .merge(t3, on=['user_id', 'coupon_id'])
                    .merge(t4, on=['user_id', 'date_received'])
                    .merge(t5, on=['user_id', 'coupon_id', 'date_received']))
other_feature3.to_csv('other_feature3.csv', index=None)
other_feature3.shape
(113640, 9)
dataset2
# Per-user total number of coupons received in the dataset2 window.
t = (dataset2[['user_id']]
     .assign(this_month_user_receive_all_coupon_count=1)
     .groupby('user_id', as_index=False)
     .count())
t.head(2)
user_id
this_month_user_receive_all_coupon_count
0
4
1
1
165
1
# Per-(user, coupon) number of times that exact coupon was received.
t1 = (dataset2[['user_id', 'coupon_id']]
      .assign(this_month_user_receive_same_coupon_count=1)
      .groupby(['user_id', 'coupon_id'], as_index=False)
      .count())
t1.head(2)
user_id
coupon_id
this_month_user_receive_same_coupon_count
0
4
2902.0
1
1
165
7571.0
1
# First/last receive dates for (user, coupon) pairs received more than
# once. date_received is float here, so cast int before str.
t2 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t2.date_received = t2.date_received.astype('int').astype('str')
t2 = t2.groupby(['user_id', 'coupon_id'], as_index=False)['date_received'] \
       .agg(lambda dates: ':'.join(dates))
t2['receive_number'] = t2.date_received.apply(lambda s: s.count(':') + 1)
t2 = t2[t2.receive_number > 1]
t2['max_date_received'] = t2.date_received.apply(
    lambda s: max(int(d) for d in s.split(':')))
t2['min_date_received'] = t2.date_received.apply(
    lambda s: min(int(d) for d in s.split(':')))
t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]
t2.head(2)
user_id
coupon_id
max_date_received
min_date_received
18
696
7571.0
20160524
20160520
31
947
12414.0
20160610
20160609
# Day-number offsets from last/first receive date for each row.
t3 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t3 = pd.merge(t3, t2, on=['user_id', 'coupon_id'], how='left')
received_int = t3.date_received.astype('int')
t3['this_month_user_receive_same_coupon_lastone'] = (
    t3.max_date_received - received_int)
t3['this_month_user_receive_same_coupon_firstone'] = (
    received_int - t3.min_date_received)
# NOTE(review): identical redefinition of is_firstlastone from the
# dataset3 section (kept so each notebook section runs stand-alone).
def is_firstlastone(x):
    """1 when gap == 0 (boundary receive), 0 when gap > 0, else -1."""
    if x > 0:
        return 0
    return 1 if x == 0 else -1
# Encode the gaps as {-1, 0, 1} and keep key columns plus the flags.
t3.this_month_user_receive_same_coupon_lastone = \
    t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = \
    t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
t3 = t3[['user_id', 'coupon_id', 'date_received',
         'this_month_user_receive_same_coupon_lastone',
         'this_month_user_receive_same_coupon_firstone']]
t3.head(2)
user_id
coupon_id
date_received
this_month_user_receive_same_coupon_lastone
this_month_user_receive_same_coupon_firstone
0
1439408
11002.0
20160528.0
-1
-1
1
1439408
8591.0
20160613.0
1
0
# Coupons received per user per day.
t4 = (dataset2[['user_id', 'date_received']]
      .assign(this_day_user_receive_all_coupon_count=1)
      .groupby(['user_id', 'date_received'], as_index=False)
      .count())
t4.head(2)
user_id
date_received
this_day_user_receive_all_coupon_count
0
4
20160607.0
1
1
165
20160525.0
1
# Copies of one specific coupon received per user per day.
t5 = (dataset2[['user_id', 'coupon_id', 'date_received']]
      .assign(this_day_user_receive_same_coupon_count=1)
      .groupby(['user_id', 'coupon_id', 'date_received'], as_index=False)
      .count())
t5.head(2)
user_id
coupon_id
date_received
this_day_user_receive_same_coupon_count
0
4
2902.0
20160607.0
1
1
165
7571.0
20160525.0
1
'''t6 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t6.date_received = t6.date_received.astype('int').astype('str')
t6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[
'date_received'].agg(lambda x: ':'.join(x))
t6.rename(columns={'date_received': 'dates'}, inplace=True)
t6.head(2)'''
"t6 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()\nt6.date_received = t6.date_received.astype('int').astype('str')\nt6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[\n 'date_received'].agg(lambda x: ':'.join(x))\nt6.rename(columns={'date_received': 'dates'}, inplace=True)\nt6.head(2)"
'''def get_day_gap_before(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(
date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
def get_day_gap_after(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(
int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
t7 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()
t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
t7['date_received_date'] = t7.date_received.astype(
'int').astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id', 'coupon_id', 'date_received',
'day_gap_before', 'day_gap_after']]
t7.head(2)'''
"def get_day_gap_before(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(\n date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\ndef get_day_gap_after(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(\n int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\nt7 = dataset2[['user_id', 'coupon_id', 'date_received']].copy()\nt7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')\nt7['date_received_date'] = t7.date_received.astype(\n 'int').astype('str') + '-' + t7.dates\nt7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)\nt7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)\nt7 = t7[['user_id', 'coupon_id', 'date_received',\n 'day_gap_before', 'day_gap_after']]\nt7.head(2)"
# Assemble all dataset2 "other" features and persist them.
other_feature2 = (t1.merge(t, on='user_id')
                    .merge(t3, on=['user_id', 'coupon_id'])
                    .merge(t4, on=['user_id', 'date_received'])
                    .merge(t5, on=['user_id', 'coupon_id', 'date_received']))
other_feature2.to_csv('other_feature2.csv', index=None)
other_feature2.head()
user_id
coupon_id
this_month_user_receive_same_coupon_count
this_month_user_receive_all_coupon_count
date_received
this_month_user_receive_same_coupon_lastone
this_month_user_receive_same_coupon_firstone
this_day_user_receive_all_coupon_count
this_day_user_receive_same_coupon_count
0
4
2902.0
1
1
20160607.0
-1
-1
1
1
1
165
7571.0
1
1
20160525.0
-1
-1
1
1
2
166
9261.0
1
1
20160525.0
-1
-1
1
1
3
215
8944.0
1
1
20160524.0
-1
-1
1
1
4
236
11002.0
1
1
20160528.0
-1
-1
1
1
other_feature2. shape
(258446, 9)
dataset1. head( )
user_id
merchant_id
coupon_id
discount_rate
distance
date_received
date
7
1832624
3381
7610.0
200:20
0.0
20160429.0
NaN
18
163606
1569
5054.0
200:30
10.0
20160421.0
NaN
43
4061024
3381
7610.0
200:20
10.0
20160426.0
NaN
52
106443
450
3732.0
30:5
NaN
20160429.0
NaN
96
114747
1569
5054.0
200:30
9.0
20160426.0
NaN
# Per-user total number of coupons received in the dataset1 window.
t = (dataset1[['user_id']]
     .assign(this_month_user_receive_all_coupon_count=1)
     .groupby('user_id', as_index=False)
     .count())
t.head(2)
user_id
this_month_user_receive_all_coupon_count
0
173
1
1
285
1
# Per-(user, coupon) number of times that exact coupon was received.
t1 = (dataset1[['user_id', 'coupon_id']]
      .assign(this_month_user_receive_same_coupon_count=1)
      .groupby(['user_id', 'coupon_id'], as_index=False)
      .count())
t1.head(2)
user_id
coupon_id
this_month_user_receive_same_coupon_count
0
173
7610.0
1
1
285
1532.0
1
# First/last receive dates for (user, coupon) pairs received more than
# once. date_received is float here, so cast int before str.
t2 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t2.date_received = t2.date_received.astype('int').astype('str')
t2 = t2.groupby(['user_id', 'coupon_id'], as_index=False)['date_received'] \
       .agg(lambda dates: ':'.join(dates))
t2['receive_number'] = t2.date_received.apply(lambda s: s.count(':') + 1)
t2 = t2[t2.receive_number > 1]
t2['max_date_received'] = t2.date_received.apply(
    lambda s: max(int(d) for d in s.split(':')))
t2['min_date_received'] = t2.date_received.apply(
    lambda s: min(int(d) for d in s.split(':')))
t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]
t2.head(2)
user_id
coupon_id
max_date_received
min_date_received
38
2227
7088.0
20160511
20160426
147
8397
8556.0
20160506
20160425
# Day-number offsets from last/first receive date for each row.
t3 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t3 = pd.merge(t3, t2, on=['user_id', 'coupon_id'], how='left')
received_int = t3.date_received.astype('int')
t3['this_month_user_receive_same_coupon_lastone'] = (
    t3.max_date_received - received_int)
t3['this_month_user_receive_same_coupon_firstone'] = (
    received_int - t3.min_date_received)
# NOTE(review): identical redefinition of is_firstlastone from the
# earlier sections (kept so each notebook section runs stand-alone).
def is_firstlastone(x):
    """1 when gap == 0 (boundary receive), 0 when gap > 0, else -1."""
    if x > 0:
        return 0
    return 1 if x == 0 else -1
# Encode the gaps as {-1, 0, 1} and keep key columns plus the flags.
t3.this_month_user_receive_same_coupon_lastone = \
    t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = \
    t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
t3 = t3[['user_id', 'coupon_id', 'date_received',
         'this_month_user_receive_same_coupon_lastone',
         'this_month_user_receive_same_coupon_firstone']]
t3.head(2)
user_id
coupon_id
date_received
this_month_user_receive_same_coupon_lastone
this_month_user_receive_same_coupon_firstone
0
1832624
7610.0
20160429.0
-1
-1
1
163606
5054.0
20160421.0
-1
-1
# Coupons received per user per day.
t4 = (dataset1[['user_id', 'date_received']]
      .assign(this_day_user_receive_all_coupon_count=1)
      .groupby(['user_id', 'date_received'], as_index=False)
      .count())
t4.head(2)
user_id
date_received
this_day_user_receive_all_coupon_count
0
173
20160414.0
1
1
285
20160501.0
1
# Copies of one specific coupon received per user per day.
t5 = (dataset1[['user_id', 'coupon_id', 'date_received']]
      .assign(this_day_user_receive_same_coupon_count=1)
      .groupby(['user_id', 'coupon_id', 'date_received'], as_index=False)
      .count())
t5.head(2)
user_id
coupon_id
date_received
this_day_user_receive_same_coupon_count
0
173
7610.0
20160414.0
1
1
285
1532.0
20160501.0
1
'''t6 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t6.date_received = t6.date_received.astype('int').astype('str')
t6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[
'date_received'].agg(lambda x: ':'.join(x))
t6.rename(columns={'date_received': 'dates'}, inplace=True)
t6.head(2)'''
"t6 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()\nt6.date_received = t6.date_received.astype('int').astype('str')\nt6 = t6.groupby(['user_id', 'coupon_id'], as_index=False)[\n 'date_received'].agg(lambda x: ':'.join(x))\nt6.rename(columns={'date_received': 'dates'}, inplace=True)\nt6.head(2)"
'''def get_day_gap_before(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(
date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
def get_day_gap_after(s):
date_received, dates = s.split('-')
dates = dates.split(':')
gaps = []
for d in dates:
this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(
int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
if this_gap > 0:
gaps.append(this_gap)
if len(gaps) == 0:
return -1
else:
return min(gaps)
t7 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()
t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
t7['date_received_date'] = t7.date_received.astype(
'int').astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id', 'coupon_id', 'date_received',
'day_gap_before', 'day_gap_after']]
t7.head(2)'''
"def get_day_gap_before(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(date_received[0:4]), int(date_received[4:6]), int(\n date_received[6:8]))-date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\ndef get_day_gap_after(s):\n date_received, dates = s.split('-')\n dates = dates.split(':')\n gaps = []\n for d in dates:\n this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))-date(\n int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days\n if this_gap > 0:\n gaps.append(this_gap)\n if len(gaps) == 0:\n return -1\n else:\n return min(gaps)\n\n\nt7 = dataset1[['user_id', 'coupon_id', 'date_received']].copy()\nt7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')\nt7['date_received_date'] = t7.date_received.astype(\n 'int').astype('str') + '-' + t7.dates\nt7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)\nt7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)\nt7 = t7[['user_id', 'coupon_id', 'date_received',\n 'day_gap_before', 'day_gap_after']]\nt7.head(2)"
# Assemble all dataset1 "other" features and persist them.
other_feature1 = (t1.merge(t, on='user_id')
                    .merge(t3, on=['user_id', 'coupon_id'])
                    .merge(t4, on=['user_id', 'date_received'])
                    .merge(t5, on=['user_id', 'coupon_id', 'date_received']))
other_feature1.to_csv('other_feature1.csv', index=None)
other_feature1.head()
user_id
coupon_id
this_month_user_receive_same_coupon_count
this_month_user_receive_all_coupon_count
date_received
this_month_user_receive_same_coupon_lastone
this_month_user_receive_same_coupon_firstone
this_day_user_receive_all_coupon_count
this_day_user_receive_same_coupon_count
0
173
7610.0
1
1
20160414.0
-1
-1
1
1
1
285
1532.0
1
1
20160501.0
-1
-1
1
1
2
316
8952.0
1
1
20160430.0
-1
-1
1
1
3
377
2857.0
1
1
20160512.0
-1
-1
1
1
4
387
7610.0
1
1
20160421.0
-1
-1
1
1
other_feature1. shape
(137167, 9)
coupon相关特征
column
含义
discount_rate
优惠券打折率,"满x减y"(如200:20)或0.9这样的直接折扣,统一换算为0.9这样的折扣率
distance
到店距离
day_of_week
周几
day_of_month
几号
days_distance
领券时间距6月30号距离(dataset2和dataset1作相应变化)
this_day_user_receive_same_coupon_count
用户领券当天领取该优惠券的数量
discount_man
“满减”折扣中满多少
discount_jian
“满减”折扣中减多少
is_man_jian
优惠券是不是满减形式
coupon_count
优惠券出现次数
def calc_discount_rate(s):
    """Normalize a discount to a rate: '200:20' -> 1 - 20/200 = 0.9,
    a plain rate like '0.9' is returned as float unchanged."""
    parts = str(s).split(':')
    if len(parts) == 1:
        return float(parts[0])
    return 1.0 - float(parts[1]) / float(parts[0])
def get_discount_man(s):
    """'man' threshold of a 'man:jian' coupon ('200:20' -> 200);
    0 for plain-rate coupons."""
    parts = str(s).split(':')
    return 0 if len(parts) == 1 else int(parts[0])
def get_discount_jian(s):
    """'jian' amount of a 'man:jian' coupon ('200:20' -> 20);
    0 for plain-rate coupons."""
    parts = str(s).split(':')
    return 0 if len(parts) == 1 else int(parts[1])
def is_man_jian(s):
    """1 if the discount is of the 'man:jian' form, else 0."""
    return 1 if ':' in str(s) else 0
dataset3
# dataset3 coupon features: calendar fields from the receive date plus
# the discount breakdown. The string conversion is hoisted once.
received_str = dataset3.date_received.astype('int').astype('str')
# isoweekday() == weekday() + 1 (Monday = 1 ... Sunday = 7)
dataset3['day_of_week'] = received_str.apply(
    lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).isoweekday())
dataset3['day_of_month'] = received_str.apply(lambda x: int(x[6:8]))
dataset3['days_distance'] = received_str.apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))
               - date(2016, 6, 30)).days)
dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man)
dataset3['discount_jian'] = dataset3.discount_rate.apply(get_discount_jian)
dataset3['is_man_jian'] = dataset3.discount_rate.apply(is_man_jian)
dataset3['discount_rate'] = dataset3.discount_rate.apply(calc_discount_rate)
dataset3.head()
user_id
merchant_id
coupon_id
discount_rate
distance
date_received
day_of_week
day_of_month
days_distance
discount_man
discount_jian
is_man_jian
0
4129537
450
9983
0.833333
1.0
20160712
2
12
12
30
5
1
1
6949378
1300
3429
0.833333
NaN
20160706
3
6
6
30
5
1
2
2166529
7113
6928
0.900000
5.0
20160727
3
27
27
200
20
1
3
2166529
7113
1808
0.900000
5.0
20160727
3
27
27
100
10
1
4
6172162
7605
6500
0.966667
2.0
20160708
5
8
8
30
1
1
# Occurrence count of each coupon in dataset3.
d = (dataset3[['coupon_id']]
     .assign(coupon_count=1)
     .groupby('coupon_id', as_index=False)
     .count())
d.head(2)
coupon_id
coupon_count
0
3
16
1
7
1
# Attach the coupon counts and persist the dataset3 coupon features.
dataset3 = dataset3.merge(d, on='coupon_id', how='left')
dataset3.to_csv('coupon3_feature.csv', index=None)
dataset3.head()
user_id
merchant_id
coupon_id
discount_rate
distance
date_received
day_of_week
day_of_month
days_distance
discount_man
discount_jian
is_man_jian
coupon_count
0
4129537
450
9983
0.833333
1.0
20160712
2
12
12
30
5
1
11586
1
6949378
1300
3429
0.833333
NaN
20160706
3
6
6
30
5
1
4887
2
2166529
7113
6928
0.900000
5.0
20160727
3
27
27
200
20
1
205
3
2166529
7113
1808
0.900000
5.0
20160727
3
27
27
100
10
1
205
4
6172162
7605
6500
0.966667
2.0
20160708
5
8
8
30
1
1
132
dataset2. head( )
user_id
merchant_id
coupon_id
discount_rate
distance
date_received
date
1
1439408
4663
11002.0
150:20
1.0
20160528.0
NaN
4
1439408
2632
8591.0
20:1
0.0
20160613.0
NaN
6
1439408
2632
8591.0
20:1
0.0
20160516.0
20160613.0
9
2029232
450
1532.0
30:5
0.0
20160530.0
NaN
10
2029232
6459
12737.0
20:1
0.0
20160519.0
NaN
dataset2
# dataset2 coupon features: calendar fields from the receive date plus
# the discount breakdown.
#
# Fix: dataset2 is a boolean-indexed slice of off_train, so assigning new
# columns on it raised SettingWithCopyWarning (visible in the original
# output) and is not guaranteed to write through. Take an explicit copy
# first; downstream code keeps using the `dataset2` name unchanged.
dataset2 = dataset2.copy()
# date_received is float here ('20160528.0'); the slice positions for
# year/month/day are unaffected by the trailing '.0'.
received_str = dataset2.date_received.astype('str')
dataset2['day_of_week'] = received_str.apply(
    lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday() + 1)
dataset2['day_of_month'] = received_str.apply(lambda x: int(x[6:8]))
dataset2['days_distance'] = received_str.apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))
               - date(2016, 5, 14)).days)
dataset2['discount_man'] = dataset2.discount_rate.apply(get_discount_man)
dataset2['discount_jian'] = dataset2.discount_rate.apply(get_discount_jian)
dataset2['is_man_jian'] = dataset2.discount_rate.apply(is_man_jian)
dataset2['discount_rate'] = dataset2.discount_rate.apply(calc_discount_rate)
dataset2.head()
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import sys
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
if __name__ == '__main__':
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Remove the CWD from sys.path while we load stuff.
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:11: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# This is added back by InteractiveShellApp.init_path()
user_id
merchant_id
coupon_id
discount_rate
distance
date_received
date
day_of_week
day_of_month
days_distance
discount_man
discount_jian
is_man_jian
1
1439408
4663
11002.0
0.866667
1.0
20160528.0
NaN
6
28
14
150
20
1
4
1439408
2632
8591.0
0.950000
0.0
20160613.0
NaN
1
13
30
20
1
1
6
1439408
2632
8591.0
0.950000
0.0
20160516.0
20160613.0
1
16
2
20
1
1
9
2029232
450
1532.0
0.833333
0.0
20160530.0
NaN
1
30
16
30
5
1
10
2029232
6459
12737.0
0.950000
0.0
20160519.0
NaN
4
19
5
20
1
1
# Occurrence count of each coupon in dataset2.
d = (dataset2[['coupon_id']]
     .assign(coupon_count=1)
     .groupby('coupon_id', as_index=False)
     .count())
d.head(2)
coupon_id
coupon_count
0
1.0
4
1
3.0
12
# Attach the coupon counts and persist the dataset2 coupon features.
dataset2 = dataset2.merge(d, on='coupon_id', how='left')
dataset2.to_csv('coupon2_feature.csv', index=None)
dataset2.head()
user_id
merchant_id
coupon_id
discount_rate
distance
date_received
date
day_of_week
day_of_month
days_distance
discount_man
discount_jian
is_man_jian
coupon_count
0
1439408
4663
11002.0
0.866667
1.0
20160528.0
NaN
6
28
14
150
20
1
7730
1
1439408
2632
8591.0
0.950000
0.0
20160613.0
NaN
1
13
30
20
1
1
5
2
1439408
2632
8591.0
0.950000
0.0
20160516.0
20160613.0
1
16
2
20
1
1
5
3
2029232
450
1532.0
0.833333
0.0
20160530.0
NaN
1
30
16
30
5
1
11728
4
2029232
6459
12737.0
0.950000
0.0
20160519.0
NaN
4
19
5
20
1
1
16
dataset1
# dataset1 coupon features: calendar fields from the receive date plus
# the discount breakdown.
#
# Fix: dataset1 is a boolean-indexed slice of off_train, so assigning new
# columns on it raised SettingWithCopyWarning (visible in the original
# output) and is not guaranteed to write through. Take an explicit copy
# first; downstream code keeps using the `dataset1` name unchanged.
dataset1 = dataset1.copy()
# date_received is float here ('20160429.0'); slice positions for
# year/month/day are unaffected by the trailing '.0'.
received_str = dataset1.date_received.astype('str')
dataset1['day_of_week'] = received_str.apply(
    lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday() + 1)
dataset1['day_of_month'] = received_str.apply(lambda x: int(x[6:8]))
dataset1['days_distance'] = received_str.apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))
               - date(2016, 4, 13)).days)
dataset1['discount_man'] = dataset1.discount_rate.apply(get_discount_man)
dataset1['discount_jian'] = dataset1.discount_rate.apply(get_discount_jian)
dataset1['is_man_jian'] = dataset1.discount_rate.apply(is_man_jian)
dataset1['discount_rate'] = dataset1.discount_rate.apply(calc_discount_rate)
dataset1.head()
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import sys
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
if __name__ == '__main__':
D:\anaconda3\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Remove the CWD from sys.path while we load stuff.
user_id
merchant_id
coupon_id
discount_rate
distance
date_received
date
day_of_week
day_of_month
days_distance
discount_man
discount_jian
is_man_jian
7
1832624
3381
7610.0
0.900000
0.0
20160429.0
NaN
5
29
16
200
20
1
18
163606
1569
5054.0
0.850000
10.0
20160421.0
NaN
4
21
8
200
30
1
43
4061024
3381
7610.0
0.900000
10.0
20160426.0
NaN
2
26
13
200
20
1
52
106443
450
3732.0
0.833333
NaN
20160429.0
NaN
5
29
16
30
5
1
96
114747
1569
5054.0
0.850000
9.0
20160426.0
NaN
2
26
13
200
30
1
# Popularity of each coupon: how many times it was handed out in dataset1.
d = dataset1.groupby('coupon_id').size().reset_index(name='coupon_count')
d.head(2)
coupon_id
coupon_count
0
1.0
1
1
2.0
2
# Attach the per-coupon popularity count to every row, then persist the
# coupon-level feature table for the downstream model-building step.
dataset1 = pd. merge( dataset1, d, on= 'coupon_id' , how= 'left' )
dataset1. to_csv( 'coupon1_feature.csv' , index= None )
dataset1. head( 2 )
user_id
merchant_id
coupon_id
discount_rate
distance
date_received
date
day_of_week
day_of_month
days_distance
discount_man
discount_jian
is_man_jian
coupon_count
0
1832624
3381
7610.0
0.90
0.0
20160429.0
NaN
5
29
16
200
20
1
22281
1
163606
1569
5054.0
0.85
10.0
20160421.0
NaN
4
21
8
200
30
1
11893
Merchant相关特征
column
含义
total_sales
商家被消费次数
sales_use_coupon
商家被领券后消费的次数
total_coupon
商家被领券次数
coupon_rate
sales_use_coupon/total_sales
transfer_rate
sales_use_coupon/total_coupon
merchant_min_distance
最小距离
merchant_max_distance
最大距离
merchant_min_distance
最小距离
merchant_max_distance
最大距离
dataset3
# Merchant-level working slice of the dataset3 feature window, plus the
# list of distinct merchants to anchor all later left-joins.
merchant3 = feature3[['merchant_id', 'coupon_id',
                      'distance', 'date_received', 'date']].copy()
t = merchant3[['merchant_id']].drop_duplicates()
t.head(2)
merchant_id
1
4663
3
2632
# total_sales: number of consumption records per merchant.
t1 = (merchant3.loc[merchant3.date.notnull(), ['merchant_id']]
      .groupby('merchant_id').size().reset_index(name='total_sales'))
t1.head(2)
merchant_id
total_sales
0
1
10
1
2
4
# sales_use_coupon: purchases where a coupon had been received.
t2 = (merchant3.loc[merchant3.date.notnull() & merchant3.coupon_id.notnull(),
                    ['merchant_id']]
      .groupby('merchant_id').size().reset_index(name='sales_use_coupon'))
t2.head(2)
merchant_id
sales_use_coupon
0
3
1
1
4
5
# total_coupon: coupons issued per merchant.
t3 = (merchant3.loc[merchant3.coupon_id.notnull(), ['merchant_id']]
      .groupby('merchant_id').size().reset_index(name='total_coupon'))
t3.head(2)
merchant_id
total_coupon
0
2
7
1
3
10
# Distances of coupon-backed purchases, input to the per-merchant stats.
t4 = merchant3.loc[merchant3.date.notnull() & merchant3.coupon_id.notnull(),
                   ['merchant_id', 'distance']].copy()
t4.head(2)
merchant_id
distance
6
2632
0.0
33
1361
0.0
# Per-merchant distance statistics (NaN distances are skipped by the aggs).
by_merchant = t4.groupby('merchant_id')['distance']
t5 = by_merchant.min().reset_index(name='merchant_min_distance')
t6 = by_merchant.max().reset_index(name='merchant_max_distance')
t7 = by_merchant.mean().reset_index(name='merchant_mean_distance')
t8 = by_merchant.median().reset_index(name='merchant_median_distance')
t8.head(2)
merchant_id
merchant_median_distance
0
3
0.0
1
4
0.0
# Assemble the merchant feature table for dataset3 and persist it.
merchant3_feature = t
for piece in (t1, t2, t3, t5, t6, t7, t8):
    merchant3_feature = pd.merge(merchant3_feature, piece,
                                 on='merchant_id', how='left')
# A merchant with coupons but no coupon-backed sale gets 0, not NaN.
merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.fillna(0)
# Rates are computed BEFORE the denominators' NaNs are zeroed, so a merchant
# with no coupons / no sales keeps a NaN rate instead of dividing by zero.
merchant3_feature['merchant_coupon_transfer_rate'] = (
    merchant3_feature.sales_use_coupon / merchant3_feature.total_coupon)
merchant3_feature['coupon_rate'] = (
    merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_sales)
merchant3_feature.total_sales = merchant3_feature.total_sales.fillna(0)
merchant3_feature.total_coupon = merchant3_feature.total_coupon.fillna(0)
merchant3_feature.to_csv('merchant3_feature.csv', index=None)
merchant3_feature.head()
merchant_id
total_sales
sales_use_coupon
total_coupon
merchant_min_distance
merchant_max_distance
merchant_mean_distance
merchant_median_distance
merchant_coupon_transfer_rate
coupon_rate
0
4663
393.0
39.0
11445.0
0.0
10.0
1.911765
1.0
0.003408
0.099237
1
2632
8.0
3.0
23.0
0.0
1.0
0.666667
1.0
0.130435
0.375000
2
3381
9579.0
416.0
46729.0
0.0
10.0
1.713580
1.0
0.008902
0.043428
3
450
7757.0
1239.0
35933.0
0.0
10.0
0.894068
0.0
0.034481
0.159727
4
6459
20.0
0.0
16.0
NaN
NaN
NaN
NaN
0.000000
0.000000
# Sanity check: column dtypes and non-null counts of the saved feature table.
merchant3_feature. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8293 entries, 0 to 8292
Data columns (total 10 columns):
merchant_id 8293 non-null int64
total_sales 8293 non-null float64
sales_use_coupon 8293 non-null float64
total_coupon 8293 non-null float64
merchant_min_distance 3552 non-null float64
merchant_max_distance 3552 non-null float64
merchant_mean_distance 3552 non-null float64
merchant_median_distance 3552 non-null float64
merchant_coupon_transfer_rate 5363 non-null float64
coupon_rate 8194 non-null float64
dtypes: float64(9), int64(1)
memory usage: 712.7 KB
dataset2
# Merchant-level slice of the dataset2 feature window + distinct merchants.
merchant2 = feature2[['merchant_id', 'coupon_id',
                      'distance', 'date_received', 'date']].copy()
t = merchant2[['merchant_id']].drop_duplicates()
t.head(2)
merchant_id
0
2632
7
3381
# total_sales: consumption records per merchant (row count == sum of ones).
t1 = (merchant2.loc[merchant2.date.notnull(), ['merchant_id']]
      .groupby('merchant_id').size().reset_index(name='total_sales'))
t1.head(2)
merchant_id
total_sales
0
1
3
1
3
8
# sales_use_coupon: coupon-backed purchases per merchant.
t2 = (merchant2.loc[merchant2.date.notnull() & merchant2.coupon_id.notnull(),
                    ['merchant_id']]
      .groupby('merchant_id').size().reset_index(name='sales_use_coupon'))
t2.head(2)
merchant_id
sales_use_coupon
0
15
11
1
17
1
# total_coupon: coupons issued per merchant.
t3 = (merchant2.loc[merchant2.coupon_id.notnull(), ['merchant_id']]
      .groupby('merchant_id').size().reset_index(name='total_coupon'))
t3.head(2)
merchant_id
total_coupon
0
5
2
1
8
2
# Distance statistics per merchant over coupon-backed purchases.
t4 = merchant2.loc[merchant2.date.notnull() & merchant2.coupon_id.notnull(),
                   ['merchant_id', 'distance']].copy()
by_merchant = t4.groupby('merchant_id')['distance']
t5 = by_merchant.min().reset_index(name='merchant_min_distance')
t6 = by_merchant.max().reset_index(name='merchant_max_distance')
t7 = by_merchant.mean().reset_index(name='merchant_mean_distance')
t8 = by_merchant.median().reset_index(name='merchant_median_distance')
t8.head(2)
merchant_id
merchant_median_distance
0
15
0.0
1
17
1.0
# Assemble the merchant feature table for dataset2 and persist it.
merchant2_feature = t
for piece in (t1, t2, t3, t5, t6, t7, t8):
    merchant2_feature = pd.merge(merchant2_feature, piece,
                                 on='merchant_id', how='left')
# No coupon-backed sale -> 0, not NaN.
merchant2_feature.sales_use_coupon = merchant2_feature.sales_use_coupon.fillna(0)
# Rates computed before zeroing denominators, so missing denominators stay NaN.
merchant2_feature['merchant_coupon_transfer_rate'] = (
    merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_coupon)
merchant2_feature['coupon_rate'] = (
    merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_sales)
merchant2_feature.total_sales = merchant2_feature.total_sales.fillna(0)
merchant2_feature.total_coupon = merchant2_feature.total_coupon.fillna(0)
merchant2_feature.to_csv('merchant2_feature.csv', index=None)
merchant2_feature.head()
merchant_id
total_sales
sales_use_coupon
total_coupon
merchant_min_distance
merchant_max_distance
merchant_mean_distance
merchant_median_distance
merchant_coupon_transfer_rate
coupon_rate
0
2632
14.0
3.0
31.0
1.0
1.0
1.000000
1.0
0.096774
0.214286
1
3381
11537.0
1980.0
56499.0
0.0
10.0
1.697002
1.0
0.035045
0.171622
2
2099
5711.0
1497.0
12215.0
0.0
10.0
1.004814
0.0
0.122554
0.262126
3
1569
493.0
91.0
25173.0
0.0
10.0
2.388235
1.0
0.003615
0.184584
4
8390
1207.0
133.0
690.0
0.0
10.0
0.865079
0.0
0.192754
0.110191
dataset1
# Merchant-level slice of the dataset1 feature window + distinct merchants.
merchant1 = feature1[['merchant_id', 'coupon_id',
                      'distance', 'date_received', 'date']].copy()
t = merchant1[['merchant_id']].drop_duplicates()
t.head(2)
merchant_id
0
2632
8
3381
# total_sales: consumption records per merchant.
t1 = (merchant1.loc[merchant1.date.notnull(), ['merchant_id']]
      .groupby('merchant_id').size().reset_index(name='total_sales'))
t1.head(2)
merchant_id
total_sales
0
1
4
1
4
12
# sales_use_coupon: coupon-backed purchases per merchant.
t2 = (merchant1.loc[merchant1.date.notnull() & merchant1.coupon_id.notnull(),
                    ['merchant_id']]
      .groupby('merchant_id').size().reset_index(name='sales_use_coupon'))
t2.head(2)
merchant_id
sales_use_coupon
0
13
1
1
14
1
# total_coupon: coupons issued per merchant.
t3 = (merchant1.loc[merchant1.coupon_id.notnull(), ['merchant_id']]
      .groupby('merchant_id').size().reset_index(name='total_coupon'))
t3.head(2)
merchant_id
total_coupon
0
8
2
1
13
3
# Distance statistics per merchant over coupon-backed purchases (dataset1).
# Fix: take an explicit .copy() of the filtered slice — the dataset2/dataset3
# pipelines do this, but this cell didn't, leaving t4 a view-ambiguous slice
# (chained-assignment / SettingWithCopyWarning hazard if t4 is ever mutated).
t4 = merchant1[(merchant1.date.notnull()) & (
    merchant1.coupon_id.notnull())][['merchant_id', 'distance']].copy()
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance': 'merchant_min_distance'}, inplace=True)
t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance': 'merchant_max_distance'}, inplace=True)
t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance': 'merchant_mean_distance'}, inplace=True)
t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance': 'merchant_median_distance'}, inplace=True)
t8.head(2)
merchant_id
merchant_median_distance
0
13
0.0
1
14
0.0
# Assemble the merchant feature table for dataset1 and persist it.
merchant1_feature = t
for piece in (t1, t2, t3, t5, t6, t7, t8):
    merchant1_feature = pd.merge(merchant1_feature, piece,
                                 on='merchant_id', how='left')
# No coupon-backed sale -> 0, not NaN.
merchant1_feature.sales_use_coupon = merchant1_feature.sales_use_coupon.fillna(0)
# Rates computed before zeroing denominators, so missing denominators stay NaN.
merchant1_feature['merchant_coupon_transfer_rate'] = (
    merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_coupon)
merchant1_feature['coupon_rate'] = (
    merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_sales)
merchant1_feature.total_sales = merchant1_feature.total_sales.fillna(0)
merchant1_feature.total_coupon = merchant1_feature.total_coupon.fillna(0)
merchant1_feature.to_csv('merchant1_feature.csv', index=None)
merchant1_feature.head()
merchant_id
total_sales
sales_use_coupon
total_coupon
merchant_min_distance
merchant_max_distance
merchant_mean_distance
merchant_median_distance
merchant_coupon_transfer_rate
coupon_rate
0
2632
14.0
1.0
28.0
1.0
1.0
1.000000
1.0
0.035714
0.071429
1
3381
14962.0
2168.0
100369.0
0.0
10.0
1.650457
1.0
0.021600
0.144900
2
2099
5783.0
1705.0
16824.0
0.0
10.0
0.968072
0.0
0.101343
0.294830
3
4833
522.0
116.0
8321.0
0.0
10.0
3.037736
2.0
0.013941
0.222222
4
8390
1056.0
133.0
690.0
0.0
10.0
0.865079
0.0
0.192754
0.125947
User相关特征
column
含义
count_merchant
每个用户在多少商家产生消费
user_%_distance
距离特征
buy_use_coupon
领券又消费的数量
buy_total
消费数量
coupon_received
领券数量
%_user_date_datereceived_gap
领券和消费的时间差的三个特征
buy_use_coupon_rate
-
user_coupon_transfer_rate
-
dataset3
# User-level slice of the dataset3 feature window + distinct users.
user3 = feature3[['user_id', 'merchant_id', 'coupon_id',
                  'discount_rate', 'distance', 'date_received', 'date']].copy()
t = user3[['user_id']].drop_duplicates()
t.head(2)
user_id
1
1439408
7
1832624
# count_merchant: number of distinct merchants each user bought from.
t1 = (user3.loc[user3.date.notnull(), ['user_id', 'merchant_id']]
      .drop_duplicates()
      .groupby('user_id').size().reset_index(name='count_merchant'))
t1.head(2)
user_id
count_merchant
0
165
2
1
209
1
# Per-user distance statistics over coupon-backed purchases.
t2 = user3.loc[user3.date.notnull() & user3.coupon_id.notnull(),
               ['user_id', 'distance']].copy()
by_user = t2.groupby('user_id')['distance']
t3 = by_user.min().reset_index(name='user_min_distance')
t4 = by_user.max().reset_index(name='user_max_distance')
t5 = by_user.mean().reset_index(name='user_mean_distance')
t6 = by_user.median().reset_index(name='user_median_distance')
t6.head(2)
user_id
user_median_distance
0
417
0.0
1
687
NaN
# buy_use_coupon: coupon-backed purchases per user.
t7 = (user3.loc[user3.date.notnull() & user3.coupon_id.notnull(), ['user_id']]
      .groupby('user_id').size().reset_index(name='buy_use_coupon'))
t7.head(2)
user_id
buy_use_coupon
0
417
1
1
687
1
# buy_total: all purchases per user.
t8 = (user3.loc[user3.date.notnull(), ['user_id']]
      .groupby('user_id').size().reset_index(name='buy_total'))
t8.head(2)
user_id
buy_total
0
165
12
1
209
1
# coupon_received: coupons received per user.
t9 = (user3.loc[user3.coupon_id.notnull(), ['user_id']]
      .groupby('user_id').size().reset_index(name='coupon_received'))
t9.head(2)
user_id
coupon_received
0
4
1
1
165
2
def get_user_date_datereceived_gap(s):
    """Return the day gap between purchase date and coupon-received date.

    `s` is 'YYYYMMDD[...]:YYYYMMDD[...]' (purchase:received); any suffix
    after the 8th digit (e.g. a float's trailing '.0') is ignored.
    """
    parts = s.split(':')

    def _to_date(txt):
        # Slice positions 0-8 so '20160429.0' parses the same as '20160429'.
        return date(int(txt[:4]), int(txt[4:6]), int(txt[6:8]))

    return (_to_date(parts[0]) - _to_date(parts[1])).days
# Per-record gap (days) between coupon receipt and the actual purchase.
t10 = user3.loc[user3.date_received.notnull() & user3.date.notnull(),
                ['user_id', 'date_received', 'date']].copy()
# Fix: normalize the float dates (e.g. 20160429.0) to plain 'YYYYMMDD'
# strings via astype(int).astype(str), matching the dataset2/dataset1
# pipelines. The old plain astype(str) left a '.0' suffix and only worked
# because the parser happens to slice fixed character positions.
t10[['date_received', 'date']] = t10[[
    'date_received', 'date']].astype(int).astype(str)
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(
    get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]
t10.head(2)
user_id
user_date_datereceived_gap
6
1439408
28
33
1113008
6
# Mean / min / max of the receive-to-purchase gap, per user.
gap = t10.groupby('user_id')['user_date_datereceived_gap']
t11 = gap.mean().reset_index(name='avg_user_date_datereceived_gap')
t12 = gap.min().reset_index(name='min_user_date_datereceived_gap')
t13 = gap.max().reset_index(name='max_user_date_datereceived_gap')
t13.head(2)
user_id
max_user_date_datereceived_gap
0
417
14
1
687
5
# Assemble the user feature table for dataset3 and persist it.
user3_feature = t
for piece in (t1, t3, t4, t5, t6, t7, t8, t9, t11, t12, t13):
    user3_feature = pd.merge(user3_feature, piece, on='user_id', how='left')
# Users with no coupon-backed buys / no purchases at all -> 0, not NaN.
user3_feature.buy_use_coupon = user3_feature.buy_use_coupon.fillna(0)
user3_feature.count_merchant = user3_feature.count_merchant.fillna(0)
# Rates computed before zeroing denominators so missing denominators stay NaN.
user3_feature['buy_use_coupon_rate'] = (
    user3_feature.buy_use_coupon / user3_feature.buy_total)
user3_feature['user_coupon_transfer_rate'] = (
    user3_feature.buy_use_coupon / user3_feature.coupon_received)
user3_feature.buy_total = user3_feature.buy_total.fillna(0)
user3_feature.coupon_received = user3_feature.coupon_received.fillna(0)
user3_feature.to_csv('user3_feature.csv', index=None)
user3_feature.isnull().sum()
user3_feature.head()
user_id 0
count_merchant 0
user_min_distance 320676
user_max_distance 320676
user_mean_distance 320676
user_median_distance 320676
buy_use_coupon 0
buy_total 0
coupon_received 0
avg_user_date_datereceived_gap 316780
min_user_date_datereceived_gap 316780
max_user_date_datereceived_gap 316780
buy_use_coupon_rate 170895
user_coupon_transfer_rate 38354
dtype: int64
user_id
count_merchant
user_min_distance
user_max_distance
user_mean_distance
user_median_distance
buy_use_coupon
buy_total
coupon_received
avg_user_date_datereceived_gap
min_user_date_datereceived_gap
max_user_date_datereceived_gap
buy_use_coupon_rate
user_coupon_transfer_rate
0
1439408
1.0
0.0
0.0
0.0
0.0
1.0
2.0
4.0
28.0
28.0
28.0
0.5
0.25
1
1832624
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.00
2
2029232
1.0
NaN
NaN
NaN
NaN
0.0
2.0
2.0
NaN
NaN
NaN
0.0
0.00
3
2747744
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.00
4
196342
1.0
NaN
NaN
NaN
NaN
0.0
1.0
1.0
NaN
NaN
NaN
0.0
0.00
dataset2
# User-level slice of the dataset2 feature window + distinct users.
user2 = feature2[['user_id', 'merchant_id', 'coupon_id',
                  'discount_rate', 'distance', 'date_received', 'date']].copy()
t = user2[['user_id']].drop_duplicates()
t.head(2)
user_id
0
1439408
7
1832624
# count_merchant: distinct merchants each user bought from.
t1 = (user2.loc[user2.date.notnull(), ['user_id', 'merchant_id']]
      .drop_duplicates()
      .groupby('user_id').size().reset_index(name='count_merchant'))
t1.head(2)
user_id
count_merchant
0
165
2
1
184
1
# Per-user distance statistics over coupon-backed purchases.
t2 = user2.loc[user2.date.notnull() & user2.coupon_id.notnull(),
               ['user_id', 'distance']].copy()
by_user = t2.groupby('user_id')['distance']
t3 = by_user.min().reset_index(name='user_min_distance')
t4 = by_user.max().reset_index(name='user_max_distance')
t5 = by_user.mean().reset_index(name='user_mean_distance')
t6 = by_user.median().reset_index(name='user_median_distance')
t6.head(2)
user_id
user_median_distance
0
184
0.0
1
417
0.0
# buy_use_coupon: coupon-backed purchases per user.
t7 = (user2.loc[user2.date.notnull() & user2.coupon_id.notnull(), ['user_id']]
      .groupby('user_id').size().reset_index(name='buy_use_coupon'))
t7.head(2)
user_id
buy_use_coupon
0
184
1
1
417
1
# buy_total: all purchases per user.
t8 = (user2.loc[user2.date.notnull(), ['user_id']]
      .groupby('user_id').size().reset_index(name='buy_total'))
t8.head(2)
user_id
buy_total
0
165
8
1
184
1
# coupon_received: coupons received per user.
t9 = (user2.loc[user2.coupon_id.notnull(), ['user_id']]
      .groupby('user_id').size().reset_index(name='coupon_received'))
t9.head(2)
user_id
coupon_received
0
4
1
1
144
1
def get_user_date_datereceived_gap(s):
    """Day count from coupon receipt to purchase, for 'date:date_received'.

    Redefinition of the helper above (identical contract); each date is the
    first 8 characters 'YYYYMMDD' of its colon-separated field.
    """
    buy, got = s.split(':')[0], s.split(':')[1]
    buy_day = date(int(buy[0:4]), int(buy[4:6]), int(buy[6:8]))
    got_day = date(int(got[0:4]), int(got[4:6]), int(got[6:8]))
    return (buy_day - got_day).days
# Per-record receive-to-purchase gap (days) for dataset2 users.
t10 = user2.loc[user2.date_received.notnull() & user2.date.notnull(),
                ['user_id', 'date_received', 'date']].copy()
# Int round-trip strips the float '.0' so strings are plain 'YYYYMMDD'.
t10[['date_received', 'date']] = t10[[
    'date_received', 'date']].astype(int).astype(str)
t10['user_date_datereceived_gap'] = t10.date.str.cat(
    t10.date_received, sep=':').apply(get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]
t10.head(2)
user_id
user_date_datereceived_gap
38
2881376
8
75
114747
25
# Mean / min / max receive-to-purchase gap, per user.
gap = t10.groupby('user_id')['user_date_datereceived_gap']
t11 = gap.mean().reset_index(name='avg_user_date_datereceived_gap')
t12 = gap.min().reset_index(name='min_user_date_datereceived_gap')
t13 = gap.max().reset_index(name='max_user_date_datereceived_gap')
t13.head(2)
user_id
max_user_date_datereceived_gap
0
184
30
1
417
14
# Assemble the user feature table for dataset2 and persist it.
user2_feature = t
for piece in (t1, t3, t4, t5, t6, t7, t8, t9, t11, t12, t13):
    user2_feature = pd.merge(user2_feature, piece, on='user_id', how='left')
# Users with no coupon-backed buys / no purchases at all -> 0, not NaN.
user2_feature.buy_use_coupon = user2_feature.buy_use_coupon.fillna(0)
user2_feature.count_merchant = user2_feature.count_merchant.fillna(0)
# Rates computed before zeroing denominators so missing denominators stay NaN.
user2_feature['buy_use_coupon_rate'] = (
    user2_feature.buy_use_coupon / user2_feature.buy_total)
user2_feature['user_coupon_transfer_rate'] = (
    user2_feature.buy_use_coupon / user2_feature.coupon_received)
user2_feature.buy_total = user2_feature.buy_total.fillna(0)
user2_feature.coupon_received = user2_feature.coupon_received.fillna(0)
user2_feature.to_csv('user2_feature.csv', index=None)
user2_feature.head()
user_id
count_merchant
user_min_distance
user_max_distance
user_mean_distance
user_median_distance
buy_use_coupon
buy_total
coupon_received
avg_user_date_datereceived_gap
min_user_date_datereceived_gap
max_user_date_datereceived_gap
buy_use_coupon_rate
user_coupon_transfer_rate
0
1439408
1.0
NaN
NaN
NaN
NaN
0.0
1.0
2.0
NaN
NaN
NaN
0.0
0.0
1
1832624
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.0
2
73611
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.0
3
163606
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.0
4
94107
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.0
dataset1
# User-level slice of the dataset1 feature window + distinct users.
user1 = feature1[['user_id', 'merchant_id', 'coupon_id',
                  'discount_rate', 'distance', 'date_received', 'date']].copy()
t = user1[['user_id']].drop_duplicates()
t.head(2)
user_id
0
1439408
8
2029232
# count_merchant: distinct merchants each user bought from.
t1 = (user1.loc[user1.date.notnull(), ['user_id', 'merchant_id']]
      .drop_duplicates()
      .groupby('user_id').size().reset_index(name='count_merchant'))
t1.head(2)
user_id
count_merchant
0
165
2
1
184
1
# Per-user distance statistics over coupon-backed purchases.
t2 = user1.loc[user1.date.notnull() & user1.coupon_id.notnull(),
               ['user_id', 'distance']].copy()
by_user = t2.groupby('user_id')['distance']
t3 = by_user.min().reset_index(name='user_min_distance')
t4 = by_user.max().reset_index(name='user_max_distance')
t5 = by_user.mean().reset_index(name='user_mean_distance')
t6 = by_user.median().reset_index(name='user_median_distance')
t6.head(2)
user_id
user_median_distance
0
184
0.0
1
417
0.0
# buy_use_coupon: coupon-backed purchases per user (sum of ones == row count).
t7 = (user1.loc[user1.date.notnull() & user1.coupon_id.notnull(), ['user_id']]
      .groupby('user_id').size().reset_index(name='buy_use_coupon'))
t7.head(2)
user_id
buy_use_coupon
0
184
1
1
417
1
# buy_total: all purchases per user.
t8 = (user1.loc[user1.date.notnull(), ['user_id']]
      .groupby('user_id').size().reset_index(name='buy_total'))
t8.head(2)
user_id
buy_total
0
165
6
1
184
1
# coupon_received: coupons received per user.
t9 = (user1.loc[user1.coupon_id.notnull(), ['user_id']]
      .groupby('user_id').size().reset_index(name='coupon_received'))
t9.head(2)
user_id
coupon_received
0
4
1
1
35
4
# Per-record receive-to-purchase gap (days) for dataset1 users.
t10 = user1.loc[user1.date_received.notnull() & user1.date.notnull(),
                ['user_id', 'date_received', 'date']].copy()
# Int round-trip strips the float '.0' so strings are plain 'YYYYMMDD'.
t10[['date_received', 'date']] = t10[[
    'date_received', 'date']].astype(int).astype(str)
t10['user_date_datereceived_gap'] = t10.date.str.cat(
    t10.date_received, sep=':').apply(get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]
t10.head(2)
user_id
user_date_datereceived_gap
38
2881376
8
75
114747
25
# Mean / min / max receive-to-purchase gap, per user.
gap = t10.groupby('user_id')['user_date_datereceived_gap']
t11 = gap.mean().reset_index(name='avg_user_date_datereceived_gap')
t12 = gap.min().reset_index(name='min_user_date_datereceived_gap')
t13 = gap.max().reset_index(name='max_user_date_datereceived_gap')
t13.head(2)
user_id
max_user_date_datereceived_gap
0
184
30
1
417
14
# Assemble the user feature table for dataset1 and persist it.
user1_feature = t
for piece in (t1, t3, t4, t5, t6, t7, t8, t9, t11, t12, t13):
    user1_feature = pd.merge(user1_feature, piece, on='user_id', how='left')
# Users with no coupon-backed buys / no purchases at all -> 0, not NaN.
user1_feature.buy_use_coupon = user1_feature.buy_use_coupon.fillna(0)
user1_feature.count_merchant = user1_feature.count_merchant.fillna(0)
# Rates computed before zeroing denominators so missing denominators stay NaN.
user1_feature['buy_use_coupon_rate'] = (
    user1_feature.buy_use_coupon / user1_feature.buy_total)
user1_feature['user_coupon_transfer_rate'] = (
    user1_feature.buy_use_coupon / user1_feature.coupon_received)
user1_feature.buy_total = user1_feature.buy_total.fillna(0)
user1_feature.coupon_received = user1_feature.coupon_received.fillna(0)
user1_feature.to_csv('user1_feature.csv', index=None)
user1_feature.head()
user_id
count_merchant
user_min_distance
user_max_distance
user_mean_distance
user_median_distance
buy_use_coupon
buy_total
coupon_received
avg_user_date_datereceived_gap
min_user_date_datereceived_gap
max_user_date_datereceived_gap
buy_use_coupon_rate
user_coupon_transfer_rate
0
1439408
1.0
NaN
NaN
NaN
NaN
0.0
1.0
2.0
NaN
NaN
NaN
0.0
0.0
1
2029232
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.0
2
2223968
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.0
3
73611
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.0
4
3273056
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.0
user_merchant交互特征
column
含义
user_merchant_buy_total
消费的用户-商家行数
user_merchant_received
领券的用户-商家行数
user_merchant_buy_use_coupon
领券且消费的行数
user_merchant_any
用户-商家全部行数
user_merchant_buy_common
没领券且消费的用户-商家行数
dataset3
# Distinct (user, merchant) pairs in the feature3 window, plus the count of
# purchases per pair (user_merchant_buy_total).
all_user_merchant = feature3[['user_id', 'merchant_id']].drop_duplicates()
t = (feature3.loc[feature3.date.notnull(), ['user_id', 'merchant_id']]
     .groupby(['user_id', 'merchant_id']).size()
     .reset_index(name='user_merchant_buy_total'))
t.head(2)
user_id
merchant_id
user_merchant_buy_total
0
165
2934
6
1
165
4195
6
# user_merchant_received: coupons received per (user, merchant) pair.
t1 = (feature3.loc[feature3.coupon_id.notnull(), ['user_id', 'merchant_id']]
      .groupby(['user_id', 'merchant_id']).size()
      .reset_index(name='user_merchant_received'))
t1.head(2)
user_id
merchant_id
user_merchant_received
0
4
1469
1
1
165
2934
1
# user_merchant_buy_use_coupon: purchases that had a prior coupon receipt.
t2 = (feature3.loc[feature3.date.notnull() & feature3.date_received.notnull(),
                   ['user_id', 'merchant_id']]
      .groupby(['user_id', 'merchant_id']).size()
      .reset_index(name='user_merchant_buy_use_coupon'))
t2.head(2)
user_id
merchant_id
user_merchant_buy_use_coupon
0
417
775
1
1
687
8594
1
# user_merchant_any: all interaction rows per (user, merchant) pair.
t3 = (feature3[['user_id', 'merchant_id']]
      .groupby(['user_id', 'merchant_id']).size()
      .reset_index(name='user_merchant_any'))
t3.head(2)
user_id
merchant_id
user_merchant_any
0
4
1469
1
1
165
2934
7
# user_merchant_buy_common: purchases made without any coupon.
t4 = (feature3.loc[feature3.date.notnull() & feature3.coupon_id.isnull(),
                   ['user_id', 'merchant_id']]
      .groupby(['user_id', 'merchant_id']).size()
      .reset_index(name='user_merchant_buy_common'))
t4.head(2)
user_id
merchant_id
user_merchant_buy_common
0
165
2934
6
1
165
4195
6
# Assemble user-merchant features for dataset3: left-join each aggregate onto
# the full (user_id, merchant_id) pair list, derive the ratio features, then
# fill the counts that are genuinely zero.
user_merchant3 = pd.merge(all_user_merchant, t,
                          on=['user_id', 'merchant_id'], how='left')
for part in (t1, t2, t3, t4):
    user_merchant3 = pd.merge(user_merchant3, part,
                              on=['user_id', 'merchant_id'], how='left')
# A pair missing these counts simply never had such a row -> 0.
user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.replace(np.nan, 0)
user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.replace(np.nan, 0)
# Ratios are computed BEFORE filling the denominators so pairs with a zero
# denominator stay NaN instead of producing a division result.
user_merchant3['user_merchant_coupon_transfer_rate'] = (
    user_merchant3.user_merchant_buy_use_coupon / user_merchant3.user_merchant_received)
user_merchant3['user_merchant_coupon_buy_rate'] = (
    user_merchant3.user_merchant_buy_use_coupon / user_merchant3.user_merchant_buy_total)
user_merchant3['user_merchant_rate'] = (
    user_merchant3.user_merchant_buy_total / user_merchant3.user_merchant_any)
user_merchant3['user_merchant_common_buy_rate'] = (
    user_merchant3.user_merchant_buy_common / user_merchant3.user_merchant_buy_total)
user_merchant3.user_merchant_received = user_merchant3.user_merchant_received.fillna(0)
user_merchant3.user_merchant_buy_total = user_merchant3.user_merchant_buy_total.fillna(0)
user_merchant3.user_merchant_any = user_merchant3.user_merchant_any.fillna(0)
user_merchant3.to_csv('user_merchant3.csv', index=None)
dataset2
# Every distinct (user, merchant) pair seen in the feature-2 window.
all_user_merchant = feature2[['user_id', 'merchant_id']].copy()
all_user_merchant.drop_duplicates(inplace=True)

# user_merchant_buy_total: purchase rows (date not null) per pair.
t = feature2.loc[feature2.date.notnull(), ['user_id', 'merchant_id']].copy()
t['user_merchant_buy_total'] = 1
t = t.groupby(['user_id', 'merchant_id']).count().reset_index()
t.head(2)
user_id
merchant_id
user_merchant_buy_total
0
165
2934
4
1
165
4195
4
# user_merchant_received: coupon-received rows (coupon_id not null) per pair.
t1 = feature2.loc[feature2.coupon_id.notnull(), ['user_id', 'merchant_id']].copy()
t1['user_merchant_received'] = 1
t1 = t1.groupby(['user_id', 'merchant_id']).count().reset_index()
t1.head(2)
user_id
merchant_id
user_merchant_received
0
4
1433
1
1
144
1553
1
# user_merchant_buy_use_coupon: rows with both a received coupon and a purchase.
used_mask = feature2.date.notnull() & feature2.date_received.notnull()
t2 = feature2.loc[used_mask, ['user_id', 'merchant_id']].copy()
t2['user_merchant_buy_use_coupon'] = 1
t2 = t2.groupby(['user_id', 'merchant_id']).count().reset_index()
t2.head(2)
user_id
merchant_id
user_merchant_buy_use_coupon
0
184
3381
1
1
417
775
1
# user_merchant_any: total interaction rows per pair, whatever the action.
t3 = feature2[['user_id', 'merchant_id']].copy()
t3['user_merchant_any'] = 1
t3 = t3.groupby(['user_id', 'merchant_id']).count().reset_index()
t3.head(2)
user_id
merchant_id
user_merchant_any
0
4
1433
1
1
144
1553
1
# user_merchant_buy_common: purchases made without any coupon.
common_mask = feature2.date.notnull() & feature2.coupon_id.isnull()
t4 = feature2.loc[common_mask, ['user_id', 'merchant_id']].copy()
t4['user_merchant_buy_common'] = 1
t4 = t4.groupby(['user_id', 'merchant_id']).count().reset_index()
t4.head(2)
user_id
merchant_id
user_merchant_buy_common
0
165
2934
4
1
165
4195
4
# Assemble user-merchant features for dataset2: left-join each aggregate onto
# the full (user_id, merchant_id) pair list, derive the ratio features, then
# fill the counts that are genuinely zero.
user_merchant2 = pd.merge(all_user_merchant, t,
                          on=['user_id', 'merchant_id'], how='left')
for part in (t1, t2, t3, t4):
    user_merchant2 = pd.merge(user_merchant2, part,
                              on=['user_id', 'merchant_id'], how='left')
# A pair missing these counts simply never had such a row -> 0.
user_merchant2.user_merchant_buy_use_coupon = user_merchant2.user_merchant_buy_use_coupon.replace(np.nan, 0)
user_merchant2.user_merchant_buy_common = user_merchant2.user_merchant_buy_common.replace(np.nan, 0)
# Ratios are computed BEFORE filling the denominators so pairs with a zero
# denominator stay NaN instead of producing a division result.
user_merchant2['user_merchant_coupon_transfer_rate'] = (
    user_merchant2.user_merchant_buy_use_coupon / user_merchant2.user_merchant_received)
user_merchant2['user_merchant_coupon_buy_rate'] = (
    user_merchant2.user_merchant_buy_use_coupon / user_merchant2.user_merchant_buy_total)
user_merchant2['user_merchant_rate'] = (
    user_merchant2.user_merchant_buy_total / user_merchant2.user_merchant_any)
user_merchant2['user_merchant_common_buy_rate'] = (
    user_merchant2.user_merchant_buy_common / user_merchant2.user_merchant_buy_total)
user_merchant2.user_merchant_received = user_merchant2.user_merchant_received.fillna(0)
user_merchant2.user_merchant_buy_total = user_merchant2.user_merchant_buy_total.fillna(0)
user_merchant2.user_merchant_any = user_merchant2.user_merchant_any.fillna(0)
user_merchant2.to_csv('user_merchant2.csv', index=None)
dataset1
# Every distinct (user, merchant) pair seen in the feature-1 window.
all_user_merchant = feature1[['user_id', 'merchant_id']].copy()
all_user_merchant.drop_duplicates(inplace=True)

# user_merchant_buy_total: purchase rows (date not null) per pair.
t = feature1.loc[feature1.date.notnull(), ['user_id', 'merchant_id']].copy()
t['user_merchant_buy_total'] = 1
t = t.groupby(['user_id', 'merchant_id']).count().reset_index()
t.head(2)
user_id
merchant_id
user_merchant_buy_total
0
165
2934
4
1
165
4195
2
# user_merchant_received: coupon-received rows (coupon_id not null) per pair.
t1 = feature1.loc[feature1.coupon_id.notnull(), ['user_id', 'merchant_id']].copy()
t1['user_merchant_received'] = 1
t1 = t1.groupby(['user_id', 'merchant_id']).count().reset_index()
t1.head(2)
user_id
merchant_id
user_merchant_received
0
4
1433
1
1
35
3381
4
# user_merchant_buy_use_coupon: rows with both a received coupon and a purchase.
used_mask = feature1.date.notnull() & feature1.date_received.notnull()
t2 = feature1.loc[used_mask, ['user_id', 'merchant_id']].copy()
t2['user_merchant_buy_use_coupon'] = 1
t2 = t2.groupby(['user_id', 'merchant_id']).count().reset_index()
t2.head(2)
user_id
merchant_id
user_merchant_buy_use_coupon
0
184
3381
1
1
417
775
1
# user_merchant_any: total interaction rows per pair, whatever the action.
t3 = feature1[['user_id', 'merchant_id']].copy()
t3['user_merchant_any'] = 1
t3 = t3.groupby(['user_id', 'merchant_id']).count().reset_index()
t3.head(2)
user_id
merchant_id
user_merchant_any
0
4
1433
1
1
35
3381
4
# user_merchant_buy_common: purchases made without any coupon.
common_mask = feature1.date.notnull() & feature1.coupon_id.isnull()
t4 = feature1.loc[common_mask, ['user_id', 'merchant_id']].copy()
t4['user_merchant_buy_common'] = 1
t4 = t4.groupby(['user_id', 'merchant_id']).count().reset_index()
t4.head(2)
user_id
merchant_id
user_merchant_buy_common
0
165
2934
4
1
165
4195
2
# Assemble user-merchant features for dataset1: left-join each aggregate onto
# the full (user_id, merchant_id) pair list, derive the ratio features, then
# fill the counts that are genuinely zero.
user_merchant1 = pd.merge(all_user_merchant, t,
                          on=['user_id', 'merchant_id'], how='left')
for part in (t1, t2, t3, t4):
    user_merchant1 = pd.merge(user_merchant1, part,
                              on=['user_id', 'merchant_id'], how='left')
# A pair missing these counts simply never had such a row -> 0.
# fillna(0) used for consistency with the tail of this block.
user_merchant1.user_merchant_buy_use_coupon = user_merchant1.user_merchant_buy_use_coupon.fillna(0)
user_merchant1.user_merchant_buy_common = user_merchant1.user_merchant_buy_common.fillna(0)
# Ratios are computed BEFORE filling the denominators so pairs with a zero
# denominator stay NaN instead of producing a division result.
user_merchant1['user_merchant_coupon_transfer_rate'] = (
    user_merchant1.user_merchant_buy_use_coupon / user_merchant1.user_merchant_received)
user_merchant1['user_merchant_coupon_buy_rate'] = (
    user_merchant1.user_merchant_buy_use_coupon / user_merchant1.user_merchant_buy_total)
user_merchant1['user_merchant_rate'] = (
    user_merchant1.user_merchant_buy_total / user_merchant1.user_merchant_any)
user_merchant1['user_merchant_common_buy_rate'] = (
    user_merchant1.user_merchant_buy_common / user_merchant1.user_merchant_buy_total)
# BUG FIX: the original filled these three columns from *user_merchant2*
# (copy-paste error), overwriting dataset1's counts with dataset2's values.
# Fill from user_merchant1 itself.
user_merchant1.user_merchant_received = user_merchant1.user_merchant_received.fillna(0)
user_merchant1.user_merchant_buy_total = user_merchant1.user_merchant_buy_total.fillna(0)
user_merchant1.user_merchant_any = user_merchant1.user_merchant_any.fillna(0)
user_merchant1.to_csv('user_merchant1.csv', index=None)
生成训练集和测试集
def trans_date(s):
    """Render a yyyymmdd float as a string: NaN -> 'nan', 20160528.0 -> '20160528'.

    NaN is detected via the `s != s` identity (NaN is the only value not
    equal to itself), avoiding an extra math/numpy dependency.
    """
    return str(s) if s != s else str(int(s))
def get_label(s):
    """Label a 'consume_date:received_date' string (dates as 'yyyymmdd' or 'nan').

    Returns 1 when the coupon was redeemed within 15 days of receipt
    (or when date_received is missing), 0 otherwise.
    """
    consume, received = s.split(':')
    if consume == 'nan':        # never consumed -> negative sample
        return 0
    if received == 'nan':       # consumed without a recorded receipt date
        return 1
    gap = (date(int(consume[:4]), int(consume[4:6]), int(consume[6:8]))
           - date(int(received[:4]), int(received[4:6]), int(received[6:8]))).days
    return 1 if gap <= 15 else 0
测试集dataset3
# Load the pre-computed feature tables for the test window (dataset3).
coupon3 = pd.read_csv('coupon3_feature.csv')
merchant3 = pd.read_csv('merchant3_feature.csv')
user3 = pd.read_csv('user3_feature.csv')
user_merchant3 = pd.read_csv('user_merchant3.csv')
other_feature3 = pd.read_csv('other_feature3.csv')
coupon3.shape
(113640, 13)
# Left-join every feature group onto the coupon-level rows of the test set.
dataset3 = pd.merge(coupon3, merchant3, on='merchant_id', how='left')
dataset3 = pd.merge(dataset3, user3, on='user_id', how='left')
dataset3 = pd.merge(dataset3, user_merchant3,
                    on=['user_id', 'merchant_id'], how='left')
dataset3 = pd.merge(dataset3, other_feature3,
                    on=['user_id', 'coupon_id', 'date_received'], how='left')
dataset3.shape
(116204, 50)
# Counts that are missing after the left joins are genuinely zero; fill them.
# Uses fillna(0) uniformly instead of the original mix of fillna and
# replace(np.nan, 0), which are equivalent here.
for col in ['total_sales', 'sales_use_coupon', 'total_coupon',
            'user_merchant_buy_total', 'user_merchant_received',
            'user_merchant_buy_use_coupon', 'user_merchant_any',
            'user_merchant_buy_common', 'count_merchant',
            'buy_use_coupon', 'buy_total', 'coupon_received']:
    dataset3[col] = dataset3[col].fillna(0)
dataset3.isnull().sum()
user_id 0
merchant_id 0
coupon_id 0
discount_rate 0
distance 12230
date_received 0
day_of_week 0
day_of_month 0
days_distance 0
discount_man 0
discount_jian 0
is_man_jian 0
coupon_count 0
total_sales 0
sales_use_coupon 0
total_coupon 0
merchant_min_distance 5096
merchant_max_distance 5096
merchant_mean_distance 5096
merchant_median_distance 5096
merchant_coupon_transfer_rate 898
coupon_rate 55
count_merchant 0
user_min_distance 97093
user_max_distance 97093
user_mean_distance 97093
user_median_distance 97093
buy_use_coupon 0
buy_total 0
coupon_received 0
avg_user_date_datereceived_gap 95450
min_user_date_datereceived_gap 95450
max_user_date_datereceived_gap 95450
buy_use_coupon_rate 47959
user_coupon_transfer_rate 21231
user_merchant_buy_total 0
user_merchant_received 0
user_merchant_buy_use_coupon 0
user_merchant_any 0
user_merchant_buy_common 0
user_merchant_coupon_transfer_rate 85190
user_merchant_coupon_buy_rate 90569
user_merchant_rate 90569
user_merchant_common_buy_rate 90569
this_month_user_receive_same_coupon_count 0
this_month_user_receive_all_coupon_count 0
this_month_user_receive_same_coupon_lastone 0
this_month_user_receive_same_coupon_firstone 0
this_day_user_receive_all_coupon_count 0
this_day_user_receive_same_coupon_count 0
dtype: int64
# Weekend flag: day_of_week is 1..7 with 6/7 = Saturday/Sunday.
dataset3['is_weekend'] = dataset3.day_of_week.apply(
    lambda x: 1 if x in (6, 7) else 0)
# One-hot encode the weekday. Name each dummy after the actual weekday value
# rather than its column position, so names stay correct even if some weekday
# never occurs in the data (positional naming would silently misalign).
weekday_dummies = pd.get_dummies(dataset3.day_of_week)
weekday_dummies.columns = ['weekday' + str(int(c)) for c in weekday_dummies.columns]
dataset3 = pd.concat([dataset3, weekday_dummies], axis=1)
# merchant_id / coupon_count were only needed to build features; day_of_week
# is superseded by the dummies.
dataset3.drop(['merchant_id', 'day_of_week', 'coupon_count'],
              axis=1, inplace=True)
dataset3.head()
dataset3.to_csv('dataset3.csv', index=None)
user_id
coupon_id
discount_rate
distance
date_received
day_of_month
days_distance
discount_man
discount_jian
is_man_jian
total_sales
sales_use_coupon
total_coupon
merchant_min_distance
merchant_max_distance
merchant_mean_distance
merchant_median_distance
merchant_coupon_transfer_rate
coupon_rate
count_merchant
user_min_distance
user_max_distance
user_mean_distance
user_median_distance
buy_use_coupon
buy_total
coupon_received
avg_user_date_datereceived_gap
min_user_date_datereceived_gap
max_user_date_datereceived_gap
buy_use_coupon_rate
user_coupon_transfer_rate
user_merchant_buy_total
user_merchant_received
user_merchant_buy_use_coupon
user_merchant_any
user_merchant_buy_common
user_merchant_coupon_transfer_rate
user_merchant_coupon_buy_rate
user_merchant_rate
user_merchant_common_buy_rate
this_month_user_receive_same_coupon_count
this_month_user_receive_all_coupon_count
this_month_user_receive_same_coupon_lastone
this_month_user_receive_same_coupon_firstone
this_day_user_receive_all_coupon_count
this_day_user_receive_same_coupon_count
is_weekend
weekday1
weekday2
weekday3
weekday4
weekday5
weekday6
weekday7
0
4129537
9983
0.833333
1.0
20160712
12
12
30
5
1
7757.0
1239.0
35933.0
0.0
10.0
0.894068
0.0
0.034481
0.159727
0.0
NaN
NaN
NaN
NaN
0.0
0.0
2.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
0.0
1.0
0.0
0.0
NaN
NaN
NaN
1
1
-1
-1
1
1
0
0
1
0
0
0
0
0
1
6949378
3429
0.833333
NaN
20160706
6
6
30
5
1
119.0
17.0
100.0
0.0
8.0
0.666667
0.0
0.170000
0.142857
1.0
0.0
0.0
0.0
0.0
1.0
2.0
1.0
3.0
3.0
3.0
0.5
1.0
0.0
0.0
0.0
0.0
0.0
NaN
NaN
NaN
NaN
1
1
-1
-1
1
1
0
0
0
1
0
0
0
0
2
2166529
6928
0.900000
5.0
20160727
27
27
200
20
1
2280.0
100.0
3529.0
0.0
10.0
2.592593
1.0
0.028337
0.043860
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.0
0.0
0.0
0.0
0.0
0.0
NaN
NaN
NaN
NaN
1
2
-1
-1
2
1
0
0
0
1
0
0
0
0
3
2166529
1808
0.900000
5.0
20160727
27
27
100
10
1
2280.0
100.0
3529.0
0.0
10.0
2.592593
1.0
0.028337
0.043860
0.0
NaN
NaN
NaN
NaN
0.0
0.0
1.0
NaN
NaN
NaN
NaN
0.0
0.0
0.0
0.0
0.0
0.0
NaN
NaN
NaN
NaN
1
2
-1
-1
2
1
0
0
0
1
0
0
0
0
4
6172162
6500
0.966667
2.0
20160708
8
8
30
1
1
1065.0
182.0
795.0
0.0
10.0
0.783439
0.0
0.228931
0.170892
1.0
NaN
NaN
NaN
NaN
0.0
1.0
1.0
NaN
NaN
NaN
0.0
0.0
0.0
0.0
0.0
0.0
0.0
NaN
NaN
NaN
NaN
1
1
-1
-1
1
1
0
0
0
0
0
1
0
0
dataset2
# Load the pre-computed feature tables for training window 1 (dataset2).
coupon2 = pd.read_csv('coupon2_feature.csv')
merchant2 = pd.read_csv('merchant2_feature.csv')
user2 = pd.read_csv('user2_feature.csv')
user_merchant2 = pd.read_csv('user_merchant2.csv')
other_feature2 = pd.read_csv('other_feature2.csv')
coupon2.shape
(258446, 14)
# Left-join every feature group onto the coupon-level rows of dataset2.
dataset2 = pd.merge(coupon2, merchant2, on='merchant_id', how='left')
dataset2 = pd.merge(dataset2, user2, on='user_id', how='left')
dataset2 = pd.merge(dataset2, user_merchant2,
                    on=['user_id', 'merchant_id'], how='left')
dataset2 = pd.merge(dataset2, other_feature2,
                    on=['user_id', 'coupon_id', 'date_received'], how='left')
dataset2.shape
(262240, 51)
# Stringify the two date columns ('nan' for missing) so they can later be
# glued into one 'date:date_received' key for labelling.
dataset2['date'] = dataset2['date'].apply(trans_date)
dataset2['date_received'] = dataset2['date_received'].apply(trans_date)
dataset2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 262240 entries, 0 to 262239
Data columns (total 51 columns):
user_id 262240 non-null int64
merchant_id 262240 non-null int64
coupon_id 262240 non-null float64
discount_rate 262240 non-null float64
distance 231874 non-null float64
date_received 262240 non-null object
date 262240 non-null object
day_of_week 262240 non-null int64
day_of_month 262240 non-null int64
days_distance 262240 non-null int64
discount_man 262240 non-null int64
discount_jian 262240 non-null int64
is_man_jian 262240 non-null int64
coupon_count 262240 non-null int64
total_sales 259460 non-null float64
sales_use_coupon 259460 non-null float64
total_coupon 259460 non-null float64
merchant_min_distance 217418 non-null float64
merchant_max_distance 217418 non-null float64
merchant_mean_distance 217418 non-null float64
merchant_median_distance 217418 non-null float64
merchant_coupon_transfer_rate 240217 non-null float64
coupon_rate 258655 non-null float64
count_merchant 146649 non-null float64
user_min_distance 21545 non-null float64
user_max_distance 21545 non-null float64
user_mean_distance 21545 non-null float64
user_median_distance 21545 non-null float64
buy_use_coupon 146649 non-null float64
buy_total 146649 non-null float64
coupon_received 146649 non-null float64
avg_user_date_datereceived_gap 23316 non-null float64
min_user_date_datereceived_gap 23316 non-null float64
max_user_date_datereceived_gap 23316 non-null float64
buy_use_coupon_rate 112596 non-null float64
user_coupon_transfer_rate 112753 non-null float64
user_merchant_buy_total 73321 non-null float64
user_merchant_received 73321 non-null float64
user_merchant_buy_use_coupon 73321 non-null float64
user_merchant_any 73321 non-null float64
user_merchant_buy_common 73321 non-null float64
user_merchant_coupon_transfer_rate 36972 non-null float64
user_merchant_coupon_buy_rate 64125 non-null float64
user_merchant_rate 64125 non-null float64
user_merchant_common_buy_rate 64125 non-null float64
this_month_user_receive_same_coupon_count 262240 non-null int64
this_month_user_receive_all_coupon_count 262240 non-null int64
this_month_user_receive_same_coupon_lastone 262240 non-null int64
this_month_user_receive_same_coupon_firstone 262240 non-null int64
this_day_user_receive_all_coupon_count 262240 non-null int64
this_day_user_receive_same_coupon_count 262240 non-null int64
dtypes: float64(34), int64(15), object(2)
memory usage: 104.0+ MB
# Counts that are missing after the left joins are genuinely zero; fill them.
# Uses fillna(0) uniformly instead of the original mix of fillna and
# replace(np.nan, 0), which are equivalent here.
for col in ['total_sales', 'sales_use_coupon', 'total_coupon',
            'user_merchant_buy_total', 'user_merchant_received',
            'user_merchant_buy_use_coupon', 'user_merchant_any',
            'user_merchant_buy_common', 'count_merchant',
            'buy_use_coupon', 'buy_total', 'coupon_received']:
    dataset2[col] = dataset2[col].fillna(0)
dataset2.isnull().sum()
user_id 0
merchant_id 0
coupon_id 0
discount_rate 0
distance 30366
date_received 0
date 0
day_of_week 0
day_of_month 0
days_distance 0
discount_man 0
discount_jian 0
is_man_jian 0
coupon_count 0
total_sales 0
sales_use_coupon 0
total_coupon 0
merchant_min_distance 44822
merchant_max_distance 44822
merchant_mean_distance 44822
merchant_median_distance 44822
merchant_coupon_transfer_rate 22023
coupon_rate 3585
count_merchant 0
user_min_distance 240695
user_max_distance 240695
user_mean_distance 240695
user_median_distance 240695
buy_use_coupon 0
buy_total 0
coupon_received 0
avg_user_date_datereceived_gap 238924
min_user_date_datereceived_gap 238924
max_user_date_datereceived_gap 238924
buy_use_coupon_rate 149644
user_coupon_transfer_rate 149487
user_merchant_buy_total 0
user_merchant_received 0
user_merchant_buy_use_coupon 0
user_merchant_any 0
user_merchant_buy_common 0
user_merchant_coupon_transfer_rate 225268
user_merchant_coupon_buy_rate 198115
user_merchant_rate 198115
user_merchant_common_buy_rate 198115
this_month_user_receive_same_coupon_count 0
this_month_user_receive_all_coupon_count 0
this_month_user_receive_same_coupon_lastone 0
this_month_user_receive_same_coupon_firstone 0
this_day_user_receive_all_coupon_count 0
this_day_user_receive_same_coupon_count 0
dtype: int64
# Weekend flag: day_of_week is 1..7 with 6/7 = Saturday/Sunday.
dataset2['is_weekend'] = dataset2.day_of_week.apply(
    lambda x: 1 if x in (6, 7) else 0)
# One-hot encode the weekday, naming each dummy after the actual weekday
# value (not column position) so names stay correct if a weekday is absent.
weekday_dummies = pd.get_dummies(dataset2.day_of_week)
weekday_dummies.columns = ['weekday' + str(int(c)) for c in weekday_dummies.columns]
dataset2 = pd.concat([dataset2, weekday_dummies], axis=1)
# label = 1 iff the coupon was redeemed within 15 days of receipt.
dataset2['label'] = dataset2.date + ':' + dataset2.date_received
dataset2.label = dataset2.label.apply(get_label)
dataset2[['date_received', 'date', 'label']].head(30)
date_received
date
label
0
20160528
nan
0
1
20160613
nan
0
2
20160516
20160613
0
3
20160530
nan
0
4
20160519
nan
0
5
20160606
nan
0
6
20160606
nan
0
7
20160518
nan
0
8
20160528
nan
0
9
20160524
nan
0
10
20160523
nan
0
11
20160515
20160521
1
12
20160521
nan
0
13
20160523
nan
0
14
20160602
nan
0
15
20160523
20160605
1
16
20160605
nan
0
17
20160607
nan
0
18
20160605
nan
0
19
20160601
nan
0
20
20160530
nan
0
21
20160522
nan
0
22
20160611
nan
0
23
20160524
nan
0
24
20160526
nan
0
25
20160520
nan
0
26
20160527
nan
0
27
20160602
nan
0
28
20160526
nan
0
29
20160521
nan
0
# Drop identifiers and label-leaking raw dates before persisting the
# training table.
dataset2.drop(['merchant_id', 'day_of_week', 'date', 'date_received',
               'coupon_id', 'coupon_count'], axis=1, inplace=True)
dataset2.to_csv('dataset2.csv', index=None)
dataset1
# Load the pre-computed feature tables for training window 2 (dataset1)
# and left-join them onto the coupon-level rows.
coupon1 = pd.read_csv('coupon1_feature.csv')
merchant1 = pd.read_csv('merchant1_feature.csv')
user1 = pd.read_csv('user1_feature.csv')
user_merchant1 = pd.read_csv('user_merchant1.csv')
other_feature1 = pd.read_csv('other_feature1.csv')
dataset1 = pd.merge(coupon1, merchant1, on='merchant_id', how='left')
dataset1 = pd.merge(dataset1, user1, on='user_id', how='left')
dataset1 = pd.merge(dataset1, user_merchant1,
                    on=['user_id', 'merchant_id'], how='left')
dataset1 = pd.merge(dataset1, other_feature1,
                    on=['user_id', 'coupon_id', 'date_received'], how='left')
dataset1.shape
(139785, 51)
# Counts that are missing after the left joins are genuinely zero; fill them.
# Uses fillna(0) uniformly instead of the original mix of fillna and
# replace(np.nan, 0), which are equivalent here.
for col in ['total_sales', 'sales_use_coupon', 'total_coupon',
            'user_merchant_buy_total', 'user_merchant_received',
            'user_merchant_buy_use_coupon', 'user_merchant_any',
            'user_merchant_buy_common', 'count_merchant',
            'buy_use_coupon', 'buy_total', 'coupon_received']:
    dataset1[col] = dataset1[col].fillna(0)
dataset1.isnull().sum()
user_id 0
merchant_id 0
coupon_id 0
discount_rate 0
distance 15492
date_received 0
date 127747
day_of_week 0
day_of_month 0
days_distance 0
discount_man 0
discount_jian 0
is_man_jian 0
coupon_count 0
total_sales 0
sales_use_coupon 0
total_coupon 0
merchant_min_distance 33627
merchant_max_distance 33627
merchant_mean_distance 33627
merchant_median_distance 33627
merchant_coupon_transfer_rate 25176
coupon_rate 3950
count_merchant 0
user_min_distance 131176
user_max_distance 131176
user_mean_distance 131176
user_median_distance 131176
buy_use_coupon 0
buy_total 0
coupon_received 0
avg_user_date_datereceived_gap 130508
min_user_date_datereceived_gap 130508
max_user_date_datereceived_gap 130508
buy_use_coupon_rate 84347
user_coupon_transfer_rate 86167
user_merchant_buy_total 0
user_merchant_received 0
user_merchant_buy_use_coupon 0
user_merchant_any 0
user_merchant_buy_common 0
user_merchant_coupon_transfer_rate 123040
user_merchant_coupon_buy_rate 103555
user_merchant_rate 103555
user_merchant_common_buy_rate 103555
this_month_user_receive_same_coupon_count 0
this_month_user_receive_all_coupon_count 0
this_month_user_receive_same_coupon_lastone 0
this_month_user_receive_same_coupon_firstone 0
this_day_user_receive_all_coupon_count 0
this_day_user_receive_same_coupon_count 0
dtype: int64
# Weekend flag: day_of_week is 1..7 with 6/7 = Saturday/Sunday.
dataset1['is_weekend'] = dataset1.day_of_week.apply(
    lambda x: 1 if x in (6, 7) else 0)
# One-hot encode the weekday, naming each dummy after the actual weekday
# value (not column position) so names stay correct if a weekday is absent.
weekday_dummies = pd.get_dummies(dataset1.day_of_week)
weekday_dummies.columns = ['weekday' + str(int(c)) for c in weekday_dummies.columns]
dataset1 = pd.concat([dataset1, weekday_dummies], axis=1)
# Stringify dates, build the 'date:date_received' key and derive the label
# (1 iff redeemed within 15 days of receipt).
dataset1['date'] = dataset1['date'].apply(trans_date)
dataset1['date_received'] = dataset1['date_received'].apply(trans_date)
dataset1['label'] = dataset1.date + ':' + dataset1.date_received
dataset1.label = dataset1.label.apply(get_label)
# Drop identifiers and label-leaking raw dates before persisting.
dataset1.drop(['merchant_id', 'day_of_week', 'date', 'date_received',
               'coupon_id', 'coupon_count'], axis=1, inplace=True)
dataset1.to_csv('dataset1.csv', index=None)
模型建立与预测
# --- Notebook preamble for the modelling section: imports + display config ---
from IPython. core. interactiveshell import InteractiveShell
from jupyterthemes import jtplot
import os
from datetime import date
import xgboost as xgb
from sklearn. model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn. pipeline import Pipeline
from sklearn. linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn. preprocessing import StandardScaler
from sklearn. metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn. preprocessing import MinMaxScaler
from sklearn. neighbors import KNeighborsClassifier
from sklearn. tree import DecisionTreeClassifier
from sklearn. ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import matplotlib. dates as mdates
import seaborn as sns
# IPython cell magic: render figures inline (valid only inside a notebook).
% matplotlib inline
# Use a CJK-capable font and keep the minus sign renderable with it.
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
# Wide-frame / long-frame display settings for interactive inspection.
pd. set_option( 'expand_frame_repr' , False )
pd. set_option( 'display.max_rows' , 200 )
pd. set_option( 'display.max_columns' , 200 )
jtplot. style( theme= 'oceans16' )
# Echo every expression in a cell, not just the last one.
InteractiveShell. ast_node_interactivity = "all"
# Load the three assembled tables produced by the feature-engineering steps.
dataset1 = pd. read_csv( 'dataset1.csv' )
dataset2 = pd. read_csv( 'dataset2.csv' )
dataset3 = pd. read_csv( 'dataset3.csv' )
# NOTE(review): the triple-quoted string below is a *disabled* de-duplication
# pass (commented out via a string literal); it never executes.
'''dataset1.shape,dataset2.shape,dataset3.shape
dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset3.drop_duplicates(inplace=True)
dataset1.shape,dataset2.shape,dataset3.shape'''
'dataset1.shape,dataset2.shape,dataset3.shape\ndataset1.drop_duplicates(inplace=True)\ndataset2.drop_duplicates(inplace=True)\ndataset3.drop_duplicates(inplace=True)\ndataset1.shape,dataset2.shape,dataset3.shape'
# Stack the two training windows; keep per-window X/y splits as well.
dataset12 = pd.concat([dataset1, dataset2], axis=0)
dataset1_y = dataset1.label
dataset1_x = dataset1.drop(['user_id', 'label'], axis=1)
dataset2_y = dataset2.label
dataset2_x = dataset2.drop(['user_id', 'label'], axis=1)
dataset12_y = dataset12.label
dataset12_x = dataset12.drop(['user_id', 'label'], axis=1)
# .copy() so the later prediction-column assignment writes to an independent
# frame rather than a view of dataset3 (original triggered
# SettingWithCopyWarning and relied on chained-assignment behaviour).
dataset3_preds = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset3_x = dataset3.drop(['user_id', 'coupon_id', 'date_received'], axis=1)
dataset1_x.shape, dataset2_x.shape, dataset3_x.shape
((139785, 52), (262240, 52), (116204, 52))
# Wrap the feature matrices for xgboost; the test set has no label.
dataset1 = xgb.DMatrix(dataset1_x, label=dataset1_y)
dataset2 = xgb.DMatrix(dataset2_x, label=dataset2_y)
dataset12 = xgb.DMatrix(dataset12_x, label=dataset12_y)
dataset3 = xgb.DMatrix(dataset3_x)
D:\anaconda3\lib\site-packages\xgboost\core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version
if getattr(data, 'base', None) is not None and \
D:\anaconda3\lib\site-packages\xgboost\core.py:588: FutureWarning: Series.base is deprecated and will be removed in a future version
data.base is not None and isinstance(data, np.ndarray) \
# xgboost hyper-parameters: pairwise ranking objective, evaluated by AUC.
params = {
    'booster': 'gbtree',
    'objective': 'rank:pairwise',   # optimise the pairwise ranking order
    'eval_metric': 'auc',
    'gamma': 0.1,                   # min loss reduction required to split
    'min_child_weight': 1.1,
    'max_depth': 5,
    'lambda': 10,                   # L2 regularisation weight
    'subsample': 0.7,               # row subsampling per tree
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'eta': 0.01,                    # learning rate
    'tree_method': 'exact',
    'seed': 0,                      # reproducibility
    'nthread': 12,
}
# Train on the stacked training windows, monitoring training AUC only.
watchlist = [(dataset12, 'train')]
model = xgb.train(params, dataset12, num_boost_round=3500, evals=watchlist)
# Score the test set and min-max scale the raw ranking scores into [0, 1]
# for the submission file.
dataset3_preds['label'] = model.predict(dataset3)
dataset3_preds.label = MinMaxScaler().fit_transform(
    dataset3_preds.label.values.reshape(-1, 1))
dataset3_preds.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset3_preds.to_csv("xgb_preds.csv", index=None, header=None)
dataset3_preds.describe()
user_id
coupon_id
date_received
label
count
1.162040e+05
116204.000000
1.162040e+05
116204.000000
mean
3.678026e+06
9014.669702
2.016072e+07
0.310783
std
2.122548e+06
4136.488170
9.016543e+00
0.129064
min
2.090000e+02
3.000000
2.016070e+07
0.000000
25%
1.858042e+06
5143.000000
2.016071e+07
0.229695
50%
3.666466e+06
9983.000000
2.016072e+07
0.290300
75%
5.516537e+06
13602.000000
2.016072e+07
0.373554
max
7.361024e+06
14045.000000
2.016073e+07
1.000000