Linear regression and logistic regression: a comprehensive worked example
Comprehensive application
The previous two chapters covered, respectively, user stratification with logistic regression and the construction of lift charts. This chapter brings the two together and stratifies users along both dimensions at once.
1 Logistic regression modeling
#########################################################################################################################################
############################################################ Part1: logistic model #####################################################
#########################################################################################################################################
file_path_logistic <- "data_response_model.csv" # change to your local path
# Read the raw response-model data; keep strings as character
# (spell out FALSE -- T/F are ordinary variables in R and can be reassigned)
raw_logistic <- read.csv(file_path_logistic, stringsAsFactors = FALSE)
# Keep only the model-building ("build") segment as the training sample
train_logistic <- raw_logistic[raw_logistic$segment == 'build', ]
# Final predictor variables for the response (logistic) model
var_list_logistic <- c('m1_WEB_MNTHS_SINCE_LAST_SES',
                       'm1_POS_MNTHS_LAST_ORDER',
                       'm1_POS_NUM_ORDERS_24MO',
                       'm1_pos_mo_btwn_fst_lst_order',
                       'm1_EM_COUNT_VALID',
                       'm1_POS_TOT_REVPERSYS',
                       'm1_EM_MONTHS_LAST_OPEN',
                       'm1_POS_LAST_ORDER_DPA'
) # the final model variables
# Keep the dependent variable and the selected predictors only
mods_logistic <- train_logistic[, c('dv_response', var_list_logistic)]
# Fit the logistic regression: P(dv_response = 1) ~ all selected predictors
model_glm <- glm(dv_response ~ ., data = mods_logistic, family = binomial(link = "logit"))
summary(model_glm) # model summary
The prepared variables and data are fed directly into the logistic regression model.
2 Linear regression modeling
#########################################################################################################################################
############################################################ Part2: Linear model #######################################################
#########################################################################################################################################
file_path_linear <- "data_revenue_model.csv" # change to your local path
# Read the raw revenue-model data; keep strings as character
# (spell out FALSE -- T/F are ordinary variables in R and can be reassigned)
raw_linear <- read.csv(file_path_linear, stringsAsFactors = FALSE)
# Keep only the model-building ("build") segment as the training sample
train_linear <- raw_linear[raw_linear$segment == 'build', ]
# Final predictor variables for the revenue (linear) model
var_list_linear <- c('m2_POS_REVENUE_BASE',
                     'm2_POS_LAST_TOTAL_REVENUE',
                     'm2_POS_MNTHS_LAST_ORDER',
                     'm2_POS_REVENUE_BASE_SP_6MO',
                     'm2_POS_SP_QTY_24MO',
                     'm2_POS_TOT_REVPERSYS',
                     'm2_WEB_MNTHS_SINCE_LAST_SES',
                     'm2_SH_MNTHS_LAST_INQUIRED'
) # the final model variables
# Keep the dependent variable and the selected predictors only
mods_linear <- train_linear[, c('dv_revenue', var_list_linear)]
# Fit the linear regression: dv_revenue ~ all selected predictors
model_lm <- lm(dv_revenue ~ ., data = mods_linear)
summary(model_lm) # model summary
3 Integrated applications
#########################################################################################################################################
############################################################ Part3: application of the 2 models ########################################
#########################################################################################################################################
# Score the two-stage data with both models
file_path <- "two_stage_data.csv"
# read in data (FALSE spelled out, not F)
raw <- read.csv(file_path, stringsAsFactors = FALSE)
pred_prob_resp <- predict(model_glm, raw, type = 'response') # logistic model: predicted response probability
pred_prob_reve <- predict(model_lm, raw, type = 'response')  # linear model: predicted revenue
# Expected value per customer = predicted revenue * predicted response probability
combo <- pred_prob_resp * pred_prob_reve # usage 1
head(combo)
# 3.1.1 separate into 10 groups based on combo (labels 10 = lowest bin ... 1 = highest bin)
# NOTE(review): if ties collapse the quantile breaks, there will be fewer than 10 bins and
# labels = 10:1 will no longer match -- cut() would then error; confirm on real data.
decile_combo <- cut(combo, unique(quantile(combo, (0:10)/10)), labels = 10:1, include.lowest = TRUE)
table(decile_combo)
Group the scores into 10 quantile bins, sort them to define user tiers, and aggregate per group to compute the active-user ratio (analogous to SQL's count(*) ... group by). Dividing each group's active rate by the overall active rate yields the lift chart for the grouping.
# performance for response based on decile_combo
library(plyr)
# Put actual response, predicted response and combo decile side by side
combo_resp <- data.frame(actual = raw$dv_response,
                         pred_prob_resp = pred_prob_resp,
                         decile_combo_resp = decile_combo)
# Group by decile: customer count and number of responders per decile
combo_decile_sum_resp <- ddply(combo_resp, .(decile_combo_resp), summarise,
                               cnt = length(actual), resp = sum(actual))
combo_decile_sum_resp
# Add response rate (rr) and lift index (100 = overall average response rate)
combo_decile_sum_resp2 <- within(combo_decile_sum_resp,
                                 {rr <- resp / cnt
                                  index <- 100 * rr / (sum(resp) / sum(cnt))
                                 }) # add rr, index
# Order the deciles from highest label downward (TRUE spelled out, not T)
combo_decile_sum_resp3 <- combo_decile_sum_resp2[order(combo_decile_sum_resp2[, 1], decreasing = TRUE), ]
View(combo_decile_sum_resp3)
The output shows a clear lift across the deciles.
Next, analyze the consumption amounts predicted by the linear regression in the same way: group by decile and compute (group average / overall average) * 100 to obtain the lift.
# performance for revenue based on decile_combo
# put actual revenue, predicted revenue, decile together
# Put actual revenue, predicted revenue and combo decile side by side
combo_reve <- data.frame(actual = raw$dv_revenue,
                         pred_prob_reve = pred_prob_reve,
                         decile_combo_reve = decile_combo)
# Group by decile: customer count and total actual revenue per decile
combo_decile_sum_reve <- ddply(combo_reve, .(decile_combo_reve), summarise,
                               cnt = length(actual), rev = sum(actual))
combo_decile_sum_reve
# Add average revenue (rev_avg) and lift index (100 = overall average revenue)
combo_decile_sum_reve2 <- within(combo_decile_sum_reve,
                                 {rev_avg <- rev / cnt
                                  index <- 100 * rev_avg / (sum(rev) / sum(cnt))
                                 }) # add rev_avg, index
# Order the deciles from highest label downward (TRUE spelled out, not T)
combo_decile_sum_reve3 <- combo_decile_sum_reve2[order(combo_decile_sum_reve2[, 1], decreasing = TRUE), ]
View(combo_decile_sum_reve3)
4 Cross-tabulation of consumption amount and response rate
# response part
# separate into 10 groups based on predicted response
# Decile the predicted response probabilities (labels 10 = lowest bin ... 1 = highest bin)
decile_resp <- cut(pred_prob_resp, unique(quantile(pred_prob_resp, (0:10)/10)),
                   labels = 10:1, include.lowest = TRUE)
table(decile_resp)
# revenue part
# separate into 10 groups based on predicted revenue
# Decile the predicted revenue (labels 10 = lowest bin ... 1 = highest bin)
decile_rev <- cut(pred_prob_reve, unique(quantile(pred_prob_reve, (0:10)/10)),
                  labels = 10:1, include.lowest = TRUE)
table(decile_rev)
# set together
# One row per customer: actuals, both model scores, and both decile assignments
decile_cross <- data.frame( # rid = raw$rid,  # optional customer id
  dv_response = raw$dv_response,
  dv_revenue = raw$dv_revenue,
  pred_prob_resp = pred_prob_resp,
  pred_prob_reve = pred_prob_reve,
  decile_resp = decile_resp,
  decile_reve = decile_rev
)
View(decile_cross)
decile_cross
# Cross-tabulate response deciles against revenue deciles (counts)
cross_table_freq <- table(decile_resp = decile_cross$decile_resp,
                          decile_reve = decile_cross$decile_reve)
View(cross_table_freq)
# Same cross table expressed as proportions of all customers
cross_table_pct <- prop.table(cross_table_freq)
View(cross_table_pct)
The results split users into 10 consumption-amount levels crossed with 10 activity levels, i.e. 10 * 10 = 100 customer segments in total.
library(sqldf)
# sqldf queries data frames, not 'table' objects, so convert the cross table first;
# as.data.frame(<2-d table>) yields the long form with columns decile_resp, decile_reve, Freq
cross_table_df <- as.data.frame(cross_table_freq)
# Pivot to wide form: one row per revenue decile ("rank"), one column per response decile.
# All quotes must be straight ASCII quotes; curly quotes are a syntax error in R.
a <- sqldf("select decile_reve as rank,
            sum(case decile_resp when 1 then Freq else 0 end) as '1',
            sum(case decile_resp when 2 then Freq else 0 end) as '2',
            sum(case decile_resp when 3 then Freq else 0 end) as '3',
            sum(case decile_resp when 4 then Freq else 0 end) as '4',
            sum(case decile_resp when 5 then Freq else 0 end) as '5',
            sum(case decile_resp when 6 then Freq else 0 end) as '6',
            sum(case decile_resp when 7 then Freq else 0 end) as '7',
            sum(case decile_resp when 8 then Freq else 0 end) as '8',
            sum(case decile_resp when 9 then Freq else 0 end) as '9',
            sum(case decile_resp when 10 then Freq else 0 end) as '10'
            from cross_table_df
            group by rank*1") # rank*1 forces numeric (not lexicographic) grouping order
Running the SQL statement yields the cross table of customer counts by consumption level and response level.
Joining these segments back to the original customer data and presenting them with visualization tools then shows the distribution of customer labels across the groups.