R语言建立决策树模型(movie数据集)

导入数据集

将movie、MPAA、competition、star、genre、TechEffect按字符型读入,其他变量按数值型读入。

# Read the learning set: the first six columns as character, the last four
# as numeric, then convert the categorical columns to factors in one mutate().
learn <-
  read.csv("E:\\RHome\\movie_learning.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(
    MPAA = as.factor(MPAA),
    competition = as.factor(competition),
    star = as.factor(star),
    genre = as.factor(genre),
    TechEffect = as.factor(TechEffect),
    GrossCat = as.factor(GrossCat),
    GrossCat2 = as.factor(GrossCat2)
  )

将不是哑变量形式的定类自变量转换成因子型变量。

# Read the test set with the same column classes as the learning set and
# convert the categorical columns to factors in one mutate().
test <-
  read.csv("E:\\RHome\\movie_test.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(
    MPAA = as.factor(MPAA),
    competition = as.factor(competition),
    star = as.factor(star),
    genre = as.factor(genre),
    TechEffect = as.factor(TechEffect),
    GrossCat = as.factor(GrossCat),
    GrossCat2 = as.factor(GrossCat2)
  )

在屏幕上查看movie_learning和movie_test数据集中各个变量的基本情况

str(learn)

str(test)

可以看出,movie_learning中genre变量的类型为“Factor w/ 8 levels”,而movie_test中genre变量的类型为“Factor w/ 6 levels”。下面需要修改这个变量,使它们在movie_learning和movie_test数据集的因子水平一样

table(learn$genre)

table(test$genre)

通过查看两个数据集中genre变量的频数分布可以看出:

学习数据集movie_learning中genre的取值"Action"和"Docum"在测试数据集movie_test中没有出现。修改test中的genre变量的因子水平。

使用factor()函数将genre变量设为因子型变量,因子水平与movie_learning中genre变量的因子水平一样。

对二值因变量GrossCat2建立决策树模型

通过相关系数找出与二值因变量最相关的自变量

# Recode the labelled categorical columns of `insurance` as integer codes so
# that a correlation matrix can be computed.
# match() returns the position of each value in the label vector (1, 2, ...),
# which is the same code the original element-by-element loops assigned.
# It is vectorised, works on empty input, and yields NA (instead of an error)
# for missing or unlisted values.
insurance$MPAA <- match(insurance$MPAA, c("G", "PG", "PG13", "R"))
insurance$competition <- match(
  insurance$competition,
  c("High", "Medium", "Low")
)
insurance$star <- match(insurance$star, c("A", "B", "C"))
insurance$TechEffect <- match(
  insurance$TechEffect,
  c("High", "Medium", "Low")
)

# Coerce every column to numeric. Columns holding non-numeric text become NA
# with a coercion warning (presumably movie and genre -- verify on the data).
insurance <- as.data.frame(lapply(insurance, as.numeric))

# Pairwise correlation matrix of all columns.
cor(insurance)

由相关系数矩阵得出,相关度最高的两个变量为sequel和screens。

建立模型

# Fit a tree for GrossCat2 using sequel and screens as predictors.
model <- rpart(formula = GrossCat2 ~ sequel + screens, data = learn)

查看模型结果

summary(model)

决策树可视化 

rpart.plot(model)

列出对应规则

asRules(model)

使用fancyRpartPlot展现更美观的决策树

fancyRpartPlot(model)   

查看交叉验证结果

model$cptable   

查看交叉验证结果图

plotcp(model) 

根据交叉验证结果,找出估计误差最小时的cp值,并重新建立模型。

# "xerror" column of the cp table, one value per candidate tree size.
xerr <- model[["cptable"]][, "xerror"]
# Row of the cp table where that error is smallest.
minxerr <- which.min(xerr)

选择交叉验证的估计误差最小时对应的cp

# cp value stored in the row selected by minxerr.
mincp <- model[["cptable"]][minxerr, "CP"]
# Prune the fitted tree back to that complexity parameter.
model.prune <- prune(model, cp = mincp)

新模型

fancyRpartPlot(model.prune)

进行预测

# Predict the class of each test row with the pruned tree. The original call
# used the unpruned `model`, which made the pruning step above irrelevant to
# the predictions.
pred <- predict(model.prune, newdata = test, type = "class")

存为数据框

# Put the actual GrossCat2 values next to the predicted classes.
yucess <- data.frame(test$GrossCat2, pred)

对多值因变量GrossCat建立决策树模型

决策树模型

# Fit a tree for the multi-level response GrossCat on six predictors.
model <- rpart(
  formula = GrossCat ~ MPAA + competition + star + sequel + TechEffect +
    screens,
  data = learn
)

查看模型结果

summary(model)

决策树可视化  

rpart.plot(model)

列出对应规则

asRules(model)

使用fancyRpartPlot展现更美观的决策树

fancyRpartPlot(model) 

查看交叉验证结果

model$cptable 

查看交叉验证结果图

plotcp(model)   

根据交叉验证结果,找出估计误差最小时的cp值,并重新建立模型。

# "xerror" column of the cp table, one value per candidate tree size.
xerr <- model[["cptable"]][, "xerror"]
# Row of the cp table where that error is smallest.
minxerr <- which.min(xerr)

选择交叉验证的估计误差最小时对应的cp

# cp value stored in the row selected by minxerr.
mincp <- model[["cptable"]][minxerr, "CP"]
# Prune the fitted tree back to that complexity parameter.
model.prune <- prune(model, cp = mincp)

新模型

fancyRpartPlot(model.prune)

进行预测

# Predict the class of each test row with the pruned tree. The original call
# used the unpruned `model`, which made the pruning step above irrelevant to
# the predictions.
pred <- predict(model.prune, newdata = test, type = "class")

存为数据框

# Put the actual GrossCat values next to the predicted classes.
yucess <- data.frame(test$GrossCat, pred)

#

源代码:

library(magrittr)
library(dplyr)
# Raw copy of the learning data, used further down for the correlation
# analysis. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
insurance <- read.csv(file = "E:\\RHome\\movie_learning.csv", header = TRUE,
                      fileEncoding = "utf-8")

# Read the learning set: the first six columns as character, the last four
# as numeric, then convert the categorical columns to factors in one mutate().
learn <-
  read.csv("E:\\RHome\\movie_learning.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(
    MPAA = as.factor(MPAA),
    competition = as.factor(competition),
    star = as.factor(star),
    genre = as.factor(genre),
    TechEffect = as.factor(TechEffect),
    GrossCat = as.factor(GrossCat),
    GrossCat2 = as.factor(GrossCat2)
  )


# Read the test set with the same column classes as the learning set and
# convert the categorical columns to factors.
# TechEffect is converted here as well: the original omitted it, leaving the
# column as character in `test` while it is a factor in `learn`.
test <-
  read.csv("E:\\RHome\\movie_test.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(
    MPAA = as.factor(MPAA),
    competition = as.factor(competition),
    star = as.factor(star),
    genre = as.factor(genre),
    TechEffect = as.factor(TechEffect),
    GrossCat = as.factor(GrossCat),
    GrossCat2 = as.factor(GrossCat2)
  )


# Show the structure (column types and factor levels) of both data sets.
str(learn)
str(test)


# Frequency table of genre in each data set, to compare which values occur.
table(learn$genre)
table(test$genre)

# Give test$genre the same factor levels as learn$genre. The levels are taken
# from `learn` directly rather than typed out by hand, so the two data sets
# stay aligned even if the learning data changes.
test <- test %>%
  mutate(genre = factor(genre, levels = levels(learn$genre)))

library(rpart)
library(rpart.plot)
library(rattle)
#-------------------------------------------------------------------------
# Decision tree model for GrossCat2
# Step 1: choose predictors from a correlation matrix.
# Recode the labelled categorical columns of `insurance` as integer codes.
# match() returns the position of each value in the label vector (1, 2, ...),
# which is the same code the original element-by-element loops assigned.
# It is vectorised, works on empty input, and yields NA (instead of an error)
# for missing or unlisted values.
insurance$MPAA <- match(insurance$MPAA, c("G", "PG", "PG13", "R"))
insurance$competition <- match(
  insurance$competition,
  c("High", "Medium", "Low")
)
insurance$star <- match(insurance$star, c("A", "B", "C"))
insurance$TechEffect <- match(
  insurance$TechEffect,
  c("High", "Medium", "Low")
)

# Coerce every column to numeric. Columns holding non-numeric text become NA
# with a coercion warning (presumably movie and genre -- verify on the data).
insurance <- as.data.frame(lapply(insurance, as.numeric))

# Author's finding from this matrix: the two most highly correlated items
# are sequel and screens.
cor(insurance)
# Fit a tree for GrossCat2 using sequel and screens as predictors.
model <- rpart(formula = GrossCat2 ~ sequel + screens, data = learn)

# Inspect the fitted tree: text summary, plot, and rule listing.
summary(model)
rpart.plot(model)
asRules(model)

# Draw the same tree with fancyRpartPlot().
fancyRpartPlot(model)

# Complexity-parameter table and its plot.
model$cptable
plotcp(model)

# Take the cp value from the row with the smallest "xerror" and prune the
# tree back to it.
xerr <- model$cptable[, "xerror"]
minxerr <- which.min(xerr)
mincp <- model$cptable[minxerr, "CP"]
model.prune <- prune(model, cp = mincp)

# Plot the pruned tree.
fancyRpartPlot(model.prune)

# Predict the class of each test row with the pruned tree. The original call
# used the unpruned `model`, which made the pruning step above irrelevant to
# the predictions.
pred <- predict(model.prune, newdata = test, type = "class")

# Put the actual GrossCat2 values next to the predicted classes.
yucess <- data.frame(test$GrossCat2, pred)

#-------------------------------------------------------------------------
# Decision tree model for GrossCat

# Fit a tree for the multi-level response GrossCat on six predictors.
model <- rpart(
  formula = GrossCat ~ MPAA + competition + star + sequel + TechEffect +
    screens,
  data = learn
)

# Inspect the fitted tree: text summary, plot, and rule listing.
summary(model)
rpart.plot(model)
asRules(model)

# Draw the same tree with fancyRpartPlot().
fancyRpartPlot(model)

# Complexity-parameter table and its plot.
model$cptable
plotcp(model)

# Take the cp value from the row with the smallest "xerror" and prune the
# tree back to it.
xerr <- model$cptable[, "xerror"]
minxerr <- which.min(xerr)
mincp <- model$cptable[minxerr, "CP"]
model.prune <- prune(model, cp = mincp)

# Plot the pruned tree.
fancyRpartPlot(model.prune)

# Predict the class of each test row with the pruned tree. The original call
# used the unpruned `model`, which made the pruning step above irrelevant to
# the predictions.
pred <- predict(model.prune, newdata = test, type = "class")

# Put the actual GrossCat values next to the predicted classes.
yucess <- data.frame(test$GrossCat, pred)


猜你喜欢

转载自blog.csdn.net/weixin_45987577/article/details/124929887