R programming for feature selection and regression

  • Data introduction
  • Select packages
  • Split dataset
  • Feature selection
  • Tune parameters
  • Prediction

1. Data introduction

My dataset contains just over 1,500 samples with 92 attributes. The prediction targets include a classification problem (gender) and a regression problem (age).
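
A quick sanity-check sketch for the raw file (this uses the pers_desk.csv path and the column layout that appear in data_splitting below; adjust the path to your machine):

df <- read.csv("C:/Users/ntu/Downloads/LEARNING/personality/pers_desk.csv", header = TRUE, sep = ",")
dim(df)                    # expect 1500+ rows; columns 4-95 hold the 92 features
names(df)[c(2, 3, 96:100)] # the seven label columns predicted later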

2. Select packages

library(ggplot2)      # data visualization
library(randomForest) # random forest models
library(ROCR)         # visualizing classifier performance in R
library(caret)        # confusionMatrix, posPredValue, sensitivity
library("nnet")
# library(rpart)
# library("rfUtilities") # for cross validation
library("openxlsx")   # write results to Excel

3. Data splitting

data_splitting <- function(number_of_predictors, prediction_name){
	set.seed(123)
	# get the indices of the selected predictors
	if (number_of_predictors < 92){
		set.seed(100)
		variables <- read.csv(file.path(paste0("D:/Program Files/R/R-3.5.1/bin/","thr_",prediction_name,".csv")), header = TRUE, sep = ",")
		variables <- variables[,"thres"]
		var_vec <- as.vector(variables)
		# varselect.thres indices are relative to the feature matrix;
		# +3 shifts them to column positions in the full CSV (features start at column 4)
		vars_inds <- var_vec[1:number_of_predictors] + 3
	}else{
		vars_inds <- seq(4, 95)
	}
	# select the data for the chosen predictors
	data_frame <- read.csv(file.path("C:/Users/ntu/Downloads/LEARNING/personality/pers_desk.csv"), header = TRUE, sep = ",")
	features <- data_frame[,vars_inds]
	# the label columns to be predicted
	lab <- data_frame[c(2,3,96,97,98,99,100)]

	# split into training and test sets, roughly 70% training
	sample.ind <- sample(2, nrow(features), replace = TRUE, prob = c(0.7, 0.3))
	train <- features[sample.ind == 1,]
	test <- features[sample.ind == 2,]
	train_label <- lab[sample.ind == 1,]
	test_label <- lab[sample.ind == 2,]

	return (list(train = train, train_label = train_label, test = test, test_label = test_label))
}
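
A hypothetical call, assuming the two CSV files referenced above are in place:

d <- data_splitting(number_of_predictors = 92, prediction_name = "all")
dim(d$train)        # roughly 70% of the rows, 92 feature columns
dim(d$test)         # the remaining ~30%
head(d$train_label) # the seven label columns for the training rows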

4. Feature selection via VSURF

feature_selection <- function(label_name, x, y){

	library("VSURF")

	# mtry is passed through to the underlying randomForest calls
	vsurf <- VSURF(x = x, y = y, mtry = 100)
	print(summary(vsurf))
	#plot(vsurf, step = "thres", imp.sd = FALSE, var.names = TRUE)
	imp_dec <- vsurf$imp.mean.dec
	imp_ind <- vsurf$imp.mean.dec.ind
	thres <- vsurf$varselect.thres
	interp <- vsurf$varselect.interp
	pred <- vsurf$varselect.pred
	thr_df <- data.frame(thres)
	int_df <- data.frame(interp)
	prd_df <- data.frame(pred)
	imp_df <- data.frame(imp_ind, imp_dec)
	# persist the thresholding-step variables so data_splitting can reuse them
	write.csv(thr_df, paste0("thr_", label_name, ".csv"))
	print(length(thres))
	print(length(interp))
	print(length(pred))
	# return the candidate numbers of predictors: all 92, then the sizes of the
	# thresholding, interpretation and prediction variable sets
	return (c(92, length(thres), length(interp), length(pred)))
}

4.1 A brief introduction to VSURF

VSURF partitions the attributes into three nested sets: threshold, interpretation, and prediction. vsurf$varselect.thres returns the indices of the thresholding-step variables, so length(vsurf$varselect.thres) is their count. In general the prediction set is the smallest and the threshold set the largest. The package describes the procedure as follows: "Three steps variable selection procedure based on random forests. First step is dedicated to eliminate irrelevant variables from the dataset. Second step aims to select all variables related to the response for interpretation purpose. Third step refines the selection by eliminating redundancy in the set of variables selected by the second step, for prediction purpose." In other words, VSURF is still built on random forest variable importance, but on top of that it uses the regression or classification performance of variable subsets to grade the attributes into three levels. The selection is therefore label-dependent: it runs random forests to obtain importance measures and analyzes model performance to decide which attributes to keep.
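
A minimal sketch of inspecting these slots, using the built-in iris data purely for illustration (it is not this post's dataset):

library(VSURF)
set.seed(1)
v <- VSURF(x = iris[, 1:4], y = iris[, 5])
v$varselect.thres          # variables kept after the thresholding step
v$varselect.interp         # variables kept after the interpretation step
v$varselect.pred           # variables kept after the prediction step
length(v$varselect.thres)  # number of thresholding variables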

4.2 The VSURF steps explained

  • Step 1 (thresholding step): first, nfor.thres random forests are computed with the function randomForest using importance = TRUE and the default values of ntree and mtry. The attributes are then sorted in decreasing order of mean variable importance (VI). Next, a threshold min.thres is computed as the minimum predicted value of a pruned CART tree fitted to the curve of the standard deviations of VI. Finally, the actual thresholding is performed: only variables whose mean VI exceeds nmin * min.thres are retained.
  • Step 2 (interpretation step): the variables selected in step 1 are considered. nfor.interp nested random forest models are grown, starting with the most important variable only and adding variables until all those kept in step 1 are included. Then err.min, the minimum mean out-of-bag (OOB) error of these models, and its associated standard deviation sd.min are computed. Finally, the smallest model (and its corresponding variables) whose mean OOB error is below err.min + nsd * sd.min is selected.
  • Step 3 (prediction step): the starting point is the same as in step 2, but now variables are added to the model stepwise. mean.jump, the mean jump value, is computed from the variables left out by step 2, and is defined as the mean absolute difference between the mean OOB error of one model and that of its immediate successor. A variable is included only if the decrease in mean OOB error exceeds nmj * mean.jump. The outcome of each step can be visualized with the package's plot method, as shown in the sketch below.
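
A hedged plotting sketch (the first call mirrors the commented-out line in feature_selection; vsurf is a fitted VSURF object):

plot(vsurf, step = "thres", imp.sd = FALSE, var.names = TRUE) # mean VI and the threshold
plot(vsurf, step = "interp", var.names = TRUE)                # OOB error of the nested models
plot(vsurf, step = "pred", var.names = TRUE)                  # stepwise variable inclusion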

5. Tune parameters

For each prediction target, pick the ntree value with the best performance and use it for the final fitted model.

5.1 Regression
choose_best_rf_reg <- function(train, test, train_label, test_label){

	testing_mse_arr <- c()
	training_mse_arr <- c()
	variance_explained <- c()
	correlation <- c()
	num_trees <- c()
	for (n_trees in seq(10, 510, 20)){
		i <- (n_trees + 10) / 20
		# fit a random forest model
		rf <- randomForest(x = train, y = train_label, ntree = n_trees, importance = TRUE)
		# predict the test data with the model fitted on the training data
		predicted <- predict(rf, test)
		# test MSE
		testing_mse_arr[i] <- mean((predicted - test_label)^2)
		# lowest OOB MSE across the trees of this forest
		training_mse_arr[i] <- min(rf$mse)
		# percentage of variance explained
		variance_explained[i] <- max(100 * rf$rsq)
		# correlation between predictions and actual test labels
		correlation[i] <- cor(predicted, test_label)
		num_trees[i] <- n_trees
	}
	metrics <- data.frame(num_trees = num_trees, training_mse = training_mse_arr, variance_explained = variance_explained, testing_mse = testing_mse_arr, correlation = correlation)
	# map the index of the best test MSE back to its ntree value
	n <- which.min(testing_mse_arr) * 20 - 10
	print(n)
	rf <- randomForest(x = train, y = train_label, ntree = n, importance = TRUE)
	imp <- importance(rf)
	importances <- data.frame(att_names = rownames(imp), imp = imp)
	write.csv(importances, "imp.csv") # base write.csv; write_csv would need the readr package
	return (list(met = metrics, imp = importances))
}
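
A hypothetical usage sketch, assuming the age label column is literally named age (the real column names are not shown in this post):

d <- data_splitting(92, "all")
res <- choose_best_rf_reg(d$train, d$test, d$train_label[,"age"], d$test_label[,"age"])
res$met # one row of metrics per candidate ntree
res$imp # importances from the best forest
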
5.2 Classification
choose_best_rf_clf <- function(train, test, train_label, test_label){

	accuracy <- c()
	precision <- c()
	recall <- c()
	F1 <- c()
	for (n_trees in seq(10, 510, 20)){
		i <- (n_trees + 10) / 20
		# fit a random forest classifier (the label must be a factor)
		rf <- randomForest(x = train, y = as.factor(train_label), ntree = n_trees, importance = TRUE)
		# predict the test data with the model fitted on the training data
		predicted <- predict(rf, test)
		cm <- confusionMatrix(data = predicted, factor(test_label, levels = 1:2))
		acc <- cm$overall['Accuracy']
		p <- posPredValue(predicted, as.factor(test_label), positive = "1")
		r <- sensitivity(predicted, as.factor(test_label), positive = "1")
		f <- (2 * p * r) / (p + r)
		accuracy[i] <- acc
		precision[i] <- p
		recall[i] <- r
		F1[i] <- f
	}
	metrics <- data.frame(Accuracy = accuracy, P = precision, R = recall, F1 = F1)
	# map the index of the best F1 back to its ntree value
	n_trees <- which.max(F1) * 20 - 10
	print(n_trees)
	rf <- randomForest(x = train, y = as.factor(train_label), ntree = n_trees, importance = TRUE)
	importances <- importance(rf)
	write.csv(metrics, "gender_metrics.csv")
	return (list(met = metrics, imp = importances))
}
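
And the matching classification call for gender (assuming, as in main below, that the label column is literally named gender):

d <- data_splitting(92, "gender")
res <- choose_best_rf_clf(d$train, d$test, d$train_label[,"gender"], d$test_label[,"gender"])
res$met # accuracy, precision, recall and F1 per candidate ntree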

6. Prediction

main <- function(prediction_name){
	splitted_data4_fs = data_splitting(number_of_predictors = 92, "all")
	splitted_data4_fs$train_label = splitted_data4_fs$train_label[,prediction_name]
	print(paste("selecting feature for", prediction_name))
	selected_numbers_list = feature_selection(prediction_name, splitted_data4_fs$train, splitted_data4_fs$train_label)
	#selected_numbers_list = c(92,37,6,6)
	print("predicting")
	wb <- createWorkbook()
	sheet_names = c("all","thre","inte","pred","all_imp","thre_imp","inte_imp","pred_imp")
	for (name in sheet_names){
		addWorksheet(wb, name)
	}
	i = 0
	for (num_of_predictors in selected_numbers_list){
		i = i + 1
		print(i)
		data = data_splitting(num_of_predictors, prediction_name)
		data$train_label = data$train_label[,prediction_name]
		data$test_label = data$test_label[,prediction_name]
		# gender is a classification target; the other targets are regression
		if (prediction_name == "gender"){
			met_and_imp <- choose_best_rf_clf(data$train, data$test, data$train_label, data$test_label)
		}else{
			met_and_imp <- choose_best_rf_reg(data$train, data$test, data$train_label, data$test_label)
		}
		writeData(wb, sheet_names[i], met_and_imp$met, rowNames = TRUE, colNames = TRUE)
		writeData(wb, sheet_names[i+4], met_and_imp$imp, rowNames = TRUE, colNames = TRUE)
	}
	saveWorkbook(wb, paste0(prediction_name, "_metrics_and_imp.xlsx"), overwrite = TRUE)
}

main("gender")

Note: openxlsx is recommended for writing the results: first create a workbook, then add each named sheet to it (at this point the workbook is empty), then write the data into the sheets, and finally save.

wb <- createWorkbook() # create
addWorksheet(wb, "all") # add sheet
addWorksheet(wb, "thre")
addWorksheet(wb, "inte")
addWorksheet(wb, "pred")
writeData(wb, "all", df_arr, rowNames = TRUE, colNames = TRUE) # write data
saveWorkbook(wb, paste0(prediction_name,"_metrics_and_imp.xlsx"), overwrite = TRUE) # save

Reposted from blog.csdn.net/u014449866/article/details/85230428