R语言没有现成的计算连续值信息增益的包。连续值的信息增益需要在相邻的不同取值之间逐个尝试切分点,找到使信息增益最大的最优切分点,用R循环来计算会非常慢。所以此次使用了Rcpp来辅助计算:函数的第一个参数是特征列,第二个参数是标签列(标签等于1的记为一类,其余取值记为另一类);特征列的值需要先排序,标签列要按同样的顺序排列,并且两列长度必须一致。以下是程序。
# Rcpp supplies cppFunction(); attach it explicitly so the script runs in a
# fresh R session.
library(Rcpp)

# inforGain(x, y): largest information gain obtainable by one binary split of
# a continuous feature.
#   x : numeric feature values, already sorted so that equal values are
#       adjacent; y must be in the same row order as x.
#   y : numeric labels; entries equal to 1 form one class, every other value
#       forms the second class.
# Returns the maximum of (entropy before the split - weighted entropy after
# the split) over all boundaries between two different consecutive x values.
# Returns 0 when no boundary improves on the unsplit entropy, which includes
# empty input and a constant x.
# Raises an R error when x and y differ in length.
cppFunction(
  includes = '
// Entropy in bits of a two-class sample holding `pos` members of one class
// out of `total`. A pure or empty sample has zero entropy; handling that case
// on the integer counts avoids evaluating log2(0).
static double binaryEntropy(int pos, int total) {
  if (pos <= 0 || pos >= total) {
    return 0.0;
  }
  const double p = static_cast<double>(pos) / total;
  return -(p * log2(p) + (1.0 - p) * log2(1.0 - p));
}
',
  code = '
double inforGain(NumericVector x, NumericVector y) {
  const int n = x.size();
  // Guard against reading past the end of y.
  if (y.size() != n) {
    stop("x and y must have the same length");
  }

  // Count the labels equal to 1 over the whole sample.
  int totalPos = 0;
  for (int i = 0; i < n; i++) {
    if (y[i] == 1) {
      totalPos++;
    }
  }
  const double entropyBefore = binaryEntropy(totalPos, n);

  double bestGain = 0.0;
  int leftPos = 0;  // labels equal to 1 among x[0..i]
  for (int i = 0; i + 1 < n; i++) {
    if (y[i] == 1) {
      leftPos++;
    }
    // A cut is only possible between two different feature values.
    if (x[i] == x[i + 1]) {
      continue;
    }
    const int leftSize = i + 1;
    const int rightSize = n - leftSize;
    const double entropyAfter =
        binaryEntropy(leftPos, leftSize) * leftSize / n +
        binaryEntropy(totalPos - leftPos, rightSize) * rightSize / n;
    const double gain = entropyBefore - entropyAfter;
    if (gain > bestGain) {
      bestGain = gain;
    }
  }
  return bestGain;
}
')

# Example: the best cut lies between 18 and 21.
x <- c(8, 18, 18, 21, 24, 25, 28)
y <- c(0, 1, 0, 0, 0, 0, 0)
inforGain(x, y)
[1] 0.1981174