In data mining, essentially every algorithm requires its input data as a two-dimensional table of double values.
1. If a variable has only two possible values, one of them is encoded as 0 and the other as 1.
2. Other categorical variables are encoded as index:value pairs (one-hot encoding): every distinct value of a variable is given its own dimension.
// Collect every distinct (feature ID, feature value) pair in train_cat_rdd and number them.
// oheMap: scala.collection.Map[(Int, String),Long] = Map((7,608511e9) -> 31527, (7,b2d8fbed) -> 42207,
//   (7,1d3e2fdb) -> 52791
// Left as a `var` in case code outside this section reassigns it.
var oheMap = train_cat_rdd.flatMap(x => x).distinct().zipWithIndex().collectAsMap()
println("Number of features")
println(oheMap.size)

// Peek at a few entries of the mapping.
oheMap.take(10).foreach(println)
// Sample output:
// 45790
// ((0,8907c166),32600)
// ((6,7b177be1),28570)
// ((7,ae5eeb59),23866)
// ((7,2be70f8c),41143)
// ((7,105627d8),14562)
// ((7,060acc61),21043)
// ((7,a3234c93),7884)
// ((7,1d3e2fdb),34934)
// ((7,b2d8fbed),17166)
// ((7,608511e9), 44647)

// Layout of every feature vector:
//   slots [0, doubleSizeIndex.length)          -> numerical features, in order
//   slots [doubleSizeIndex.length, parsesize)  -> one slot per entry of oheMap
// Slots reserved for the numerical features.
val doubleSizeIndex = Array(0, 1, 2, 3, 4, 5, 6, 7)
// Not needed by the training encoder below any more (each categorical entry carries its own
// 1.0); retained unchanged in case other code in this file references it.
val demeansionValue = Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)
// Total vector size: one slot per one-hot feature plus the numerical slots (oheMap.size + 8).
val parsesize = oheMap.size + doubleSizeIndex.length

// Create OHE for train data.
//
// Fixes relative to the previous version:
//   * Categorical indices are shifted past the numerical slots. zipWithIndex starts at 0, so
//     unshifted indices collided with the numerical slots 0..7 and never reached the last
//     slots of the vector.
//   * Categories missing from oheMap are skipped (all-zero encoding) instead of being written
//     to slot 0, which is a numerical slot.
//   * The (index, value) overload of Vectors.sparse is used, so the indices do not have to be
//     pre-sorted by hand.
// NOTE(review): any other code that encodes rows with oheMap (e.g. for a test set) must apply
// the same offset so that train and test vectors share one layout.
val ohe_train_rdd = train_rdd.map { case (key, cateorical_features, numerical_features) =>
  val cat_features_indexed = parseCatFeatures(cateorical_features)

  // One (slot, 1.0) entry per categorical feature known to oheMap.
  val cat_entries = cat_features_indexed.toSeq.flatMap { k =>
    oheMap.get(k).map(idx => (idx.toInt + doubleSizeIndex.length, 1.0))
  }

  // Negative numerical values are clamped to 0.
  val numerical_features_dbl = numerical_features.map { x =>
    if (x.toInt < 0) 0.0 else x.toDouble
  }
  require(
    numerical_features_dbl.length == doubleSizeIndex.length,
    s"expected ${doubleSizeIndex.length} numerical features but got ${numerical_features_dbl.length}"
  )
  val numerical_entries = doubleSizeIndex.zip(numerical_features_dbl).toSeq

  // Create a sparse vector from the combined entries.
  val vs = Vectors.sparse(parsesize, numerical_entries ++ cat_entries)

  // The label is the second "::"-separated field of the key.
  // Alternative that keeps the vector sparse: LabeledPoint(key.split("::")(1).toInt, vs)
  LabeledPoint(key.split("::")(1).toInt, vs.toDense)
}
ohe_train_rdd.cache()
ohe_train_rdd.take(10).foreach(println)