1. rdd 例子
package com.immooc import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} object LinearRegressionTest { def main(args:Array[String]): Unit ={ val conf = new SparkConf().setAppName("LinearRegressionWithSGD").setMaster("local[2]") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val data_path1 = "file:///Users/walle/Documents/D3/sparkmlib/lpsa.data" val data = sc.textFile(data_path1) val examples = data.map{ line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val numExamples = examples.count() val numIterations = 100 val stepSize = 1 val miniBatchFraction = 1.0 val algorithm = new LinearRegressionWithSGD() algorithm.optimizer .setNumIterations(numIterations) .setStepSize(stepSize) // val model = LinearRegressionWithSGD.train(examples, numIterations, stepSize, miniBatchFraction) val model = algorithm.run(examples) model.weights model.intercept val prediction = model.predict(examples.map(_.features)) val predictionAndLabel = prediction.zip(examples.map(_.label)) val print_predict = predictionAndLabel.take(20) println("prediction" + "\t" + "label") for (i <- 0 to print_predict.length - 1) { println(print_predict(i)._1 + "\t" + print_predict(i)._2) } } }
2. DataFrame 例子
package com.immooc.spark import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.{SparkConf, SparkContext} object SparkLinearRegression { def main(args: Array[String]): Unit = { //设置环境 val conf=new SparkConf().setAppName("tianchi").setMaster("local") val sc=new SparkContext(conf) val sqc=new SQLContext(sc) //准备训练集合 val raw_data=sc.textFile("file:///Users/walle/Documents/D3/sparkmlib/LR_data") val map_data=raw_data.map{x=> val split_list=x.split(",") (split_list(0).toDouble,split_list(1).toDouble,split_list(2).toDouble,split_list(3).toDouble,split_list(4).toDouble,split_list(5).toDouble,split_list(6).toDouble,split_list(7).toDouble) } val df=sqc.createDataFrame(map_data) val data = df.toDF("Population", "Income", "Illiteracy", "LifeExp", "Murder", "HSGrad", "Frost", "Area") val colArray = Array("Population", "Income", "Illiteracy", "LifeExp", "HSGrad", "Frost", "Area") val assembler = new VectorAssembler().setInputCols(colArray).setOutputCol("features") val vecDF: DataFrame = assembler.transform(data) //准备预测集合 val raw_data_predict=sc.textFile("file:///Users/walle/Documents/D3/sparkmlib/LR_data_for_predict") val map_data_for_predict=raw_data_predict.map{x=> val split_list=x.split(",") (split_list(0).toDouble,split_list(1).toDouble,split_list(2).toDouble,split_list(3).toDouble,split_list(4).toDouble,split_list(5).toDouble,split_list(6).toDouble,split_list(7).toDouble) } val df_for_predict=sqc.createDataFrame(map_data_for_predict) val data_for_predict = df_for_predict.toDF("Population", "Income", "Illiteracy", "LifeExp", "Murder", "HSGrad", "Frost", "Area") val colArray_for_predict = Array("Population", "Income", "Illiteracy", "LifeExp", "HSGrad", "Frost", "Area") val assembler_for_predict = new VectorAssembler().setInputCols(colArray_for_predict).setOutputCol("features") val vecDF_for_predict: DataFrame = assembler_for_predict.transform(data_for_predict) // 建立模型,预测谋杀率Murder // 设置线性回归参数 val lr1 = new LinearRegression() val lr2 = lr1.setFeaturesCol("features").setLabelCol("Murder").setFitIntercept(true) // RegParam:正则化 val lr3 = lr2.setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8) val lr = lr3 // 将训练集合代入模型进行训练 val lrModel = lr.fit(vecDF) // 输出模型全部参数 lrModel.extractParamMap() // Print the coefficients and intercept for linear regression println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}") // 模型进行评价 val trainingSummary = lrModel.summary println(s"numIterations: ${trainingSummary.totalIterations}") println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}") trainingSummary.residuals.show() println(s"RMSE: ${trainingSummary.rootMeanSquaredError}") println(s"r2: ${trainingSummary.r2}") val predictions: DataFrame = lrModel.transform(vecDF_for_predict) // val predictions = lrModel.transform(vecDF) println("输出预测结果") val predict_result: DataFrame =predictions.selectExpr("features","Murder", "round(prediction,1) as prediction") predict_result.foreach(println(_)) sc.stop() } }
http://www.waitingfy.com/archives/4641