Version 1:
import org.apache.spark.SparkContext
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.regression.LabeledPoint
/**
 * Created by shaokai on 14-9-12.
 */
object LinearRegression {
  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "BinaryClassification", "/Users/software/spark-0.9.0-incubating-bin-hadoop1")
    val data = sc.textFile("/Users/workspace/chinahadoop/data/ridge-data/lpsa.data")

    // Parse each line "label,feature1 feature2 ..." into a LabeledPoint
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, parts(1).split(' ').map(x => x.toDouble).toArray)
    }

    // Build the model
    val numIterations = 20
    val model = LinearRegressionWithSGD.train(parsedData, numIterations)

    // Make predictions on the training data
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }

    // Compute the training mean squared error (MSE)
    val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.reduce(_ + _) / valuesAndPreds.count()
    println("training Mean Squared Error = " + MSE)
  }
}
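Note: Version 1 targets Spark 0.9, where LabeledPoint still accepts a plain Array[Double]. On Spark 1.0+ the MLlib LabeledPoint constructor takes an org.apache.spark.mllib.linalg.Vector, so the parsing step has to wrap the features with Vectors.dense. A minimal sketch of the adapted parsing and training, with the step size 0.1 being only an assumed starting value, not a tuned setting:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}

// Same "label,feature1 feature2 ..." lines, but features wrapped in an MLlib Vector
val parsedData = data.map { line =>
  val parts = line.split(',')
  LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
}.cache()

// train(input, numIterations, stepSize) exposes the SGD learning rate; 0.1 is an illustrative choice
val model = LinearRegressionWithSGD.train(parsedData, 20, 0.1)

// The MSE can also be computed with mean() instead of reduce/count
val MSE = parsedData.map(p => math.pow(p.label - model.predict(p.features), 2)).mean()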
Version 2:
package RegressionMetrics
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object RegressionMetricsExample {
  def main(args: Array[String]): Unit = {
    // Configuration: setMaster("local") runs the program locally
    val conf = new SparkConf().setAppName("RegressionMetricsExample").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc) // created but not used in this example

    // Load the data: (label, [feature1, feature2, ...]); data is an RDD[LabeledPoint]
    val data = MLUtils.loadLibSVMFile(sc, "F:/HDFSinputfile/sample_linear_regression_data.txt").cache()

    // Build the model (SGD: stochastic gradient descent)
    val numIterations = 100
    val model = LinearRegressionWithSGD.train(data, numIterations)

    // Get predictions as (prediction, label) pairs; valuesAndPreds is an RDD
    val valuesAndPreds = data.map { point =>
      val prediction = model.predict(point.features)
      (prediction, point.label)
    }
    println(valuesAndPreds.getClass) // class org.apache.spark.rdd.MapPartitionsRDD
    println("value and predict")
    //valuesAndPreds.foreach(println) // prints pairs such as (1.1470019382890901,-9.490009878824548)

    // Instantiate the metrics object
    val metrics = new RegressionMetrics(valuesAndPreds)

    // Model parameters (weights and intercept)
    println("Model Parameter")
    var i = 1
    model.weights.toArray.foreach { a =>
      println("Parameter" + i + ":" + a)
      i += 1
    }
    println("model intercept:" + model.intercept)

    // Squared error
    println("MSE = " + metrics.meanSquaredError) // mean squared error
    println(s"RMSE = ${metrics.rootMeanSquaredError}") // root mean squared error, printed with string interpolation
    // R-squared
    println(s"R-squared = ${metrics.r2}")
    // Mean absolute error
    println(s"MAE = ${metrics.meanAbsoluteError}")
    // Explained variance
    println(s"Explained variance = ${metrics.explainedVariance}")

    sc.stop()
  }
}
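MLUtils.loadLibSVMFile in Version 2 expects the LIBSVM text format: one example per line, written as label index1:value1 index2:value2 ... with 1-based feature indices. As a sanity check on RegressionMetrics, the same error measures can be recomputed by hand from the (prediction, label) pairs. A minimal sketch, assuming it is pasted inside main above sc.stop() so that valuesAndPreds and metrics are in scope; the sample input line uses made-up values:

// A LIBSVM-formatted input line looks like (values here are made up):
//   -9.49 1:0.46 2:-1.23 3:0.87

// Manual cross-check of the metrics on the (prediction, label) RDD
val manualMSE = valuesAndPreds.map { case (p, v) => (p - v) * (p - v) }.mean()
val manualMAE = valuesAndPreds.map { case (p, v) => math.abs(p - v) }.mean()
println(s"manual MSE = $manualMSE, metrics MSE = ${metrics.meanSquaredError}")
println(s"manual MAE = $manualMAE, metrics MAE = ${metrics.meanAbsoluteError}")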