package mllib;
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.regression.{ LabeledPoint, LinearRegressionWithSGD }
import org.apache.spark.sql.{SparkSession,DataFrame,SQLContext}
import org.apache.spark.sql.Row
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
/**
 * Fits a linear regression model (L1/L2 mix controlled by the elastic-net
 * parameter) on a LIBSVM-format data set and prints the fitted model together
 * with its training summary.
 */
object App {

  /** Location of the LIBSVM-format training data read by [[main]]. */
  private val TrainingDataPath = "/home/hadoop/mllibdata/kimi_svm.txt"

  /**
   * Loads the training data, fits the regression and prints the coefficients,
   * intercept, parameter map and training-summary metrics to stdout.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Build the SparkSession directly: it owns the underlying SparkContext, so
    // creating a SparkContext/SQLContext first is unnecessary and only makes
    // the builder's settings be ignored ("Using an existing SparkSession").
    val spark = SparkSession.builder()
      .master("local")
      .appName("kimiYang")
      .getOrCreate()

    try {
      val data = spark.read.format("libsvm").load(TrainingDataPath)
      data.show()

      // Linear regression settings: fit an intercept, cap the optimiser at 10
      // iterations, regularisation strength 0.3, elastic-net mixing 0.8.
      val lr = new LinearRegression()
        .setFitIntercept(true)
        .setMaxIter(10)
        .setRegParam(0.3)
        .setElasticNetParam(0.8)

      // Train the model on the loaded data set.
      val model = lr.fit(data)

      // Print every parameter of the fitted model (the value was previously
      // computed and thrown away).
      println(model.extractParamMap())
      println(s"Coefficients: ${model.coefficients} Intercept: ${model.intercept}")

      // Evaluate the model on the training data.
      val trainingSummary = model.summary
      println(s"numIterations: ${trainingSummary.totalIterations}")
      println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
      trainingSummary.residuals.show()
      // RMSE: root mean squared error.
      println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
      // r2: coefficient of determination (goodness of fit); closer to 1 is better.
      println(s"r2: ${trainingSummary.r2}")
      trainingSummary.predictions.show()
    } finally {
      // Always release Spark resources, even when loading or fitting fails.
      spark.stop()
    }
  }
}
Console output:
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
18/10/22 11:03:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
18/10/22 11:03:19 WARN Utils: Your hostname, dblab-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.4 instead (on interface enp0s3)
18/10/22 11:03:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
18/10/22 11:03:22 WARN SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
+-----+-------------------+
|label| features|
+-----+-------------------+
| 5.0|(2,[0,1],[1.0,1.0])|
| 7.0|(2,[0,1],[2.0,1.0])|
| 9.0|(2,[0,1],[3.0,2.0])|
| 11.0|(2,[0,1],[4.0,1.0])|
| 19.0|(2,[0,1],[5.0,3.0])|
| 18.0|(2,[0,1],[6.0,2.0])|
+-----+-------------------+
18/10/22 11:03:27 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
18/10/22 11:03:27 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
Coefficients: [2.1834836708105665,2.0791699982781844] Intercept: 0.392523821699377
numIterations: 6
objectiveHistory: List(0.5, 0.3905266284749149, 0.07485179411449853, 0.07261563579181778, 0.06909771042678862, 0.06909771037948713)
+-------------------+
| residuals|
+-------------------+
| 0.3448225092118724|
|0.16133883840130547|
|-2.1013148306874463|
|-0.2056285032198275|
| 1.4525478294132341|
|0.34823415688085646|
+-------------------+
RMSE: 1.0672317857734335
r2: 0.9592005844334871
+-----+-------------------+------------------+
|label| features| prediction|
+-----+-------------------+------------------+
| 5.0|(2,[0,1],[1.0,1.0])| 4.655177490788128|
| 7.0|(2,[0,1],[2.0,1.0])|6.8386611615986945|
| 9.0|(2,[0,1],[3.0,2.0])|11.101314830687446|
| 11.0|(2,[0,1],[4.0,1.0])|11.205628503219828|
| 19.0|(2,[0,1],[5.0,3.0])|17.547452170586766|
| 18.0|(2,[0,1],[6.0,2.0])|17.651765843119144|
+-----+-------------------+------------------+