Spark machine learning: operating principles, performance tuning, graph computing, storage and scheduling, monitoring and analysis — project in practice

I. Introduction
Designing machine-learning algorithms by hand requires a certain accumulation of knowledge.
By contrast, machine-learning libraries such as Spark 2.0 ML are designed so that little of that foundation is required: they work out of the box.
Starting from simple, complete, standardized examples is undoubtedly the best way to learn.

Previous articles (with concise examples):
Spark 2.0 ML machine learning library: feature extraction, transformation, and selection (Scala)
Spark 2.0 ML machine learning library: machine learning workflows and cross-validation (Scala)
Spark 2.0 ML machine learning library: data analysis methods (Scala)

II. The code
The following code comes from the Internet; it is very good and has been tidied up here.

1. Linear regression
package change

import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql._
import org.apache.spark.sql.SparkSession

/**
 * Linear Regression
 */
object linearTest {

def main(args: Array[String]): Unit = {

// 0. construct the Spark session
val spark = SparkSession
.builder()
.master("local") // local test; otherwise error: "A master URL must be set in your configuration" at org.apache.spark.SparkContext
.appName("test")
.enableHiveSupport()
.getOrCreate() // reuses an existing session, or creates one if none exists

spark.sparkContext.setCheckpointDir("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest") // directory for checkpoint reads/writes; HDFS is best
import spark.implicits._

// 1 prepare training samples
val training = spark.createDataFrame(Seq(
(5.601801561245534, Vectors.sparse(10, Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), Array(0.6949189734965766, -0.32697929564739403, -0.15359663581829275, -0.8951865090520432, 0.2057889391931318, -0.6676656789571533, -0.03553655732400762, 0.14550349954571096, 0.034600542078191854, 0.4223352065067103))),
(0.2577820163584905, Vectors.sparse(10, Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), Array(0.8386555657374337, -0.1270180511534269, 0.499812362510895, -0.22686625128130267, -0.6452430441812433, 0.18869982177936828, -0.5804648622673358, 0.651931743775642, -0.6555641246242951, 0.17485476357259122))),
(1.5299675726687754, Vectors.sparse(10, Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), Array(-0.13079299081883855, 0.0983382230287082, 0.15347083875928424, 0.45507300685816965, 0.1921083467305864, 0.6361110540492223, 0.7675261182370992, -0.2543488202081907, 0.2927051050236915, 0.680182444769418))))).toDF("label", "features")
training.show(false)

// 2 build a linear regression model
val lr = new LinearRegression()
.setMaxIter(100)
.setRegParam(0.1)
.setElasticNetParam(0.5)
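// Added note: with elasticNetParam = a and regParam = lambda, Spark ML's linear
// regression minimizes 1/(2n) * sum((w.x + b - y)^2) + lambda * (a*||w||_1 + (1-a)/2*||w||_2^2),
// so a = 0.5 above mixes the L1 (lasso) and L2 (ridge) penalties equally.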

// 2 train the model on the training samples
val lrModel = lr.fit(training)

// 2 print model information
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

/**
* Coefficients: [0.0,-0.8840148895400428,-4.451571521834594,-0.42090140779272434,0.857395634491616,-1.237347818637769,0.0,0.0,0.0,0.0] Intercept: 3.1417724655192645
*/

println(s"Intercept: ${lrModel.intercept}")

/**
* Intercept: 3.1417724655192645
*/

// 4 prepare test samples
val test = spark.createDataFrame(Seq(
(5.601801561245534, Vectors.sparse(10, Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), Array(0.6949189734965766, -0.32697929564739403, -0.15359663581829275, -0.8951865090520432, 0.2057889391931318, -0.6676656789571533, -0.03553655732400762, 0.14550349954571096, 0.034600542078191854, 0.4223352065067103))),
(0.2577820163584905, Vectors.sparse(10, Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), Array(0.8386555657374337, -0.1270180511534269, 0.499812362510895, -0.22686625128130267, -0.6452430441812433, 0.18869982177936828, -0.5804648622673358, 0.651931743775642, -0.6555641246242951, 0.17485476357259122))),
(1.5299675726687754, Vectors.sparse(10, Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), Array(-0.13079299081883855, 0.0983382230287082, 0.15347083875928424, 0.45507300685816965, 0.1921083467305864, 0.6361110540492223, 0.7675261182370992, -0.2543488202081907, 0.2927051050236915, 0.680182444769418))))).toDF("label", "features")
test.show(false)

/**
* +------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |label |features |
* +------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |5.601801561245534 |(10,[0,1,2,3,4,5,6,7,8,9],[0.6949189734965766,-0.32697929564739403,-0.15359663581829275,-0.8951865090520432,0.2057889391931318,-0.6676656789571533,-0.03553655732400762,0.14550349954571096,0.034600542078191854,0.4223352065067103])|
* |0.2577820163584905|(10,[0,1,2,3,4,5,6,7,8,9],[0.8386555657374337,-0.1270180511534269,0.499812362510895,-0.22686625128130267,-0.6452430441812433,0.18869982177936828,-0.5804648622673358,0.651931743775642,-0.6555641246242951,0.17485476357259122]) |
* |1.5299675726687754|(10,[0,1,2,3,4,5,6,7,8,9],[-0.13079299081883855,0.0983382230287082,0.15347083875928424,0.45507300685816965,0.1921083467305864,0.6361110540492223,0.7675261182370992,-0.2543488202081907,0.2927051050236915,0.680182444769418]) |
* +------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
*/

// 5 test the model on the test samples
val test_predict = lrModel.transform(test)
test_predict
.select("label", "prediction", "features")
.show(false)

/**
* +------------------+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |label |prediction |features |
* +------------------+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |5.601801561245534 |5.493935912726037 |(10,[0,1,2,3,4,5,6,7,8,9],[0.6949189734965766,-0.32697929564739403,-0.15359663581829275,-0.8951865090520432,0.2057889391931318,-0.6676656789571533,-0.03553655732400762,0.14550349954571096,0.034600542078191854,0.4223352065067103])|
* |0.2577820163584905|0.33788027718672575|(10,[0,1,2,3,4,5,6,7,8,9],[0.8386555657374337,-0.1270180511534269,0.499812362510895,-0.22686625128130267,-0.6452430441812433,0.18869982177936828,-0.5804648622673358,0.651931743775642,-0.6555641246242951,0.17485476357259122]) |
* |1.5299675726687754|1.557734960360036 |(10,[0,1,2,3,4,5,6,7,8,9],[-0.13079299081883855,0.0983382230287082,0.15347083875928424,0.45507300685816965,0.1921083467305864,0.6361110540492223,0.7675261182370992,-0.2543488202081907,0.2927051050236915,0.680182444769418]) |
* +------------------+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
*/

// 6 Model Summary
val trainingSummary = lrModel.summary

// 6 objective value of each iteration
val objectiveHistory = trainingSummary.objectiveHistory
println(s"numIterations: ${trainingSummary.totalIterations}")

/**
 * numIterations: 101
 */

println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")

/**
* objectiveHistory: [0.5,0.3343138710353481,0.03498698228406803,0.034394702527331365,0.03361752133820051,0.033440576313009396,0.032492999827506586,0.03209249818672103,0.03201118276878801,0.0318030335506653,0.031556141484809515,0.03146914334471842,0.03132368104987874,0.030906857778152226,0.030829631969772512,0.030792601096269995,0.03075807300477159,0.03064409361649658,0.03057645418974434,0.03048720940080922,0.030450452329432418,0.0303403006892938,0.03022336621283447,0.030105231797686347,0.03005248564337978,0.029952523828252434,0.029901762708870988,0.029901114112460842,0.029897992643680316,0.029897097909156505,0.029892358780083193,0.029890487541861296,0.029883508098656905,0.02986342331315129,0.029846157576330717,0.02983921669719768,0.029837621981381814,0.029832343881027193,0.029818011565517288,0.0298174329753425,0.029816619127868163,0.029815897918569062,0.029815813156609985,0.029815635355907394,0.029814914126549,0.029813735638819686,0.02981357400967502,0.02981340129452729,0.029813363218666296,0.029813104482615992,0.029813066188642295,0.02981290111924657,0.029812867201451012,0.029812730285385426,0.029812706953398726,0.02981259780704471,0.02981258478371474,0.02981249810105761,0.029812492058484363,0.029812414896583955,0.02981239284306545,0.02981217952516655,0.029812093354005524,0.029812078847204722,0.02981204606864486,0.029812029284085127,0.029812008170753846,0.029812001127453244,0.02981198610905457,0.029811978179336476,0.029811968590860403,0.029811960922339894,0.02981195510843637,0.029811951516538388,0.02981194560589678,0.029811931971338676,0.029811927559300986,0.02981192583464405,0.029811923533256,0.02981192147493291,0.029811919101372975,0.0298119178536648,0.029811915692737362,0.02981191417259256,0.029811912340872517,0.02981191111669305,0.02981190922210416,0.029811908328486812,0.029811906376022823,0.029811905682559023,0.02981190386743857,0.029811903165691635,0.02981190159751578,0.029811901021202986,0.02981189985181355,0.02981189892054736,0.029811897724408266,0.02981189698790617,0.02981189562974597,0.029811894938092554,0.029811894064851477]
*/

trainingSummary.residuals.show(false)

/**
* +---------------------+
* |residuals |
* +---------------------+
* |0.1078656485194962 |
* |-0.08009826082823523 |
* |-0.027767387691260526|
* +---------------------+
*/

println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")

/**
* RMSE: 0.07920807479341203
*/

println(s"r2: ${trainingSummary.r2}")

/**
 * r2: 0.998792363204057
 */

// 7 save and load the model (when deploying to a Django server, add the code below to the view file)
lrModel.save("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest\\lrmodel2")
val load_lrModel = LinearRegressionModel.load("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest\\lrmodel2")

}

}
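
A loaded model can be scored and evaluated just like the one it was saved from. The snippet below is an illustrative sketch, not part of the original post: it assumes the load_lrModel and test values defined above and uses spark.ml's RegressionEvaluator to recompute RMSE on the predictions.

import org.apache.spark.ml.evaluation.RegressionEvaluator

// score the test DataFrame with the loaded model
val loaded_predict = load_lrModel.transform(test)

// RegressionEvaluator compares the label column against the prediction column
val evaluator = new RegressionEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("rmse")
println(s"RMSE on test data: ${evaluator.evaluate(loaded_predict)}")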


2. Logistic regression
package change

import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession

object logicTest {

def main(args: Array[String]): Unit = {

// 0. construct the Spark session
val spark = SparkSession
.builder()
.master("local") // local test; otherwise error: "A master URL must be set in your configuration" at org.apache.spark.SparkContext
.appName("test")
.enableHiveSupport()
.getOrCreate() // reuses an existing session, or creates one if none exists

spark.sparkContext.setCheckpointDir("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest") // directory for checkpoint reads/writes; HDFS is best
import spark.implicits._

// 1 prepare training samples
val training = spark.createDataFrame(Seq(
(1.0, Vectors.sparse(692, Array(10, 20, 30), Array(-1.0, 1.5, 1.3))),
(0.0, Vectors.sparse(692, Array(45, 175, 500), Array(-1.0, 1.5, 1.3))),
(1.0, Vectors.sparse(692, Array(100, 200, 300), Array(-1.0, 1.5, 1.3))))).toDF("label", "features")
training.show(false)

/**
* +-----+----------------------------------+
* |label|features |
* +-----+----------------------------------+
* |1.0 |(692,[10,20,30],[-1.0,1.5,1.3]) |
* |0.0 |(692,[45,175,500],[-1.0,1.5,1.3]) |
* |1.0 |(692,[100,200,300],[-1.0,1.5,1.3])|
* +-----+----------------------------------+
*/

// 2 build a logistic regression model
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

// 2 train the model on the training samples
val lrModel = lr.fit(training)

// 2 print model information
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

/**
* Coefficients: (692,[45,175,500],[0.48944928041408226,-0.32629952027605463,-0.37649944647237077]) Intercept: 1.251662793530725
*/

println(s"Intercept: ${lrModel.intercept}")

/**
* Intercept: 1.251662793530725
*/

// 3 build a multinomial logistic regression model
val mlr = new LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8).setFamily("multinomial")
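// Added note: family = "multinomial" fits one coefficient vector per class (a
// coefficientMatrix plus an interceptVector), whereas the binomial fit above
// produces a single coefficient vector and a scalar intercept.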

// 3 train the model on the training samples
val mlrModel = mlr.fit(training)

// 3 print model information
println(s"Multinomial coefficients: ${mlrModel.coefficientMatrix}")

/**
* Multinomial coefficients: 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... (692 total)
* 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ...
*/

println(s"Multinomial intercepts: ${mlrModel.interceptVector}")

/**
* Multinomial intercepts: [-0.6449310568167714,0.6449310568167714]
*/

// 4 prepare test samples
val test = spark.createDataFrame(Seq(
(1.0, Vectors.sparse(692, Array(10, 20, 30), Array(-1.0, 1.5, 1.3))),
(0.0, Vectors.sparse(692, Array(45, 175, 500), Array(-1.0, 1.5, 1.3))),
(1.0, Vectors.sparse(692, Array(100, 200, 300), Array(-1.0, 1.5, 1.3))))).toDF("label", "features")
test.show(false)

/**
* +-----+----------------------------------+
* |label|features |
* +-----+----------------------------------+
* |1.0 |(692,[10,20,30],[-1.0,1.5,1.3]) |
* |0.0 |(692,[45,175,500],[-1.0,1.5,1.3]) |
* |1.0 |(692,[100,200,300],[-1.0,1.5,1.3])|
* +-----+----------------------------------+
*/

// 5 test the model on the test samples
val test_predict = lrModel.transform(test)
test_predict
.select("label", "prediction", "probability", "features")
.show(false)

/**
* +-----+----------+----------------------------------------+----------------------------------+
* |label|prediction|probability |features |
* +-----+----------+----------------------------------------+----------------------------------+
* |1.0 |1.0 |[0.22241243403014824,0.7775875659698517]|(692,[10,20,30],[-1.0,1.5,1.3]) |
* |0.0 |0.0 |[0.5539602964649871,0.44603970353501293]|(692,[45,175,500],[-1.0,1.5,1.3]) |
* |1.0 |1.0 |[0.22241243403014824,0.7775875659698517]|(692,[100,200,300],[-1.0,1.5,1.3])|
* +-----+----------+----------------------------------------+----------------------------------+
*/

// 6 Model Summary
val trainingSummary = lrModel.summary

// 6 objective value of each iteration
val objectiveHistory = trainingSummary.objectiveHistory
println("objectiveHistory:")
objectiveHistory.foreach(loss => println(loss))

/**
* objectiveHistory:
* 0.6365141682948128
* 0.6212055977633174
* 0.5894552698389314
* 0.5844805633573479
* 0.5761098112571359
* 0.575517297029231
* 0.5754098875805627
* 0.5752562156795122
* 0.5752506337221737
* 0.5752406742715199
* 0.5752404945106846
*/

// 6 compute model metrics
val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary]

// 6 ROC / AUC metrics
val roc = binarySummary.roc
roc.show(false)

/**
* +---+---+
* |FPR|TPR|
* +---+---+
* |0.0|0.0|
* |0.0|1.0|
* |1.0|1.0|
* |1.0|1.0|
* +---+---+
*/

val AUC = binarySummary.areaUnderROC
println(s"areaUnderROC: ${binarySummary.areaUnderROC}")

// 6 set the model threshold
// different thresholds yield different F1 scores; find the maximum F1 and use its threshold as the model's optimal threshold
val fMeasure = binarySummary.fMeasureByThreshold
fMeasure.show(false)

/**
* +-------------------+---------+
* |threshold |F-Measure|
* +-------------------+---------+
* |0.7775875659698517 |1.0 |
* |0.44603970353501293|0.8 |
* +-------------------+---------+
*/

// get the maximum F1 value
val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
// find the threshold corresponding to the maximum F1 (the optimal threshold)
val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).select("threshold").head().getDouble(0)
// set the model's classification threshold to the selected optimal threshold
lrModel.setThreshold(bestThreshold)

// 7 save and load the model (when deploying to a Django server, add the code below to the view file)
lrModel.save("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest\\lrmodel")
val load_lrModel = LogisticRegressionModel.load("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest\\lrmodel")

}

}
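
In practice, regParam and elasticNetParam are usually chosen by cross-validation rather than fixed by hand (see the earlier workflow/cross-validation article). The sketch below is an illustrative addition, not part of the original post; it assumes a realistically sized training DataFrame like the one above (the three-row toy sample is too small for meaningful folds).

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

val lr = new LogisticRegression().setMaxIter(10)

// grid of candidate hyperparameter values
val paramGrid = new ParamGridBuilder()
.addGrid(lr.regParam, Array(0.01, 0.1, 0.3))
.addGrid(lr.elasticNetParam, Array(0.0, 0.5, 0.8))
.build()

// 3-fold cross-validation scored by area under the ROC curve
val cv = new CrossValidator()
.setEstimator(lr)
.setEvaluator(new BinaryClassificationEvaluator().setMetricName("areaUnderROC"))
.setEstimatorParamMaps(paramGrid)
.setNumFolds(3)

val cvModel = cv.fit(training)
println(s"Best params: ${cvModel.bestModel.extractParamMap()}")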
(Figure: the saved model files)


3. Decision trees, random forests, and GBDT
These algorithms are relatively easy to understand.

package tree

import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.classification.{ DecisionTreeClassifier, DecisionTreeClassificationModel }
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.evaluation.{ MulticlassClassificationEvaluator, BinaryClassificationEvaluator }
import org.apache.spark.ml.{ Pipeline, PipelineModel }
import org.apache.spark.sql.SparkSession

object tree {

def main(args: Array[String]): Unit = {

// 0. construct the Spark session
val spark = SparkSession
.builder()
.master("local") // local test; otherwise error: "A master URL must be set in your configuration" at org.apache.spark.SparkContext
.appName("test")
.enableHiveSupport()
.getOrCreate() // reuses an existing session, or creates one if none exists

spark.sparkContext.setCheckpointDir("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest") // directory for checkpoint reads/writes; HDFS is best
import spark.implicits._

// 1 prepare training samples
val data = spark.read.format("libsvm").load("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest\\sample_libsvm_data.txt")
data.show

// 2 index the label column
val labelIndexer = new StringIndexer().
setInputCol("label").
setOutputCol("indexedLabel").
fit(data)

// 2 index categorical features; used to decide which features are treated as discrete
// a feature with more than 4 distinct values is treated as continuous; otherwise it is marked as categorical and index-encoded
val featureIndexer = new VectorIndexer().
setInputCol("features").
setOutputCol("indexedFeatures").
setMaxCategories(4).
fit(data)
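// Added illustration: with setMaxCategories(4), a column whose values are, say,
// {0.0, 1.0, 2.0} (3 distinct values <= 4) is re-encoded as category indices,
// while a column with hundreds of distinct values passes through as continuous.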

// 3 split the samples into training and test sets
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

// 4 train a decision tree model
val dt = new DecisionTreeClassifier().
setLabelCol("indexedLabel").
setFeaturesCol("indexedFeatures")

// 4 train a random forest model
val rf = new RandomForestClassifier()
.setLabelCol("indexedLabel")
.setFeaturesCol("indexedFeatures")
.setNumTrees(10)

// 4 train a GBDT model
val gbt = new GBTClassifier()
.setLabelCol("indexedLabel")
.setFeaturesCol("indexedFeatures")
.setMaxIter(10)

// 5 convert label indices back to the original labels
val labelConverter = new IndexToString().
setInputCol("prediction").
setOutputCol("predictedLabel").
setLabels(labelIndexer.labels)

// 6 build the Pipelines
val pipeline1 = new Pipeline().
setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))

val pipeline2 = new Pipeline().
setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))

val pipeline3 = new Pipeline().
setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter))

// 7 train the Pipelines
val model1 = pipeline1.fit(trainingData)

val model2 = pipeline2.fit(trainingData)

val model3 = pipeline3.fit(trainingData)

// 8 test the model
val predictions = model1.transform(testData)
predictions.show(5)

// 8 test results
predictions.select("predictedLabel", "label", "features").show(5)

// 9 classification metrics
// accuracy
val evaluator1 = new MulticlassClassificationEvaluator().
setLabelCol("indexedLabel").
setPredictionCol("prediction").
setMetricName("accuracy")
val accuracy = evaluator1.evaluate(predictions)
println("Test Error = " + (1.0 - accuracy))

// f1
val evaluator2 = new MulticlassClassificationEvaluator().
setLabelCol("indexedLabel").
setPredictionCol("prediction").
setMetricName("f1")
val f1 = evaluator2.evaluate(predictions)
println("f1 = " + f1)

// Precision
val evaluator3 = new MulticlassClassificationEvaluator().
setLabelCol("indexedLabel").
setPredictionCol("prediction").
setMetricName("weightedPrecision")
val Precision = evaluator3.evaluate(predictions)
println("Precision = " + Precision)

// Recall
val evaluator4 = new MulticlassClassificationEvaluator().
setLabelCol("indexedLabel").
setPredictionCol("prediction").
setMetricName("weightedRecall")
val Recall = evaluator4.evaluate(predictions)
println("Recall = " + Recall)

// AUC
val evaluator5 = new BinaryClassificationEvaluator().
setLabelCol("indexedLabel").
setRawPredictionCol("prediction").
setMetricName("areaUnderROC")
val AUC = evaluator5.evaluate(predictions)
println("Test AUC = " + AUC)

// aupr
val evaluator6 = new BinaryClassificationEvaluator().
setLabelCol("indexedLabel").
setRawPredictionCol("prediction").
setMetricName("areaUnderPR")
val aupr = evaluator6.evaluate(predictions)
println("Test aupr = " + aupr)

// 10 print the decision tree
val treeModel = model1.stages(2).asInstanceOf[DecisionTreeClassificationModel]
println("Learned classification tree model:\n" + treeModel.toDebugString)

// 11 save and load the model
model1.save("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest\\dtmodel")
val load_treeModel = PipelineModel.load("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest\\dtmodel")

}

}
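
Tree ensembles also expose per-feature importance scores, which the original post does not print. The sketch below is an illustrative addition: it assumes the model2 random-forest pipeline trained above, whose stages are labelIndexer, featureIndexer, rf, labelConverter.

import org.apache.spark.ml.classification.RandomForestClassificationModel

// stage 2 of the fitted pipeline is the trained random forest
val rfModel = model2.stages(2).asInstanceOf[RandomForestClassificationModel]

// a vector with one importance score per feature, summing to 1.0
println(s"Feature importances: ${rfModel.featureImportances}")

// print the ten most important features with their indices
rfModel.featureImportances.toArray.zipWithIndex
.sortBy { case (importance, _) => -importance }
.take(10)
.foreach { case (importance, index) => println(s"feature $index -> $importance") }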
4. KMeans
A clustering algorithm, superficially a bit like KNN.

package juhe

import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.sql.SparkSession

object Kmeans {

def main(args: Array[String]): Unit = {

// 0. construct the Spark session
val spark = SparkSession
.builder()
.master("local") // local test; otherwise error: "A master URL must be set in your configuration" at org.apache.spark.SparkContext
.appName("test")
.enableHiveSupport()
.getOrCreate() // reuses an existing session, or creates one if none exists

spark.sparkContext.setCheckpointDir("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest") // directory for checkpoint reads/writes; HDFS is best
import spark.implicits._

// read samples
val dataset = spark.read.format("libsvm").load("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest\\sample_kmeans_data.txt")
dataset.show()

// train a k-means model
val kmeans = new KMeans().setK(2).setSeed(1L)
val model = kmeans.fit(dataset)

// compute model metrics
val WSSSE = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $WSSSE")

// display the results
println("Cluster Centers:")
model.clusterCenters.foreach(println)

// save and load the model
model.save("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest\\kmmodel")
val load_treeModel = KMeansModel.load("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\sparkmlTest\\kmmodel")
spark.stop()


}

}
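
The choice of k = 2 above is arbitrary. A common heuristic the original post does not show is the "elbow" method: train a model for several values of k and look for where the WSSSE curve flattens out. A minimal sketch, assuming the dataset loaded above and the Spark 2.x computeCost API (later Spark versions use ClusteringEvaluator instead):

// train one model per candidate k and record its within-set sum of squared errors
val costs = Seq(2, 3, 4, 5, 6).map { k =>
val m = new KMeans().setK(k).setSeed(1L).fit(dataset)
(k, m.computeCost(dataset))
}

// a reasonable k is the one after which the cost stops dropping sharply
costs.foreach { case (k, wssse) => println(s"k = $k -> WSSSE = $wssse") }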
5. LDA
Can be used for topic classification.

package juhe

import org.apache.spark.ml.clustering.{LDA, LDAModel}
import org.apache.spark.sql.SparkSession

object ldaTest {

def main(args: Array[String]): Unit = {

// 0. construct the Spark session
val spark = SparkSession
.builder()
.master("local") // local test; otherwise error: "A master URL must be set in your configuration" at org.apache.spark.SparkContext
.appName("test")
.enableHiveSupport()
.getOrCreate() // reuses an existing session, or creates one if none exists

// 1. read samples
val dataset = spark.read.format("libsvm").load("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\data.txt")
dataset.show()

// 2. train an LDA model
val lda = new LDA().setK(10).setMaxIter(10)
val model = lda.fit(dataset)

val ll = model.logLikelihood(dataset)
val lp = model.logPerplexity(dataset)
println(s"The lower bound on the log likelihood of the entire corpus: $ll")
println(s"The upper bound on perplexity: $lp")

// 3. describe the topics
val topics = model.describeTopics(3)
println("The topics described by their top-weighted terms:")
topics.show(false)

val aa = model.topicsMatrix
model.estimatedDocConcentration
model.getTopicConcentration

// 4. test results
val transformed = model.transform(dataset)
transformed.show(false)
transformed.columns

// 5. save the model
model.save("C:\\LLLLLLLLLLLLLLLLLLL\\BigData_AI\\ldamodel")

spark.stop()

}

}
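
describeTopics returns term indices rather than words. In a real pipeline the documents are usually vectorized first, and the vocabulary from that step maps indices back to terms. The sketch below is an illustrative addition, not from the original post: docs is a hypothetical DataFrame with a words column of tokenized strings.

import org.apache.spark.ml.feature.CountVectorizer

// vectorize the tokenized documents; the fitted model keeps the vocabulary
val cvModel = new CountVectorizer()
.setInputCol("words") // hypothetical column of Seq[String] tokens
.setOutputCol("features")
.fit(docs)

val corpus = cvModel.transform(docs)
val ldaModel = new LDA().setK(10).setMaxIter(10).fit(corpus)

// map each topic's top term indices back to vocabulary words
val vocab = cvModel.vocabulary
ldaModel.describeTopics(3).collect().foreach { row =>
val topic = row.getInt(0)
val termIndices = row.getSeq[Int](1)
println(s"topic $topic: ${termIndices.map(vocab(_)).mkString(", ")}")
}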


Source: www.cnblogs.com/spark88/p/11226957.html