Spark MLlib Algorithm Examples: Study Notes

Support Vector Machine (SVM):

package com.spark.milib

import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object SVMTest {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("svm").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // Load the sample data, stored in LIBSVM format
    //val data = sparkSession.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    val data: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    // Split the data into training and test sets
    //val splits: Array[Dataset[Row]] = data.randomSplit(Array(0.6,0.4),11L)
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    //val training: Dataset[Row] = splits(0).cache()
    val test = splits(1)
    // Create and train the SVM model
    val numIterations = 100 // number of iterations
    val model = SVMWithSGD.train(training, numIterations)
    // Score the test samples
    val predictionAndLabel = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }
    val print_predict = predictionAndLabel.take(20)
    // Print the results
    println("prediction" + "\t" + "label")
    for (i <- print_predict.indices) {
      println(print_predict(i)._1 + "\t" + print_predict(i)._2)
    }
    // Compute the fraction of correct predictions (accuracy)
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    println("Accuracy = " + accuracy)
    // Save the model
    val ModelPath = "/home/hadoop/test/svm/svm_model"
    model.save(sc, ModelPath)
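    // Added sketch (not in the original post), continuing main and using the
    // variables defined above: BinaryClassificationMetrics and SVMModel are
    // imported but never used, and this is one way they could be applied.
    // clearThreshold() makes predict() return raw margins, which gives a
    // meaningful ROC curve instead of hard 0/1 labels.
    model.clearThreshold()
    val scoreAndLabels = test.map { point =>
      (model.predict(point.features), point.label)
    }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    println("Area under ROC = " + metrics.areaUnderROC())
    // The saved model can be reloaded with the companion object.
    val sameModel = SVMModel.load(sc, ModelPath)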
  }
}

Decision Tree:

package com.spark.milib
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.sql.SparkSession


/**
  * Decision Tree
  */
object StringIndexerExample {

  def main(args: Array[String]): Unit = {

    val sparkSession: SparkSession = SparkSession.builder().appName("test").master("local[4]").getOrCreate()
    // Load the data stored in LIBSVM format as a DataFrame.
    val data = sparkSession.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    // Index labels, adding metadata to the label column.
    // Fit on whole dataset to include all labels in index.
    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)
    // Automatically identify categorical features, and index them.
    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
      .fit(data)

    // Split the data into training and test sets (30% held out for testing).
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // Train a DecisionTree model.
    val dt = new DecisionTreeClassifier()
      .setLabelCol("indexedLabel")
      .setFeaturesCol("indexedFeatures")

    // Convert indexed labels back to original labels.
    val labelConverter = new IndexToString()
      .setInputCol("prediction")
      .setOutputCol("predictedLabel")
      .setLabels(labelIndexer.labels)

    // Chain indexers and tree in a Pipeline.
    val pipeline = new Pipeline()
      .setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))

    // Train model. This also runs the indexers.
    val model = pipeline.fit(trainingData)

    // Make predictions.
    val predictions = model.transform(testData)

    // Select example rows to display.
    predictions.select("predictedLabel", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("indexedLabel")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println("Test Error = " + (1.0 - accuracy))

    val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel]
    println("Learned classification tree model:\n" + treeModel.toDebugString)
  }
}
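
The DecisionTreeClassifier above is trained with its default hyperparameters. A hedged sketch of setting the commonly adjusted ones; the values are illustrative only (not tuned for this data), and the snippet would replace the dt definition inside main:

    val tunedDt = new DecisionTreeClassifier()
      .setLabelCol("indexedLabel")
      .setFeaturesCol("indexedFeatures")
      .setImpurity("entropy") // "gini" (default) or "entropy"
      .setMaxDepth(8)         // deeper trees capture more structure but may overfit
      .setMaxBins(64)         // bins used to discretize continuous features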

Random Forest:

package com.spark.milib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils

object RandomForestClassificationExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("RandomForestClassificationExample").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // Load and parse the data file.
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a RandomForest model.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 3 // Use more in practice.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
    val impurity = "gini"
    val maxDepth = 4
    val maxBins = 32

    val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)
    println("Learned classification forest model:\n" + model.toDebugString)

    // Save and load model
    model.save(sc, "target/tmp/myRandomForestClassificationModel")
    val sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
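    // Added sketch (not in the original post): the reloaded model behaves
    // exactly like the original one, e.g. it can score the first test point.
    println("Reloaded model prediction: " + sameModel.predict(testData.first().features))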
  }
}

Naive Bayes:

package com.spark.milib

import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession

/**
  * Naive Bayes
  */
object NaiveBayesExample {

  def main(args: Array[String]): Unit = {

    val sparkSession: SparkSession = SparkSession.builder().master("local[4]").appName("test").getOrCreate()

    // Load the data stored in LIBSVM format as a DataFrame.
    val data = sparkSession.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    // Split the data into training and test sets (30% held out for testing)
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)

    // Train a NaiveBayes model.
    val model = new NaiveBayes()
      .fit(trainingData)

    // Select example rows to display.
    val predictions = model.transform(testData)
    predictions.show()

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println("Test set accuracy = " + accuracy)
  }
}
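
The estimator above relies on its defaults. A minimal sketch, continuing main, of setting two commonly adjusted parameters (the values shown are the library defaults and purely illustrative):

    val nb = new NaiveBayes()
      .setSmoothing(1.0)            // additive (Laplace) smoothing
      .setModelType("multinomial")  // "multinomial" or "bernoulli"
    val tunedModel = nb.fit(trainingData)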

Linear Regression:

package com.spark.milib

import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.SparkSession

/**
  * Linear Regression
  */
object LinearRegressionExample {

  def main(args: Array[String]): Unit = {

    val sparkSession: SparkSession = SparkSession.builder().appName("test").master("local[4]").getOrCreate()

    // Load training data
    val training = sparkSession.read.format("libsvm")
      .load("data/mllib/sample_linear_regression_data.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    println(s"r2: ${trainingSummary.r2}")
  }
}
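
The example stops at the training summary. As a sketch of actually using the fitted model, transform adds a "prediction" column to any DataFrame with a "features" column; continuing main, the training set is reused here purely for illustration, in practice a held-out set would be scored:

    val predicted = lrModel.transform(training)
    predicted.select("features", "label", "prediction").show(5)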

Logistic Regression:

package com.spark.milib

import org.apache.spark.sql.SparkSession

/**
  * Logistic Regression
  */
object CorrelationExample {

  def main(args: Array[String]): Unit = {

    val sparkSession: SparkSession = SparkSession.builder().master("local[4]").appName("test").getOrCreate()

    import org.apache.spark.ml.classification.LogisticRegression

    // Load training data
    val training = sparkSession.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for logistic regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // We can also use the multinomial family for binary classification
    val mlr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)
      .setFamily("multinomial")

    val mlrModel = mlr.fit(training)

    // Print the coefficients and intercepts for logistic regression with multinomial family
    println(s"Multinomial coefficients: ${mlrModel.coefficientMatrix}")
    println(s"Multinomial intercepts: ${mlrModel.interceptVector}")
  }
}
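
The binomial model above keeps the default decision threshold of 0.5. A hedged sketch, continuing main, of lowering it and scoring the data (the threshold value is illustrative only, and the training set is reused here just for demonstration):

    lrModel.setThreshold(0.4) // below 0.5 favours the positive class
    val scored = lrModel.transform(training)
    scored.select("label", "probability", "prediction").show(5)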

K-means Clustering:

package com.spark.milib
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.SparkSession

/**
  * K-means clustering
  */
object KMeansExample {

  def main(args: Array[String]): Unit = {

    val sparkSession: SparkSession = SparkSession.builder().master("local[4]").appName("test").getOrCreate()

    // Loads data.
    val dataset = sparkSession.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    // Trains a k-means model.
    val kmeans = new KMeans().setK(2).setSeed(1L)
    val model = kmeans.fit(dataset)

    // Evaluate clustering by computing Within Set Sum of Squared Errors.
    val WSSSE = model.computeCost(dataset)
    println(s"Within Set Sum of Squared Errors = $WSSSE")

    // Shows the result.
    println("Cluster Centers: ")
    model.clusterCenters.foreach(println)
  }
}
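
The cluster centers alone say little about how individual rows are grouped. A minimal sketch, continuing main, of assigning each row to a cluster with transform, which adds a "prediction" column:

    val assigned = model.transform(dataset)
    assigned.select("features", "prediction").show(5)

Note that in more recent Spark releases computeCost is deprecated in favour of the silhouette metric in ClusteringEvaluator.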

LDA

package com.spark.milib
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.sql.SparkSession

/**
  * LDA
  */
object LDAExample {

  def main(args: Array[String]): Unit = {

    val sparkSession: SparkSession = SparkSession.builder().appName("test").master("local[4]").getOrCreate()

    // Loads data.
    val dataset = sparkSession.read.format("libsvm")
      .load("data/mllib/sample_lda_libsvm_data.txt")

    // Trains a LDA model.
    val lda = new LDA().setK(10).setMaxIter(10)
    val model = lda.fit(dataset)

    val ll = model.logLikelihood(dataset)
    val lp = model.logPerplexity(dataset)
    println(s"The lower bound on the log likelihood of the entire corpus: $ll")
    println(s"The upper bound bound on perplexity: $lp")

    // Describe topics.
    val topics = model.describeTopics(3)
    println("The topics described by their top-weighted terms:")
    topics.show(false)

    // Shows the result.
    val transformed = model.transform(dataset)
    transformed.show(false)
  }
}
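
The LDA above uses the default optimizer. A hedged sketch, continuing main and purely for illustration, of choosing the optimizer explicitly; "online" is the default, while "em" yields a distributed model:

    val emLda = new LDA().setK(10).setMaxIter(10).setOptimizer("em")
    // The EM optimizer returns a DistributedLDAModel rather than a LocalLDAModel.
    val emModel = emLda.fit(dataset)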

Note:

All of the data files used above are located in the data directory of the Spark source distribution.


Reposted from blog.csdn.net/weixin_44455388/article/details/107342835