Spark MLlib 逻辑回归

逻辑回归

逻辑回归其实是一个分类算法而不是回归算法。通常是利用已知的自变量来预测一个离散型因变量的值（像二进制值0/1，是/否，真/假）。简单来说，它就是通过拟合一个逻辑函数（logit function）来预测一个事件发生的概率。所以它预测的是一个概率值，自然，它的输出值应该在0到1之间。
假设你的一个朋友让你回答一道题。可能的结果只有两种：你答对了或没有答对。为了研究你最擅长的题目领域，你做了各种领域的题目。那么这个研究的结果可能是这样的：如果是一道十年级的三角函数题，你有70%的可能性能解出它。但如果是一道五年级的历史题，你会的概率可能只有30%。逻辑回归就是给你这样的概率结果。

Logistic回归简单分析

优点：计算代价不高，易于理解和实现
缺点：容易欠拟合，分类精度可能不高
适用数据类型：数值型和标称型数据

package com.immooc.spark

import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}

/**
 * RDD-based (spark.mllib) logistic regression example.
 *
 * Loads a LIBSVM file, splits it 60/40 into training and test sets, trains a
 * model with L-BFGS, prints up to 20 (prediction, label) pairs, scores one
 * hand-written feature vector, and prints the accuracy on the test split.
 */
object logistic_regression {

  // Input file used when no path is given on the command line.
  private val DefaultDataPath = "file:///Users/walle/Documents/D3/sparkmlib/wa.txt"

  /**
   * Program entry point.
   *
   * @param args optional; when present, args(0) is the path of the LIBSVM
   *             input file. Without arguments the built-in default path is used.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LogisticRegressionWithLBFGSExample").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Stop the context even when loading, training or evaluation fails.
    try {
      val dataPath = args.headOption.getOrElse(DefaultDataPath)

      // Load training data in LIBSVM format.
      val data = MLUtils.loadLibSVMFile(sc, dataPath)

      // Split data into training (60%) and test (40%); fixed seed for repeatable runs.
      val Array(trainingSplit, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L)
      val training = trainingSplit.cache()

      // Run training algorithm to build the model.
      // NOTE(review): 10 classes is kept from the original code; the sample data
      // shown with this program only uses labels 0 and 1 - confirm whether 2 was intended.
      val model = new LogisticRegressionWithLBFGS()
        .setNumClasses(10)
        .run(training)

      // Predict a class label for every test point and pair it with the true label.
      val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
        (model.predict(features), label)
      }

      // Show at most 20 (prediction, label) pairs.
      println("prediction\tlabel")
      predictionAndLabels.take(20).foreach { case (predicted, label) =>
        println(s"$predicted\t$label")
      }

      // Score a single hand-written feature vector.
      val patient = Vectors.dense(70.0, 3.0, 180.0, 4.0, 3.0)
      val prediction = model.predict(patient)
      println(prediction)

      // Get evaluation metrics.
      val metrics = new MulticlassMetrics(predictionAndLabels)
      val accuracy = metrics.accuracy
      println(s"Accuracy = $accuracy")

      // Save and load model
      //    model.save(sc, "target/tmp/scalaLogisticRegressionWithLBFGSModel")
      //    val sameModel = LogisticRegressionModel.load(sc,
      //      "target/tmp/scalaLogisticRegressionWithLBFGSModel")
    } finally {
      sc.stop()
    }
  }
}

0 1:59 2:2 3:43.4 4:2 5:1
0 1:36 2:1 3:57.2 4:1 5:1
0 1:61 2:2 3:190 4:2 5:1
1 1:58 2:3 3:128 4:4 5:3
1 1:55 2:3 3:80 4:3 5:4
0 1:61 2:1 3:94 4:4 5:2
0 1:38 2:1 3:76 4:1 5:1
0 1:42 2:1 3:240 4:3 5:2
0 1:50 2:1 3:74 4:1 5:1
0 1:58 2:2 3:68.6 4:2 5:2
0 1:68 2:3 3:132.8 4:4 5:2
1 1:25 2:2 3:94.6 4:4 5:3
0 1:52 2:1 3:56 4:1 5:1
0 1:31 2:1 3:47.8 4:2 5:1
1 1:36 2:3 3:31.6 4:3 5:1
0 1:42 2:1 3:66.2 4:2 5:1
1 1:14 2:3 3:138.6 4:3 5:3
0 1:32 2:1 3:114 4:2 5:3
0 1:35 2:1 3:40.2 4:2 5:1
1 1:70 2:3 3:177.2 4:4 5:3
1 1:65 2:2 3:51.6 4:4 5:4
0 1:45 2:2 3:124 4:2 5:4
1 1:68 2:3 3:127.2 4:3 5:3
0 1:31 2:2 3:124.8 4:2 5:3

输出

  prediction	label
0.0	0.0
0.0	1.0
0.0	0.0
0.0	0.0
1.0	1.0
0.0	0.0
1.0	1.0
0.0	0.0
0.0	1.0
0.0	0.0
0.0	1.0
0.0	0.0
0.0
Accuracy = 0.75

dataframe 版

package com.immooc.spark

import org.apache.spark
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.apache.spark.ml.param.ParamMap

/**
 * DataFrame-based (spark.ml) logistic regression example.
 *
 * Builds small in-memory training and test DataFrames, fits one model with
 * parameters set on the estimator and a second one with parameters supplied
 * through a ParamMap, then prints the second model's predictions on the test set.
 */
object LogisticRegression {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LogisticRegression dataframe").setMaster("local[2]")
    // SparkSession replaces the deprecated SQLContext constructor.
    val spark = SparkSession.builder().config(conf).getOrCreate()

    // Stop the session (and its SparkContext) even when fitting or scoring fails.
    try {
      // Prepare the training set.
      val training = spark.createDataFrame(Seq(
        (1.0, Vectors.dense(0.0, 1.1, 0.1)),
        (0.0, Vectors.dense(2.0, 1.0, -1.0)),
        (0.0, Vectors.dense(2.0, 1.3, 1.0)),
        (1.0, Vectors.dense(0.0, 1.2, -0.5))
      )).toDF("label", "features")

      // Prepare the test set.
      val test = spark.createDataFrame(Seq(
        (1.0, Vectors.dense(-1.0, 1.5, 1.3)),
        (0.0, Vectors.dense(3.0, 2.0, -0.1)),
        (1.0, Vectors.dense(0.0, 2.2, -1.5))
      )).toDF("label", "features")

      // Create the estimator, print its parameters, then set some of them.
      val lr = new LogisticRegression()
      println(s"LogisticRegression parameters:\n${lr.explainParams()}\n")
      lr.setMaxIter(10).setRegParam(0.01)

      // Fit model1 and print the parameters it was fit with.
      val model1 = lr.fit(training)
      println("Model 1 was fit using parameters: " + model1.parent.extractParamMap)

      // Specify parameters through a ParamMap; the second put overwrites maxIter.
      val paramMap = ParamMap(lr.maxIter -> 20)
        .put(lr.maxIter, 30)
        .put(lr.regParam -> 0.1, lr.threshold -> 0.55)
      // Two ParamMaps can be combined.
      val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability")
      val paramMapCombined = paramMap ++ paramMap2

      // Fit model2 with the combined ParamMap.
      val model2 = lr.fit(training, paramMapCombined)
      println("Model 2 was fit using parameters: " + model2.parent.extractParamMap)

      // Score the test set with model2 and print one line per row.
      model2.transform(test)
        .select("features", "label", "myProbability", "prediction")
        .collect()
        .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
          println(s"($features, $label) -> prob=$prob,prediction=$prediction")
        }
    } finally {
      spark.stop()
    }
  }
}

Spark mllib 逻辑回归

逻辑回归

输出

dataframe 版

猜你喜欢