Spark (5): MLlib

1. Characteristics of the Spark machine learning library

[Estimator]
An Estimator runs on a DataFrame containing features and labels, training on the data to produce a model.
The model is then used for later predictions.

[Transformer]
A Transformer turns a DataFrame containing features into a DataFrame that also contains predictions. A model produced by an Estimator is a Transformer.

[Parameter]
Parameters are the settings used by Estimators and Transformers, usually specific to the underlying machine learning algorithm. The Spark API exposes a uniform parameter API across algorithms.

[Pipeline]
A Pipeline chains Estimators and Transformers together into a machine learning workflow.
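Below is a minimal sketch (not from the original post; the toy data and column names are illustrative) of how these four concepts fit together; section 2.3 later uses the same pattern on a spam-filtering task:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object PipelineConceptDemo {
  def main(args: Array[String]): Unit = {
    val sess = SparkSession.builder().appName("concepts").master("local[4]").getOrCreate()
    // A tiny labeled DataFrame: free text plus a 0/1 label.
    val trainingDF = sess.createDataFrame(Seq(
      ("spark is great", 1.0),
      ("who are you", 0.0)
    )).toDF("text", "label")

    // Tokenizer and HashingTF are Transformers; LogisticRegression is an Estimator.
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
    val lr = new LogisticRegression().setMaxIter(10) // Parameters are set via setters.

    // A Pipeline is itself an Estimator: fit() runs the stages in order and returns
    // a PipelineModel, which is a Transformer that can be used for prediction.
    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
    val model = pipeline.fit(trainingDF)   // Estimator -> model (a Transformer)
    model.transform(trainingDF).show()     // The Transformer adds a "prediction" column
  }
}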

Steps of a machine learning application (section 2.1 below walks through them):
1. Read the data file into a training DataFrame.
2. Create a model object (e.g. LinearRegression) and set its parameters.
3. Fit the model to the training data, completing the estimation stage.
4. Create a DataFrame of test data, typically containing both features and labels; comparing the predicted labels against the test labels confirms the model is sound.
5. Apply the model to transform the test data and extract the feature, label, and prediction columns.

2. Examples

Add the MLlib dependency to pom.xml:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-mllib_2.11</artifactId>
    <version>2.1.0</version>
</dependency>

2.1 Predicting white wine quality with a linear regression model

/**
  * Created by Administrator on 2017/4/8.
  */
// Predict wine quality with linear regression

import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{Row, SparkSession}

object SparkMLDemo1 {
  def main(args: Array[String]): Unit = {
    val sess = SparkSession.builder().appName("ml").master("local[4]").getOrCreate();
    val sc = sess.sparkContext;
    /*
    // Data file location
    val dataDir = "file:///D:/downloads/bigdata/ml/winequality-white.csv"
    // Case class describing one wine record
    case class Wine(FixedAcidity: Double, VolatileAcidity: Double,
                    CitricAcid: Double, ResidualSugar: Double, Chlorides: Double,
                    FreeSulfurDioxide: Double, TotalSulfurDioxide: Double, Density: Double, PH:
                    Double, Sulphates: Double, Alcohol: Double, Quality: Double)

    // Parse each semicolon-separated line into a Wine record
    val wineDataRDD = sc.textFile(dataDir).map(_.split(";")).map(w => Wine(w(0).toDouble, w(1).toDouble,
      w(2).toDouble, w(3).toDouble, w(4).toDouble, w(5).toDouble, w(6).toDouble, w(7).toDouble, w(8).toDouble,
      w(9).toDouble, w(10).toDouble, w(11).toDouble))

    import sess.implicits._

    // Convert the RDD into a DataFrame of (label, features)
    val trainingDF = wineDataRDD.map(w => (w.Quality,
      Vectors.dense(w.FixedAcidity, w.VolatileAcidity, w.CitricAcid,
        w.ResidualSugar, w.Chlorides, w.FreeSulfurDioxide, w.TotalSulfurDioxide,
        w.Density, w.PH, w.Sulphates, w.Alcohol))).toDF("label", "features")
    // Show the training data
    trainingDF.show()
    println("======================")

    // Create the linear regression estimator
    val lr = new LinearRegression()
    // Set the maximum number of iterations
    lr.setMaxIter(50)
    // Fit the training data to produce a model
    val model = lr.fit(trainingDF)
    // Save the model (i.e. its parameters) to disk
    model.save("file:///d:/scala/model");
    */

    // Load the model (once the commented-out code above has been run and the model saved to disk, later runs only need to load it)
    val model = LinearRegressionModel.load("file:///d:/scala/model");



    // Create an in-memory test DataFrame
    val testDF = sess.createDataFrame(Seq(
      (5.0, Vectors.dense(7.4, 0.7, 0.0, 1.9, 0.076, 25.0, 67.0, 0.9968, 3.2, 0.68, 9.8)), 
      (5.0, Vectors.dense(7.8, 0.88, 0.0, 2.6, 0.098, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4)),
      (7.0, Vectors.dense(7.3, 0.65, 0.0, 1.2, 0.065, 15.0, 18.0, 0.9968, 3.36, 0.57, 9.5))))
      .toDF("label", "features")

    testDF.show()

    // Register the test data as a temporary view
    testDF.createOrReplaceTempView("test")
    println("======================")
    // Transform the test data with the model and select the features, label, and prediction columns
    val tested = model.transform(testDF).select("features", "label", "prediction");
    tested.show();

  }
}
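
The example stops at printing the predictions. As a hedged extension (not in the original post), the fit can be quantified by continuing the example with spark.ml's RegressionEvaluator:

import org.apache.spark.ml.evaluation.RegressionEvaluator

// Compare the "prediction" column of `tested` against its "label" column and report RMSE.
val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")
val rmse = evaluator.evaluate(tested)
println(s"Root Mean Squared Error = $rmse")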

2.2 Classifying white wines with a logistic regression model

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object LogicRegressWineClassifyDemo{
  def main(args: Array[String]): Unit = {
    val sess = SparkSession.builder().appName("ml").master("local[4]").getOrCreate();
    val sc = sess.sparkContext;
    // Data file location
    val dataDir = "file:///D:/downloads/bigdata/ml/winequality-white.csv"
    // Case class describing one wine record
    case class Wine(FixedAcidity: Double, VolatileAcidity: Double,
                    CitricAcid: Double, ResidualSugar: Double, Chlorides: Double,
                    FreeSulfurDioxide: Double, TotalSulfurDioxide: Double, Density: Double, PH:
                    Double, Sulphates: Double, Alcohol: Double, Quality: Double)

    // Parse each semicolon-separated line into a Wine record
    val wineDataRDD = sc.textFile(dataDir).map(_.split(";")).map(w => Wine(w(0).toDouble, w(1).toDouble,
      w(2).toDouble, w(3).toDouble, w(4).toDouble, w(5).toDouble, w(6).toDouble, w(7).toDouble, w(8).toDouble,
      w(9).toDouble, w(10).toDouble, w(11).toDouble))

    import sess.implicits._

    // Convert the RDD into a DataFrame, binarizing the label: quality < 7 becomes 0.0, otherwise 1.0
    val trainingDF = wineDataRDD.map(w => (if (w.Quality < 7) 0D else 1D,
      Vectors.dense(w.FixedAcidity, w.VolatileAcidity, w.CitricAcid,
        w.ResidualSugar, w.Chlorides, w.FreeSulfurDioxide, w.TotalSulfurDioxide,
        w.Density, w.PH, w.Sulphates, w.Alcohol))).toDF("label", "features")

    // Create the logistic regression estimator
    val lr = new LogisticRegression()
    // Set the maximum number of iterations and the regularization parameter
    lr.setMaxIter(10).setRegParam(0.01)
    // Fit the model
    val model = lr.fit(trainingDF)
    // Create the test DataFrame
    val testDF = sess.createDataFrame(Seq(
      (1.0,Vectors.dense(6.1, 0.32, 0.24, 1.5, 0.036, 43, 140, 0.9894, 3.36, 0.64, 10.7)),
      (0.0, Vectors.dense(5.2, 0.44, 0.04, 1.4, 0.036, 38, 124, 0.9898, 3.29, 0.42, 12.4)),
      (0.0,Vectors.dense(7.2, 0.32, 0.47, 5.1, 0.044, 19, 65, 0.9951, 3.38, 0.36, 9)),
      (0.0, Vectors.dense(6.4, 0.595, 0.14, 5.2, 0.058, 15, 97, 0.991, 3.03, 0.41, 12.6)))
    ).toDF("label", "features")

    // Show the test data
    testDF.show();


    println("========================")
    // Predict on the labeled test data to assess the model's quality.
    testDF.createOrReplaceTempView("test")
    val tested = model.transform(testDF).select("features", "label", "prediction")
    tested.show();

    println("========================")
    // Predict on unlabeled test data (features only)
    val predictDF = sess.sql("SELECT features FROM test")
    // Prediction results
    val predicted = model.transform(predictDF).select("features", "prediction")
    predicted.show();
  }

}
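
For a classifier, quality on the labeled test set is the natural check. A hedged continuation of the example (not in the original post) using spark.ml's BinaryClassificationEvaluator:

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// Area under the ROC curve (1.0 is perfect). The evaluator reads the
// "rawPrediction" column that LogisticRegressionModel adds during transform.
val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")
val auc = evaluator.evaluate(model.transform(testDF))
println(s"Area under ROC = $auc")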

2.3 Spam filtering with a logistic regression model

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object SpamFilterDemo1 {
  def main(args: Array[String]): Unit = {
    val sess = SparkSession.builder().appName("ml").master("local[4]").getOrCreate();
    val sc = sess.sparkContext;

    // Training data: ham messages labeled 0.0, spam labeled 1.0
    val training = sess.createDataFrame(Seq(
      ("[email protected]", "hope you are well", 0.0),
      ("[email protected]", "nice to hear from you", 0.0),
      ("[email protected]", "happy holidays", 0.0),
      ("[email protected]", "see you tomorrow", 0.0),
      ("[email protected]", "save loan money", 1.0),
      ("[email protected]", "save money", 1.0),
      ("[email protected]", "low interest rate", 1.0),
      ("[email protected]", "cheap loan", 1.0)
    )).toDF("email", "message", "label")

    // Tokenizer: splits the input message column into a words column
    val tokenizer = new Tokenizer().setInputCol("message").setOutputCol("words")
    // Hashing term frequency: the same word always hashes to the same feature index
    val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol("words").setOutputCol("features")
    /* // The individual transformer steps, shown for illustration:
    val wordsDF = tokenizer.transform(training)
    //wordsDF.show()

    val featurizedDF = hashingTF.transform(wordsDF)
    featurizedDF.show()*/
    // Create the logistic regression estimator
    val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
    // Assemble the pipeline: tokenize -> hash features -> logistic regression
    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
    // Fit the pipeline to produce a model
    val model = pipeline.fit(training)
    // Test data used to judge the model's quality
    val test = sess.createDataFrame(Seq(
      ("[email protected]", "ab how are you"),
      ("[email protected]", "ab hope doing well"),
      ("[email protected]", "ab want some money"),
      ("[email protected]", "ab secure loan"),
      ("[email protected]", "ab need loan")
    )).toDF("email", "message")

    // Transform the test data with the model, writing the results to the email, message, and prediction columns
    val prediction = model.transform(test).select("email", "message", "prediction")
    prediction.show()

  }
}
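
The fitted PipelineModel can be persisted and reloaded just like the linear regression model in 2.1, so the whole tokenize/hash/classify chain is saved as one unit. A short continuation (the save path is illustrative):

import org.apache.spark.ml.PipelineModel

// Persist the entire fitted pipeline (tokenizer + hashingTF + logistic regression)...
model.write.overwrite().save("file:///d:/scala/spamModel")
// ...and reload it later without refitting.
val reloaded = PipelineModel.load("file:///d:/scala/spamModel")
reloaded.transform(test).select("email", "message", "prediction").show()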

2.4 Building a recommendation model with alternating least squares (ALS)

import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.{SparkConf, SparkContext}

object RecommDemo{
  def main(args: Array[String]): Unit = {
    val conf=new SparkConf().setAppName("Recommend").setMaster("local[4]")
    val sc=new SparkContext(conf)
    // Load and parse the data
    val data=sc.textFile("file:///F:\\test.data")
    // Parse each comma-separated line into a Rating(user, item, rate)
    val ratings=data.map(_.split(",") match {
      case Array(user,item,rate)=>
        Rating(user.toInt,item.toInt,rate.toDouble)
    })
    // Build the recommendation model using ALS
    val rank=10           // number of latent factors
    val numIterations=10  // number of iterations
    // Train the recommendation model with alternating least squares (lambda = 0.01)
    val model=ALS.train(ratings,rank,numIterations,0.01)

    // Extract the (user, product) pairs from the rating data
    val usersProducts=ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    // Predict a rating for each (user, product) pair, keyed as ((user, product), rate)
    val predictions=
      model.predict(usersProducts).map{case Rating(user, product, rate)=>
        ((user,product),rate)
      }
    // Key the actual ratings the same way and join them with the predictions
    val ratesAndPreds=ratings.map{case Rating(user,product,rate)=>
      ((user,product),rate)
    }.join(predictions)

    // Mean squared error between actual and predicted ratings
    val MSE=ratesAndPreds.map{case ((user,product),(r1,r2))=>
        val err=(r1-r2)
        err*err
    }.mean()
    println("Mean Squared Error="+MSE)

    // Save the model and reload it to verify persistence
    model.save(sc,"target/tmp/myCollaborativeFilter")
    val sameModel=MatrixFactorizationModel.load(sc,"target/tmp/myCollaborativeFilter")

  }
}
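
Besides predicting individual ratings, MatrixFactorizationModel exposes top-N recommendation methods. A short continuation of the example (the ids are illustrative and must exist in test.data):

// Top 3 products for user 1, returned as an Array[Rating] sorted by predicted rating.
val topForUser = model.recommendProducts(1, 3)
topForUser.foreach(println)

// Top 3 users predicted to rate product 1 most highly.
val topForProduct = model.recommendUsers(1, 3)
topForProduct.foreach(println)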

Java implementation

package cn.ctgu.spark.java;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;

import scala.Tuple2;

public class JavaRecommendationExample {
    public static void main(String[] args) {
        SparkConf conf=new SparkConf().setAppName("Recommendation Example").setMaster("local[4]");
        JavaSparkContext jsc=new JavaSparkContext(conf);

        //load and parse the data
        String path="data/mllib/als/test.data";
        JavaRDD<String>data=jsc.textFile(path);
        // Parse the training data into Ratings
        JavaRDD<Rating>ratings=data.map(
                new Function<String, Rating>() {
                    public Rating call(String s) throws Exception {
                        String[] sarray=s.split(",");
                        return new Rating(Integer.parseInt(sarray[0]),Integer.parseInt(sarray[1]),
                                Double.parseDouble(sarray[2]));

                    }
                }
        );

        //Build the recommendation model using ALS
        int rank=10;
        int numIterations=10;
        MatrixFactorizationModel model= ALS.train(JavaRDD.toRDD(ratings),rank,numIterations,0.01);
        // Extract (user, product) tuples from the training data
        JavaRDD<Tuple2<Object,Object>>userProducts=ratings.map(
                new Function<Rating, Tuple2<Object, Object>>() {
                    public Tuple2<Object, Object> call(Rating r) throws Exception {
                        return new Tuple2<Object, Object>(r.user(), r.product());
                    }
                }
        );
        // Obtain the predicted ratings, keyed by (user, product)
        JavaPairRDD<Tuple2<Integer,Integer>,Double>predictions=JavaPairRDD.fromJavaRDD(
                model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(
                        new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
                            public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) throws Exception {
                                return new Tuple2(new Tuple2(r.user(),r.product()),r.rating());
                            }
                        }
                ));
        // Join the actual and predicted ratings into (actual, predicted) value pairs
        JavaRDD<Tuple2<Double,Double>>ratesAndPreds=
                JavaPairRDD.fromJavaRDD(ratings.map(
                        new Function<Rating, Tuple2<Tuple2<Integer,Integer>,Double>>() {
                            public Tuple2<Tuple2<Integer,Integer>, Double> call(Rating r) throws Exception {
                                return new Tuple2(new Tuple2(r.user(),r.product()),r.rating());
                            }
                        }
                )).join(predictions).values();
        // Mean of the squared errors
        double MSE= JavaDoubleRDD.fromRDD(ratesAndPreds.map(
                new Function<Tuple2<Double,Double>, Object>() {

                    public Object call(Tuple2<Double, Double> pair) throws Exception {
                        Double err=pair._1()-pair._2();
                        return err*err;

                    }
                }
        ).rdd()).mean();
        System.out.println("Mean Squared Error="+MSE);
        // Save and load the model
        model.save(jsc.sc(),"target/tmp/myCollaborativeFilter");
        MatrixFactorizationModel sameModel=MatrixFactorizationModel.load(jsc.sc(),
                "target/tmp/myCollaborativeFilter");
        jsc.stop();
    }
}

2.5 Movie recommendation with ALS

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.SparkSession

/**
  * Movie recommendation
  */
object MovieRecommDemo {

  // Case class for one rating record
  case class Rating0(userId: Int, movieId: Int, rating: Float, timestamp: Long)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf();
    conf.setAppName("movieRecomm");
    conf.setMaster("local[4]")

    val spark = SparkSession.builder().config(conf).getOrCreate() ;
    import spark.implicits._

    // Parse one line into a Rating0
    def parseRating(str: String): Rating0 = {
      val fields = str.split("::")
      assert(fields.size == 4)
      Rating0(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
    }
    // Convert the ratings into a DataFrame
    val ratings = spark.sparkContext.textFile("file:///D:\\scala\\ml\\recomm\\sample_movielens_ratings.txt");
    val ratings0 = ratings.map(parseRating)
    val df = ratings0.toDF()
    // Randomly split the data into an array of two DataFrames: the first element is the training set, the second the test set
    val Array(training, test) = df.randomSplit(Array(0.99, 0.01))

    // Create the ALS estimator and set its parameters
    val als = new ALS().setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
    // Fit the training data with the ALS estimator to produce the recommendation model
    val model = als.fit(training)

    // Note: recommendProducts(5, 8) (top 8 products for user 5), recommendUsers(3, 5)
    // (top 5 users for product 3) and recommendProductsForUsers(3) belong to the RDD-based
    // MatrixFactorizationModel used in section 2.4; the DataFrame-based ALSModel returned
    // here does not have them. Its equivalents, recommendForAllUsers/recommendForAllItems,
    // require Spark 2.2 or later (bump the pom version accordingly).

    /******* Recommend the top 3 movies to every user ********/
    val res = model.recommendForAllUsers(3)

    // Transform the test data with the model to obtain predictions
    val predictions = model.transform(test);

    predictions.collect().foreach(println)

  }
}
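
The code imports RegressionEvaluator but never uses it. A hedged completion (not in the original) scores the held-out test split with it; rows where the model returns NaN (users or movies unseen during training) are dropped first, since Spark 2.1's ALS has no coldStartStrategy option:

// RMSE between the actual "rating" column and the model's "prediction" column.
val evaluator = new RegressionEvaluator()
  .setMetricName("rmse")
  .setLabelCol("rating")
  .setPredictionCol("prediction")
val rmse = evaluator.evaluate(predictions.na.drop())
println(s"Root Mean Squared Error = $rmse")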

Reposted from blog.csdn.net/Jorocco/article/details/80938389