Playing with Spark Cache

Elapsed time without caching:

scala> dataRdd.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).count
res5: Long = 10

With a cache added, first run:

scala> dataRdd.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).cache.count
res6: Long = 10

Note: the data is only put into memory once an action is triggered.

With a cache added, second run:

scala> dataRdd.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).cache.count
res7: Long = 10
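Since cache only takes effect when an action runs, the pattern that actually benefits from it is to keep a reference to the cached RDD and run several actions on that same reference. A minimal sketch, assuming the SparkContext sc from the shell and a comma-separated input file (the path data.txt is just a placeholder):

val wc = sc.textFile("data.txt")
  .flatMap(_.split(","))
  .map((_, 1))
  .reduceByKey(_ + _)
  .cache()          // only marks the RDD for caching; nothing is materialized yet

wc.count()          // first action: computes the RDD and fills the cache
wc.count()          // second action: served from the cached partitions

Holding on to the same reference (as with cached1 below) is what lets the second action reuse the cached data.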

scala> import org.apache.spark.storage.StorageLevel
import org.apache.spark.storage.StorageLevel

scala> val cached1 = dataRdd1.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).persist(StorageLevel.MEMORY_ONLY_SER)
cached1: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[10] at reduceByKey at <console>:26

scala> cached
cached   cached1

scala> cached1.count
res2: Long = 10

scala> cached1.count
res3: Long = 10

scala> cached.count
res4: Long = 10

A job that serializes the cached data takes longer to run than one that does not, but the serialized data occupies less memory.
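Both effects can be checked in the Storage tab of the Spark UI; the memory footprint can also be read programmatically. A small sketch, reusing dataRdd1 and sc from the shell session above and relying on sc.getRDDStorageInfo (a developer API):

val cachedSer = dataRdd1.flatMap(_.split(","))
  .map((_, 1))
  .reduceByKey(_ + _)
  .setName("wordcount-ser")                // name shown in the Storage tab
  .persist(StorageLevel.MEMORY_ONLY_SER)

cachedSer.count()                          // the action populates the cache

// print how much memory each cached RDD occupies
sc.getRDDStorageInfo.foreach { info =>
  println(s"${info.name}: ${info.memSize} bytes in memory, " +
    s"${info.numCachedPartitions} cached partitions")
}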

Now let's redo this by writing the code in IDEA.

At the start, the UI shows nothing, since nothing has been run or cached yet.

package com.ruozedata.spark.homework

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel


object DataSerialization {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("someTestApp").setMaster("local[2]")

    val sc = new SparkContext(sparkConf)
    val lines = sc.textFile("/Users/Aaron/Downloads/bigdata/data/hello_data.txt")
    val count = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).count
    println("------------------->" + ":" + count.toString)

    Thread.sleep(66666666)   // keep the application alive so the Spark UI can still be inspected
    sc.stop()
  }
}

The result is as follows:

Now let's bring in the cache.

With a cache added, first run:

package com.ruozedata.spark.homework

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel


object DataSerialization {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("someTestApp").setMaster("local[2]")
//      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    val sc = new SparkContext(sparkConf)
    val lines = sc.textFile("/Users/Aaron/Downloads/bigdata/data/hello_data.txt")

    //    MEMORY_ONLY

    val count = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).count          // no cache
    println("------------------->" + ":" + count.toString)

    val count1 = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).cache.count   // with cache
    println("------------------->" + ":" + count1.toString)


    Thread.sleep(66666666)
    sc.stop()
  }
}

The result is as follows:

Click into the job to take a closer look.

Clearly, the job runs faster once the cache is used.

Next, use the serialized storage level StorageLevel.MEMORY_ONLY_SER:

package com.ruozedata.spark.homework

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel


object DataSerialization {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("someTestApp").setMaster("local[2]")
//      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    val sc = new SparkContext(sparkConf)
    val lines = sc.textFile("/Users/Aaron/Downloads/bigdata/data/hello_data.txt")
//    val count = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).cache.count
//    println("------------------->" + ":" + count.toString)

//    val count1 = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).count
//    println("------------------->" + ":" + count1.toString)

    val countSer = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_+_)
      .persist(StorageLevel.MEMORY_ONLY_SER)   // keep the shuffled result in memory, serialized
      .count
    println("------------------->" + ":" + countSer.toString)

    val countSer1 = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_+_)
      .count                                   // same pipeline without persistence, for comparison
    println("------------------->" + ":" + countSer1.toString)


    Thread.sleep(66666666)
    sc.stop()
  }
}

Next, use Kryo serialization, but without registering any classes:

package com.ruozedata.spark.homework

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel


object DataSerialization {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("someTestApp").setMaster("local[2]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    val sc = new SparkContext(sparkConf)
    val lines = sc.textFile("/Users/Aaron/Downloads/bigdata/data/hello_data.txt")

    //    MEMORY_ONLY

//    val count = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).count
//    println("------------------->" + ":" + count.toString)
//
//    val count1 = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).cache.count
//    println("------------------->" + ":" + count1.toString)

    //    MEMORY_ONLY_SER
//    val countSer = lines.flatMap(_.split(","))
//      .map((_, 1)).reduceByKey(_ + _)
//      .persist(StorageLevel.MEMORY_ONLY_SER)
//      .count
//    println("------------------->" + ":" + countSer.toString)
//
//    val countSer1 = lines.flatMap(_.split(","))
//      .map((_, 1)).reduceByKey(_ + _)
//      .count
//    println("------------------->" + ":" + countSer1.toString)

//    MEMORY_ONLY_SER_KRYO
    val countSerKryoWithoutRegister = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_ + _)
      .persist(StorageLevel.MEMORY_ONLY_SER)   // serialized with Kryo (spark.serializer is set above)
      .count
    println("------------------->" + ":" + countSerKryoWithoutRegister.toString)

    val countSerKryoWithoutRegister1 = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_ + _)
      .count
    println("------------------->" + ":" + countSerKryoWithoutRegister1.toString)


    Thread.sleep(66666666)
    sc.stop()
  }
}

Next, use Kryo serialization with class registration:

package com.ruozedata.spark.homework

import com.esotericsoftware.kryo.Kryo
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.serializer.KryoRegistrator


object DataSerialization {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("someTestApp").setMaster("local[2]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryo.registrator", classOf[MyRegistrator].getName)   // plug in the custom registrator defined below
//      .set("spark.kryo.registrationRequired","true")

    val sc = new SparkContext(sparkConf)
    val lines = sc.textFile("/Users/Aaron/Downloads/bigdata/data/hello_data.txt")

    //    MEMORY_ONLY



//    MEMORY_ONLY_SER_KRYO
    val countSerKryoWithRegister = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_ + _)
      .persist(StorageLevel.MEMORY_ONLY_SER)
      .count
    println("------------------->" + ":" + countSerKryoWithRegister.toString)

    val countSerKryoWithRegister1 = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_ + _)
      .count
    println("------------------->" + ":" + countSerKryoWithRegister1.toString)


    Thread.sleep(66666666)
    sc.stop()
  }
  class MyRegistrator extends KryoRegistrator {
    override def registerClasses(kryo: Kryo) {

      kryo.register(classOf[Qualify])
    }
  }

  // demo class registered with Kryo; the word-count pipeline above only serializes (String, Int) pairs
  case class Qualify(s1: String, s2: String, s3: String, s4: String)
}
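As a side note, when no extra registration logic is needed, the classes can also be registered directly on the SparkConf instead of going through a custom KryoRegistrator. A minimal sketch using the same demo Qualify class:

val sparkConf = new SparkConf()
  .setAppName("someTestApp").setMaster("local[2]")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .registerKryoClasses(Array(classOf[Qualify]))          // register the classes Kryo will serialize
  // fail fast when an unregistered class is serialized:
  // .set("spark.kryo.registrationRequired", "true")

With spark.kryo.registrationRequired enabled, Kryo throws an error instead of silently writing the full class name alongside every unregistered object, which helps catch classes that were forgotten.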

Summary

A comparison of the storage/serialization options:

                               map    count  size in memory (KB)
Raw data (no cache)            16     0.3
MEMORY_ONLY                    12     0.2    3.3
MEMORY_ONLY_SER                13     0.3
MEMORY_ONLY_SER_KRYO
MEMORY_ONLY_SER_KRYO_REGISTER

Reprinted from blog.csdn.net/xiaoxiongaa0/article/details/90268075