Spark API 之 combineByKey

代码:可以复制到自己的项目中运行测试

package spark_api

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demonstrates `combineByKey(createCombiner, mergeValue, mergeCombiners)` on pair RDDs.
  *
  * Roles of the three callbacks:
  *  - createCombiner: invoked the first time a key is seen inside a partition; it turns
  *    that first value into the initial combiner for the key.
  *  - mergeValue: invoked for every further value of a key that already has a combiner
  *    in the same partition; it folds the value into that combiner.
  *  - mergeCombiners: invoked after the shuffle to merge the combiners that different
  *    partitions produced for the same key.
  *
  * Every callback prints its arguments so the call order can be read from the console.
  */
object mh_combineByKey extends Serializable {

  /** Builds the initial Int combiner (value * 10) and logs the value and the executing thread. */
  def createCombiner(cc: Int): Int = {
    println("createCombiner-----------  " + cc)
    println("当前线程的名称为: " + Thread.currentThread().getName)
    cc * 10
  }

  /** Adds value `b` to the partition-local combiner `a`. */
  def mergeValue(a: Int, b: Int): Int = {
    // Separate the operands with '+': plain concatenation of two Ints printed their
    // digits back to back, which read like a single number.
    println(s"mergeValue $a+$b")
    a + b
  }

  /** Adds two combiners of the same key that come from different partitions. */
  def mergeCombiners(a: Int, b: Int): Int = {
    println(s"mergeCombiners $a+$b")
    a + b
  }

  /** Builds the initial String combiner by wrapping the value as " |value| ", so it stands out in the output. */
  def createCombiner2(c: String): String = {
    val wrapped = s" |$c| "
    println("createCombiner-----------  " + wrapped)
    println("当前线程的名称为: " + Thread.currentThread().getName)
    wrapped
  }

  /** Appends value `b` to the partition-local combiner `a`, joined by '+'. */
  def mergeValue2(a: String, b: String): String = {
    val merged = s"$a+$b"
    println("mergeValue " + merged)
    merged
  }

  /** Joins two combiners of the same key from different partitions with '+'. */
  def mergeCombiners2(a: String, b: String): String = {
    val merged = s"$a+$b"
    println("mergeCombiners " + merged)
    merged
  }

  /** Entry point: runs the String-based demo. */
  def main(args: Array[String]): Unit = {
    combineByKey_test2()
  }

  /**
    * Word-count style demo: splits each line of a local text file on spaces, pairs every
    * word with 1 and combines the pairs per word with the Int callbacks.
    */
  def combineByKey_test(): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("combinerByKey").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    // try/finally so the context is stopped even when the job fails (e.g. missing file).
    try {
      val tuples: Array[(String, Int)] = sc.textFile("F:\\test_data\\word.txt")
        .flatMap(_.split(" "))
        .map((_, 1))
        .combineByKey(createCombiner, mergeValue, mergeCombiners)
        .collect()

      println("**************************************************************************************************")
      tuples.foreach(println)
    } finally {
      sc.stop()
    }
  }

  /**
    * Two-partition demo with String values: prints the partition layout before and
    * after `combineByKey`, then the collected result.
    */
  def combineByKey_test2(): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("combinerByKey").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    // try/finally so the context is stopped even when a job fails.
    try {
      // Each key is paired with itself as the value; the order determines the partition split.
      val keys = List("a", "b", "c", "e", "f", "a", "b", "c", "e", "f", "b", "c", "e", "f")
      val rdd = sc.parallelize(keys.map(k => (k, k)), 2)

      // Collected only for the side effect of printing which partition holds which pair.
      rdd.mapPartitionsWithIndex(myfunc).collect()

      val rdd2 = rdd.combineByKey(createCombiner2, mergeValue2, mergeCombiners2)
      // Print the partition layout after combining, then the final result.
      val tuples: Array[(String, String)] = rdd2.mapPartitionsWithIndex(myfunc).collect()
      println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      tuples.foreach(println)
    } finally {
      sc.stop()
    }
  }

  /**
    * Pass-through partition mapper that prints every element with its partition index.
    *
    * @param index index of the partition being processed
    * @param iter  elements of that partition
    * @return the same elements, unchanged
    */
  def myfunc(index: Int, iter: Iterator[(String, String)]): Iterator[(String, String)] = {
    iter.map { pair =>
      println("[partID:" + index + ", val: " + pair + "]")
      pair
    }
  }

}

打印日志:

D:\Java\jdk1.8.0_191\bin\java.exe -agentlib:jdwp=transport=dt_socket,address=127.0.0.1:64438,suspend=y,server=n -javaagent:C:\Users\cmj\.IntelliJIdea2018.3\system\captureAgent\debugger-agent.jar -Dfile.encoding=UTF-8 -classpath "D:\Java\jdk1.8.0_191\jre\lib\charsets.jar;D:\Java\jdk1.8.0_191\jre\lib\deploy.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\access-bridge-64.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\cldrdata.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\dnsns.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\jaccess.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\jfxrt.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\localedata.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\nashorn.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\sunec.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\sunjce_provider.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\sunmscapi.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\sunpkcs11.jar;D:\Java\jdk1.8.0_191\jre\lib\ext\zipfs.jar;D:\Java\jdk1.8.0_191\jre\lib\javaws.jar;D:\Java\jdk1.8.0_191\jre\lib\jce.jar;D:\Java\jdk1.8.0_191\jre\lib\jfr.jar;D:\Java\jdk1.8.0_191\jre\lib\jfxswt.jar;D:\Java\jdk1.8.0_191\jre\lib\jsse.jar;D:\Java\jdk1.8.0_191\jre\lib\management-agent.jar;D:\Java\jdk1.8.0_191\jre\lib\plugin.jar;D:\Java\jdk1.8.0_191\jre\lib\resources.jar;D:\Java\jdk1.8.0_191\jre\lib\rt.jar;D:\Users\cmj\IdeaProjects\sparkall_api\target\classes;F:\MavenLocalRepo\org\apache\spark\spark-core_2.11\2.1.0\spark-core_2.11-2.1.0.jar;F:\MavenLocalRepo\org\apache\avro\avro-mapred\1.7.7\avro-mapred-1.7.7-hadoop2.jar;F:\MavenLocalRepo\org\apache\avro\avro-ipc\1.7.7\avro-ipc-1.7.7.jar;F:\MavenLocalRepo\org\apache\avro\avro\1.7.7\avro-1.7.7.jar;F:\MavenLocalRepo\org\apache\avro\avro-ipc\1.7.7\avro-ipc-1.7.7-tests.jar;F:\MavenLocalRepo\org\codehaus\jackson\jackson-core-asl\1.9.13\jackson-core-asl-1.9.13.jar;F:\MavenLocalRepo\org\codehaus\jackson\jackson-mapper-asl\1.9.13\jackson-mapper-asl-1.9.13.jar;F:\MavenLocalRepo\com\twitter\chill_2.11\0.8.0\chill_2.11-0.8.0.jar;F:\MavenLocalRepo\com\esotericsoftware\kryo-shaded\3.0.3\kryo-shaded-3.0.3.jar;F:\MavenLocalRepo
\com\esotericsoftware\minlog\1.3.0\minlog-1.3.0.jar;F:\MavenLocalRepo\org\objenesis\objenesis\2.1\objenesis-2.1.jar;F:\MavenLocalRepo\com\twitter\chill-java\0.8.0\chill-java-0.8.0.jar;F:\MavenLocalRepo\org\apache\xbean\xbean-asm5-shaded\4.4\xbean-asm5-shaded-4.4.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-client\2.2.0\hadoop-client-2.2.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-common\2.2.0\hadoop-common-2.2.0.jar;F:\MavenLocalRepo\commons-cli\commons-cli\1.2\commons-cli-1.2.jar;F:\MavenLocalRepo\org\apache\commons\commons-math\2.1\commons-math-2.1.jar;F:\MavenLocalRepo\xmlenc\xmlenc\0.52\xmlenc-0.52.jar;F:\MavenLocalRepo\commons-io\commons-io\2.1\commons-io-2.1.jar;F:\MavenLocalRepo\commons-lang\commons-lang\2.5\commons-lang-2.5.jar;F:\MavenLocalRepo\commons-configuration\commons-configuration\1.6\commons-configuration-1.6.jar;F:\MavenLocalRepo\commons-collections\commons-collections\3.2.1\commons-collections-3.2.1.jar;F:\MavenLocalRepo\commons-digester\commons-digester\1.8\commons-digester-1.8.jar;F:\MavenLocalRepo\commons-beanutils\commons-beanutils\1.7.0\commons-beanutils-1.7.0.jar;F:\MavenLocalRepo\commons-beanutils\commons-beanutils-core\1.8.0\commons-beanutils-core-1.8.0.jar;F:\MavenLocalRepo\com\google\protobuf\protobuf-java\2.5.0\protobuf-java-2.5.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-auth\2.2.0\hadoop-auth-2.2.0.jar;F:\MavenLocalRepo\org\apache\commons\commons-compress\1.4.1\commons-compress-1.4.1.jar;F:\MavenLocalRepo\org\tukaani\xz\1.0\xz-1.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-hdfs\2.2.0\hadoop-hdfs-2.2.0.jar;F:\MavenLocalRepo\org\mortbay\jetty\jetty-util\6.1.26\jetty-util-6.1.26.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-mapreduce-client-app\2.2.0\hadoop-mapreduce-client-app-2.2.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-mapreduce-client-common\2.2.0\hadoop-mapreduce-client-common-2.2.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-yarn-client\2.2.0\hadoop-yarn-client-2.2.0.jar;F:\MavenLocalRepo\com\google\inje
ct\guice\3.0\guice-3.0.jar;F:\MavenLocalRepo\javax\inject\javax.inject\1\javax.inject-1.jar;F:\MavenLocalRepo\aopalliance\aopalliance\1.0\aopalliance-1.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-yarn-server-common\2.2.0\hadoop-yarn-server-common-2.2.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-mapreduce-client-shuffle\2.2.0\hadoop-mapreduce-client-shuffle-2.2.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-yarn-api\2.2.0\hadoop-yarn-api-2.2.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-mapreduce-client-core\2.2.0\hadoop-mapreduce-client-core-2.2.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-yarn-common\2.2.0\hadoop-yarn-common-2.2.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-mapreduce-client-jobclient\2.2.0\hadoop-mapreduce-client-jobclient-2.2.0.jar;F:\MavenLocalRepo\org\apache\hadoop\hadoop-annotations\2.2.0\hadoop-annotations-2.2.0.jar;F:\MavenLocalRepo\org\apache\spark\spark-launcher_2.11\2.1.0\spark-launcher_2.11-2.1.0.jar;F:\MavenLocalRepo\org\apache\spark\spark-network-common_2.11\2.1.0\spark-network-common_2.11-2.1.0.jar;F:\MavenLocalRepo\org\fusesource\leveldbjni\leveldbjni-all\1.8\leveldbjni-all-1.8.jar;F:\MavenLocalRepo\com\fasterxml\jackson\core\jackson-annotations\2.6.5\jackson-annotations-2.6.5.jar;F:\MavenLocalRepo\org\apache\spark\spark-network-shuffle_2.11\2.1.0\spark-network-shuffle_2.11-2.1.0.jar;F:\MavenLocalRepo\org\apache\spark\spark-unsafe_2.11\2.1.0\spark-unsafe_2.11-2.1.0.jar;F:\MavenLocalRepo\net\java\dev\jets3t\jets3t\0.7.1\jets3t-0.7.1.jar;F:\MavenLocalRepo\commons-codec\commons-codec\1.3\commons-codec-1.3.jar;F:\MavenLocalRepo\commons-httpclient\commons-httpclient\3.1\commons-httpclient-3.1.jar;F:\MavenLocalRepo\org\apache\curator\curator-recipes\2.4.0\curator-recipes-2.4.0.jar;F:\MavenLocalRepo\org\apache\curator\curator-framework\2.4.0\curator-framework-2.4.0.jar;F:\MavenLocalRepo\org\apache\curator\curator-client\2.4.0\curator-client-2.4.0.jar;F:\MavenLocalRepo\org\apache\zookeeper\zookeeper\3.4.5\zookeeper-3.4.5.
jar;F:\MavenLocalRepo\com\google\guava\guava\14.0.1\guava-14.0.1.jar;F:\MavenLocalRepo\javax\servlet\javax.servlet-api\3.1.0\javax.servlet-api-3.1.0.jar;F:\MavenLocalRepo\org\apache\commons\commons-lang3\3.5\commons-lang3-3.5.jar;F:\MavenLocalRepo\org\apache\commons\commons-math3\3.4.1\commons-math3-3.4.1.jar;F:\MavenLocalRepo\com\google\code\findbugs\jsr305\1.3.9\jsr305-1.3.9.jar;F:\MavenLocalRepo\org\slf4j\slf4j-api\1.7.16\slf4j-api-1.7.16.jar;F:\MavenLocalRepo\org\slf4j\jul-to-slf4j\1.7.16\jul-to-slf4j-1.7.16.jar;F:\MavenLocalRepo\org\slf4j\jcl-over-slf4j\1.7.16\jcl-over-slf4j-1.7.16.jar;F:\MavenLocalRepo\log4j\log4j\1.2.17\log4j-1.2.17.jar;F:\MavenLocalRepo\org\slf4j\slf4j-log4j12\1.7.16\slf4j-log4j12-1.7.16.jar;F:\MavenLocalRepo\com\ning\compress-lzf\1.0.3\compress-lzf-1.0.3.jar;F:\MavenLocalRepo\org\xerial\snappy\snappy-java\1.1.2.6\snappy-java-1.1.2.6.jar;F:\MavenLocalRepo\net\jpountz\lz4\lz4\1.3.0\lz4-1.3.0.jar;F:\MavenLocalRepo\org\roaringbitmap\RoaringBitmap\0.5.11\RoaringBitmap-0.5.11.jar;F:\MavenLocalRepo\commons-net\commons-net\2.2\commons-net-2.2.jar;F:\MavenLocalRepo\org\scala-lang\scala-library\2.11.8\scala-library-2.11.8.jar;F:\MavenLocalRepo\org\json4s\json4s-jackson_2.11\3.2.11\json4s-jackson_2.11-3.2.11.jar;F:\MavenLocalRepo\org\json4s\json4s-core_2.11\3.2.11\json4s-core_2.11-3.2.11.jar;F:\MavenLocalRepo\org\json4s\json4s-ast_2.11\3.2.11\json4s-ast_2.11-3.2.11.jar;F:\MavenLocalRepo\com\thoughtworks\paranamer\paranamer\2.6\paranamer-2.6.jar;F:\MavenLocalRepo\org\scala-lang\scalap\2.11.0\scalap-2.11.0.jar;F:\MavenLocalRepo\org\scala-lang\scala-compiler\2.11.0\scala-compiler-2.11.0.jar;F:\MavenLocalRepo\org\scala-lang\modules\scala-parser-combinators_2.11\1.0.1\scala-parser-combinators_2.11-1.0.1.jar;F:\MavenLocalRepo\org\glassfish\jersey\core\jersey-client\2.22.2\jersey-client-2.22.2.jar;F:\MavenLocalRepo\javax\ws\rs\javax.ws.rs-api\2.0.1\javax.ws.rs-api-2.0.1.jar;F:\MavenLocalRepo\org\glassfish\hk2\hk2-api\2.4.0-b34\hk2-api-2.4.0-b34.jar;F:\MavenL
ocalRepo\org\glassfish\hk2\hk2-utils\2.4.0-b34\hk2-utils-2.4.0-b34.jar;F:\MavenLocalRepo\org\glassfish\hk2\external\aopalliance-repackaged\2.4.0-b34\aopalliance-repackaged-2.4.0-b34.jar;F:\MavenLocalRepo\org\glassfish\hk2\external\javax.inject\2.4.0-b34\javax.inject-2.4.0-b34.jar;F:\MavenLocalRepo\org\glassfish\hk2\hk2-locator\2.4.0-b34\hk2-locator-2.4.0-b34.jar;F:\MavenLocalRepo\org\javassist\javassist\3.18.1-GA\javassist-3.18.1-GA.jar;F:\MavenLocalRepo\org\glassfish\jersey\core\jersey-common\2.22.2\jersey-common-2.22.2.jar;F:\MavenLocalRepo\javax\annotation\javax.annotation-api\1.2\javax.annotation-api-1.2.jar;F:\MavenLocalRepo\org\glassfish\jersey\bundles\repackaged\jersey-guava\2.22.2\jersey-guava-2.22.2.jar;F:\MavenLocalRepo\org\glassfish\hk2\osgi-resource-locator\1.0.1\osgi-resource-locator-1.0.1.jar;F:\MavenLocalRepo\org\glassfish\jersey\core\jersey-server\2.22.2\jersey-server-2.22.2.jar;F:\MavenLocalRepo\org\glassfish\jersey\media\jersey-media-jaxb\2.22.2\jersey-media-jaxb-2.22.2.jar;F:\MavenLocalRepo\javax\validation\validation-api\1.1.0.Final\validation-api-1.1.0.Final.jar;F:\MavenLocalRepo\org\glassfish\jersey\containers\jersey-container-servlet\2.22.2\jersey-container-servlet-2.22.2.jar;F:\MavenLocalRepo\org\glassfish\jersey\containers\jersey-container-servlet-core\2.22.2\jersey-container-servlet-core-2.22.2.jar;F:\MavenLocalRepo\io\netty\netty-all\4.0.42.Final\netty-all-4.0.42.Final.jar;F:\MavenLocalRepo\io\netty\netty\3.8.0.Final\netty-3.8.0.Final.jar;F:\MavenLocalRepo\com\clearspring\analytics\stream\2.7.0\stream-2.7.0.jar;F:\MavenLocalRepo\io\dropwizard\metrics\metrics-core\3.1.2\metrics-core-3.1.2.jar;F:\MavenLocalRepo\io\dropwizard\metrics\metrics-jvm\3.1.2\metrics-jvm-3.1.2.jar;F:\MavenLocalRepo\io\dropwizard\metrics\metrics-json\3.1.2\metrics-json-3.1.2.jar;F:\MavenLocalRepo\io\dropwizard\metrics\metrics-graphite\3.1.2\metrics-graphite-3.1.2.jar;F:\MavenLocalRepo\com\fasterxml\jackson\core\jackson-databind\2.6.5\jackson-databind-2.6.5.jar;F:\Mave
nLocalRepo\com\fasterxml\jackson\core\jackson-core\2.6.5\jackson-core-2.6.5.jar;F:\MavenLocalRepo\com\fasterxml\jackson\module\jackson-module-scala_2.11\2.6.5\jackson-module-scala_2.11-2.6.5.jar;F:\MavenLocalRepo\org\scala-lang\scala-reflect\2.11.7\scala-reflect-2.11.7.jar;F:\MavenLocalRepo\com\fasterxml\jackson\module\jackson-module-paranamer\2.6.5\jackson-module-paranamer-2.6.5.jar;F:\MavenLocalRepo\org\apache\ivy\ivy\2.4.0\ivy-2.4.0.jar;F:\MavenLocalRepo\oro\oro\2.0.8\oro-2.0.8.jar;F:\MavenLocalRepo\net\razorvine\pyrolite\4.13\pyrolite-4.13.jar;F:\MavenLocalRepo\net\sf\py4j\py4j\0.10.4\py4j-0.10.4.jar;F:\MavenLocalRepo\org\apache\spark\spark-tags_2.11\2.1.0\spark-tags_2.11-2.1.0.jar;F:\MavenLocalRepo\org\scalatest\scalatest_2.11\2.2.6\scalatest_2.11-2.2.6.jar;F:\MavenLocalRepo\org\scala-lang\modules\scala-xml_2.11\1.0.2\scala-xml_2.11-1.0.2.jar;F:\MavenLocalRepo\org\apache\commons\commons-crypto\1.0.0\commons-crypto-1.0.0.jar;F:\MavenLocalRepo\org\spark-project\spark\unused\1.0.0\unused-1.0.0.jar;C:\Users\cmj\.ivy2\cache\org.scala-lang\scala-library\jars\scala-library-2.12.2.jar;C:\Users\cmj\.ivy2\cache\org.scala-lang\scala-reflect\jars\scala-reflect-2.12.2.jar;C:\Users\cmj\.ivy2\cache\org.scala-lang\scala-library\srcs\scala-library-2.12.2-sources.jar;C:\Users\cmj\.ivy2\cache\org.scala-lang\scala-reflect\srcs\scala-reflect-2.12.2-sources.jar;D:\JetBrains\IntelliJ IDEA2018.3.1\lib\idea_rt.jar" spark_api.mh_combineByKey
Connected to the target VM, address: '127.0.0.1:64438', transport: 'socket'
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
18/12/13 16:35:50 INFO SparkContext: Running Spark version 2.1.0
18/12/13 16:35:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
18/12/13 16:35:51 INFO SecurityManager: Changing view acls to: cmj
18/12/13 16:35:51 INFO SecurityManager: Changing modify acls to: cmj
18/12/13 16:35:51 INFO SecurityManager: Changing view acls groups to: 
18/12/13 16:35:51 INFO SecurityManager: Changing modify acls groups to: 
18/12/13 16:35:51 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(cmj); groups with view permissions: Set(); users  with modify permissions: Set(cmj); groups with modify permissions: Set()
18/12/13 16:35:52 INFO Utils: Successfully started service 'sparkDriver' on port 64462.
18/12/13 16:35:52 INFO SparkEnv: Registering MapOutputTracker
18/12/13 16:35:53 INFO SparkEnv: Registering BlockManagerMaster
18/12/13 16:35:53 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
18/12/13 16:35:53 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
18/12/13 16:35:53 INFO DiskBlockManager: Created local directory at C:\Users\cmj\AppData\Local\Temp\blockmgr-007d9800-e8d1-46b5-bd28-5d027ba2fb4c
18/12/13 16:35:53 INFO MemoryStore: MemoryStore started with capacity 4.1 GB
18/12/13 16:35:53 INFO SparkEnv: Registering OutputCommitCoordinator
18/12/13 16:35:53 INFO Utils: Successfully started service 'SparkUI' on port 4040.
18/12/13 16:35:53 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://192.168.191.1:4040
18/12/13 16:35:53 INFO Executor: Starting executor ID driver on host localhost
18/12/13 16:35:53 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 64471.
18/12/13 16:35:53 INFO NettyBlockTransferService: Server created on 192.168.191.1:64471
18/12/13 16:35:53 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
18/12/13 16:35:53 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, 192.168.191.1, 64471, None)
18/12/13 16:35:53 INFO BlockManagerMasterEndpoint: Registering block manager 192.168.191.1:64471 with 4.1 GB RAM, BlockManagerId(driver, 192.168.191.1, 64471, None)
18/12/13 16:35:53 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, 192.168.191.1, 64471, None)
18/12/13 16:35:53 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, 192.168.191.1, 64471, None)
18/12/13 16:35:54 WARN ClosureCleaner: Expected a closure; got spark_api.mh_combineByKey$$$Lambda$2/366226635
18/12/13 16:35:55 INFO SparkContext: Starting job: collect at mh_combineByKey.scala:68
18/12/13 16:35:55 INFO DAGScheduler: Got job 0 (collect at mh_combineByKey.scala:68) with 2 output partitions
18/12/13 16:35:55 INFO DAGScheduler: Final stage: ResultStage 0 (collect at mh_combineByKey.scala:68)
18/12/13 16:35:55 INFO DAGScheduler: Parents of final stage: List()
18/12/13 16:35:55 INFO DAGScheduler: Missing parents: List()
18/12/13 16:35:55 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[1] at mapPartitionsWithIndex at mh_combineByKey.scala:68), which has no missing parents
18/12/13 16:35:55 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 2.5 KB, free 4.1 GB)
18/12/13 16:35:55 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 1563.0 B, free 4.1 GB)
18/12/13 16:35:55 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 192.168.191.1:64471 (size: 1563.0 B, free: 4.1 GB)
18/12/13 16:35:55 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:996
18/12/13 16:35:55 INFO DAGScheduler: Submitting 2 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at mapPartitionsWithIndex at mh_combineByKey.scala:68)
18/12/13 16:35:55 INFO TaskSchedulerImpl: Adding task set 0.0 with 2 tasks
18/12/13 16:35:56 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 6102 bytes)
18/12/13 16:35:56 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
[partID:0, val: (a,a)]
[partID:0, val: (b,b)]
[partID:0, val: (c,c)]
[partID:0, val: (e,e)]
[partID:0, val: (f,f)]
[partID:0, val: (a,a)]
[partID:0, val: (b,b)]

18/12/13 16:36:14 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 1174 bytes result sent to driver
18/12/13 16:36:14 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, localhost, executor driver, partition 1, PROCESS_LOCAL, 6103 bytes)
18/12/13 16:36:14 INFO Executor: Running task 1.0 in stage 0.0 (TID 1)
[partID:1, val: (c,c)]
[partID:1, val: (e,e)]
[partID:1, val: (f,f)]
[partID:1, val: (b,b)]
[partID:1, val: (c,c)]
[partID:1, val: (e,e)]
[partID:1, val: (f,f)]

18/12/13 16:36:14 INFO Executor: Finished task 1.0 in stage 0.0 (TID 1). 1082 bytes result sent to driver
18/12/13 16:36:14 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 18324 ms on localhost (executor driver) (1/2)
18/12/13 16:36:14 INFO TaskSetManager: Finished task 1.0 in stage 0.0 (TID 1) in 112 ms on localhost (executor driver) (2/2)
18/12/13 16:36:14 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
18/12/13 16:36:14 INFO DAGScheduler: ResultStage 0 (collect at mh_combineByKey.scala:68) finished in 18.459 s
18/12/13 16:36:14 INFO DAGScheduler: Job 0 finished: collect at mh_combineByKey.scala:68, took 19.166256 s
18/12/13 16:36:14 WARN ClosureCleaner: Expected a closure; got spark_api.mh_combineByKey$$$Lambda$10/636953520
18/12/13 16:36:14 WARN ClosureCleaner: Expected a closure; got spark_api.mh_combineByKey$$$Lambda$11/1252069894
18/12/13 16:36:14 WARN ClosureCleaner: Expected a closure; got spark_api.mh_combineByKey$$$Lambda$12/67985650
18/12/13 16:36:14 WARN ClosureCleaner: Expected a closure; got spark_api.mh_combineByKey$$$Lambda$13/748229733
18/12/13 16:36:14 INFO SparkContext: Starting job: collect at mh_combineByKey.scala:71
18/12/13 16:36:14 INFO DAGScheduler: Registering RDD 0 (parallelize at mh_combineByKey.scala:67)
18/12/13 16:36:14 INFO DAGScheduler: Got job 1 (collect at mh_combineByKey.scala:71) with 2 output partitions
18/12/13 16:36:14 INFO DAGScheduler: Final stage: ResultStage 2 (collect at mh_combineByKey.scala:71)
18/12/13 16:36:14 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 1)
18/12/13 16:36:14 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 1)
18/12/13 16:36:14 INFO DAGScheduler: Submitting ShuffleMapStage 1 (ParallelCollectionRDD[0] at parallelize at mh_combineByKey.scala:67), which has no missing parents
18/12/13 16:36:14 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 3.0 KB, free 4.1 GB)
18/12/13 16:36:14 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 1771.0 B, free 4.1 GB)
18/12/13 16:36:14 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 192.168.191.1:64471 (size: 1771.0 B, free: 4.1 GB)
18/12/13 16:36:14 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:996
18/12/13 16:36:14 INFO DAGScheduler: Submitting 2 missing tasks from ShuffleMapStage 1 (ParallelCollectionRDD[0] at parallelize at mh_combineByKey.scala:67)
18/12/13 16:36:14 INFO TaskSchedulerImpl: Adding task set 1.0 with 2 tasks
18/12/13 16:36:14 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 2, localhost, executor driver, partition 0, PROCESS_LOCAL, 6091 bytes)
18/12/13 16:36:14 INFO Executor: Running task 0.0 in stage 1.0 (TID 2)
createCombiner-----------   |a| 
当前线程的名称为: Executor task launch worker-0
createCombiner-----------   |b| 
当前线程的名称为: Executor task launch worker-0
createCombiner-----------   |c| 
当前线程的名称为: Executor task launch worker-0
createCombiner-----------   |e| 
当前线程的名称为: Executor task launch worker-0
createCombiner-----------   |f| 
当前线程的名称为: Executor task launch worker-0
mergeValue  |a| +a
mergeValue  |b| +b

18/12/13 16:36:14 INFO Executor: Finished task 0.0 in stage 1.0 (TID 2). 1586 bytes result sent to driver
18/12/13 16:36:14 INFO TaskSetManager: Starting task 1.0 in stage 1.0 (TID 3, localhost, executor driver, partition 1, PROCESS_LOCAL, 6092 bytes)
18/12/13 16:36:14 INFO Executor: Running task 1.0 in stage 1.0 (TID 3)
18/12/13 16:36:14 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 2) in 122 ms on localhost (executor driver) (1/2)
createCombiner-----------   |c| 
当前线程的名称为: Executor task launch worker-0
createCombiner-----------   |e| 
当前线程的名称为: Executor task launch worker-0
createCombiner-----------   |f| 
当前线程的名称为: Executor task launch worker-0
createCombiner-----------   |b| 
当前线程的名称为: Executor task launch worker-0
mergeValue  |c| +c
mergeValue  |e| +e
mergeValue  |f| +f

18/12/13 16:36:14 INFO Executor: Finished task 1.0 in stage 1.0 (TID 3). 1499 bytes result sent to driver
18/12/13 16:36:14 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 3) in 209 ms on localhost (executor driver) (2/2)
18/12/13 16:36:14 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool 
18/12/13 16:36:14 INFO DAGScheduler: ShuffleMapStage 1 (parallelize at mh_combineByKey.scala:67) finished in 0.316 s
18/12/13 16:36:14 INFO DAGScheduler: looking for newly runnable stages
18/12/13 16:36:14 INFO DAGScheduler: running: Set()
18/12/13 16:36:14 INFO DAGScheduler: waiting: Set(ResultStage 2)
18/12/13 16:36:14 INFO DAGScheduler: failed: Set()
18/12/13 16:36:14 INFO DAGScheduler: Submitting ResultStage 2 (MapPartitionsRDD[3] at mapPartitionsWithIndex at mh_combineByKey.scala:71), which has no missing parents
18/12/13 16:36:14 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 4.2 KB, free 4.1 GB)
18/12/13 16:36:14 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 2.2 KB, free 4.1 GB)
18/12/13 16:36:14 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on 192.168.191.1:64471 (size: 2.2 KB, free: 4.1 GB)
18/12/13 16:36:14 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:996
18/12/13 16:36:14 INFO DAGScheduler: Submitting 2 missing tasks from ResultStage 2 (MapPartitionsRDD[3] at mapPartitionsWithIndex at mh_combineByKey.scala:71)
18/12/13 16:36:14 INFO TaskSchedulerImpl: Adding task set 2.0 with 2 tasks
18/12/13 16:36:14 INFO TaskSetManager: Starting task 0.0 in stage 2.0 (TID 4, localhost, executor driver, partition 0, ANY, 5750 bytes)
18/12/13 16:36:14 INFO Executor: Running task 0.0 in stage 2.0 (TID 4)
18/12/13 16:36:14 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/12/13 16:36:14 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 9 ms
mergeCombiners  |b| +b+ |b| 
mergeCombiners  |f| + |f| +f
[partID:0, val: (b, |b| +b+ |b| )]
[partID:0, val: (f, |f| + |f| +f)]

18/12/13 16:36:14 INFO Executor: Finished task 0.0 in stage 2.0 (TID 4). 1739 bytes result sent to driver
18/12/13 16:36:14 INFO TaskSetManager: Starting task 1.0 in stage 2.0 (TID 5, localhost, executor driver, partition 1, ANY, 5750 bytes)
18/12/13 16:36:14 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 4) in 69 ms on localhost (executor driver) (1/2)
18/12/13 16:36:14 INFO Executor: Running task 1.0 in stage 2.0 (TID 5)
18/12/13 16:36:14 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/12/13 16:36:14 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 2 ms
mergeCombiners  |c| + |c| +c
mergeCombiners  |e| + |e| +e
[partID:1, val: (e, |e| + |e| +e)]
[partID:1, val: (a, |a| +a)]
[partID:1, val: (c, |c| + |c| +c)]

18/12/13 16:36:14 INFO Executor: Finished task 1.0 in stage 2.0 (TID 5). 1680 bytes result sent to driver
18/12/13 16:36:14 INFO TaskSetManager: Finished task 1.0 in stage 2.0 (TID 5) in 19 ms on localhost (executor driver) (2/2)
18/12/13 16:36:14 INFO TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool 
18/12/13 16:36:14 INFO DAGScheduler: ResultStage 2 (collect at mh_combineByKey.scala:71) finished in 0.088 s
18/12/13 16:36:14 INFO DAGScheduler: Job 1 finished: collect at mh_combineByKey.scala:71, took 0.480243 s
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
(b, |b| +b+ |b| )
(f, |f| + |f| +f)
(e, |e| + |e| +e)
(a, |a| +a)
(c, |c| + |c| +c)

18/12/13 16:36:14 INFO SparkUI: Stopped Spark web UI at http://192.168.191.1:4040
18/12/13 16:36:14 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
18/12/13 16:36:15 INFO MemoryStore: MemoryStore cleared
18/12/13 16:36:15 INFO BlockManager: BlockManager stopped
18/12/13 16:36:15 INFO BlockManagerMaster: BlockManagerMaster stopped
18/12/13 16:36:15 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
18/12/13 16:36:15 INFO SparkContext: Successfully stopped SparkContext
18/12/13 16:36:15 INFO ShutdownHookManager: Shutdown hook called
18/12/13 16:36:15 INFO ShutdownHookManager: Deleting directory C:\Users\cmj\AppData\Local\Temp\spark-cc32ad5b-20f2-4269-9920-df64a44f33e8
Disconnected from the target VM, address: '127.0.0.1:64438', transport: 'socket'

Process finished with exit code 0
 

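总结:

- createCombiner:每个分区内,某个 key 第一次出现时调用一次,用该 value 生成初始的 combiner(日志中分区 0 对 a、b、c、e、f 各调用一次,分区 1 对 c、e、f、b 各调用一次)。
- mergeValue:同一分区内该 key 再次出现时调用,把新的 value 合并到已有的 combiner 中(例如分区 0 中的 ` |a| +a`)。
- mergeCombiners:shuffle 之后调用,把不同分区中同一 key 的 combiner 合并;只在一个分区中出现的 key(例如 a)不会触发该函数。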

猜你喜欢

转载自blog.csdn.net/qq_29499107/article/details/84989457