Spark 1.6 Source Code: RDD Construction in Task Submission and Execution

Overview:

There are already plenty of write-ups of Spark's overall execution flow online, so I won't repeat that here.

This article walks through the source code using a Spark WordCount job as the example:

      val input=sc.textFile("C:\\Users\\pc\\PycharmProjects\\PyNews\\word*",1)
        .flatMap(_.split(" ")).map(x=>(x,1)).reduceByKey(_+_).saveAsTextFile("D:/niha")

OK, let's start with the first step, textFile:

  /**
   * Read a text file from HDFS, a local file system (available on all nodes), or any
   * Hadoop-supported file system URI, and return it as an RDD of Strings.
   */
  // Supports any file system that Hadoop can read
  def textFile(
      path: String,
      minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
    assertNotStopped()
    // Delegates to hadoopFile
    // Note: TextInputFormat, LongWritable and Text are all Hadoop types
    hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
      minPartitions).map(pair => pair._2.toString).setName(path)
  }
   /** Get an RDD for a Hadoop file with an arbitrary InputFormat
   *
   * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
   * operation will create many references to the same object.
   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
   * copy them using a `map` function.
   */
   // Translation / notes:
   // Any InputFormat can be used to obtain a HadoopRDD, which means we can plug in our own
   // InputFormat (or another existing one) to read data however we need.
   //
   // Note: Hadoop's RecordReader reuses the same Writable object for every record, so directly
   // caching this RDD, or feeding it straight into an aggregation or shuffle, creates many
   // references to the same object. If you want to do that, first copy the records with a map
   // (a sketch of this follows the hadoopFile listing below).
  def hadoopFile[K, V](
      path: String,
      inputFormatClass: Class[_ <: InputFormat[K, V]],
      keyClass: Class[K],
      valueClass: Class[V],
      minPartitions: Int = defaultMinPartitions): RDD[(K, V)] = withScope {
    assertNotStopped()
    // A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it.
    // Broadcast the Hadoop configuration to the executors
    val confBroadcast = broadcast(new SerializableConfiguration(hadoopConfiguration))
    // A closure that will later set the input paths on the JobConf
    val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
    // Build the HadoopRDD
    new HadoopRDD(
      this,
      confBroadcast,
      Some(setInputPathsFunc),
      inputFormatClass,
      keyClass,
      valueClass,
      minPartitions).setName(path)
  }
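
As the note above suggests, if you want to cache or shuffle the raw (LongWritable, Text) pairs, copy them out first. A minimal sketch of that, assuming sc is the SparkContext from the example (the HDFS path and variable names are purely illustrative, not from the original post):

  import org.apache.hadoop.io.{LongWritable, Text}
  import org.apache.hadoop.mapred.TextInputFormat

  // Raw RDD of reused Writable objects, read the same way textFile does internally
  val raw = sc.hadoopFile("hdfs:///tmp/words", classOf[TextInputFormat],
    classOf[LongWritable], classOf[Text], 1)
  // Copy each record into plain Scala values so every element is an independent object
  val copied = raw.map { case (offset, line) => (offset.get, line.toString) }
  copied.cache()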

After textFile has built the HadoopRDD, it calls RDD's map method on it.

  /**
   * Return a new RDD by applying a function to all elements of this RDD.
   */
  def map[U: ClassTag](f: T => U): RDD[U] = withScope {
    // Clean the closure of non-serializable references so it can be serialized and shipped to executors
    val cleanF = sc.clean(f)
    // MapPartitionsRDD is constructed through the auxiliary constructor of its parent class RDD
    new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
  }
  /** Construct an RDD with just a one-to-one dependency on one parent */
  def this(@transient oneParent: RDD[_]) =
    // Wrap the single parent RDD in a OneToOneDependency
    this(oneParent.context , List(new OneToOneDependency(oneParent)))
   // OneToOneDependency extends NarrowDependency (a narrow dependency)
  class OneToOneDependency[T](rdd: RDD[T]) extends NarrowDependency[T](rdd) {
  override def getParents(partitionId: Int): List[Int] = List(partitionId)
  }
  // The RDD primary constructor
  abstract class RDD[T: ClassTag](
    @transient private var _sc: SparkContext,
    // Here this is List(new OneToOneDependency(hadoopRDD))
    @transient private var deps: Seq[Dependency[_]]
  ) extends Serializable with Logging {
  // The following are some of RDD's most important methods
  /**
   * :: DeveloperApi ::
   * Implemented by subclasses to compute a given partition.
   */
  @DeveloperApi
  // Computes one partition of the RDD; implemented by subclasses
  def compute(split: Partition, context: TaskContext): Iterator[T]
   /**
   * Implemented by subclasses to return the set of partitions in this RDD. This method will only
   * be called once, so it is safe to implement a time-consuming computation in it.
   */
  // Returns the set of partitions of this RDD; implemented by subclasses
  protected def getPartitions: Array[Partition]
  /**
   * Implemented by subclasses to return how this RDD depends on parent RDDs. This method will only
   * be called once, so it is safe to implement a time-consuming computation in it.
   */
  // Describes how this RDD depends on its parent RDDs; the default simply returns deps
  protected def getDependencies: Seq[Dependency[_]] = deps
  }
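
To make these three hooks concrete, here is a minimal hand-rolled RDD (purely illustrative, not part of Spark or the original post): like HadoopRDD it has no parents (deps = Nil), it defines its own partitions, and it computes each partition on demand.

  import org.apache.spark.{Partition, SparkContext, TaskContext}
  import org.apache.spark.rdd.RDD

  // One partition covering the integer range [start, end)
  class RangePartition(val index: Int, val start: Int, val end: Int) extends Partition

  // A toy RDD producing the numbers 0 until n, split into numSlices partitions
  class SimpleRangeRDD(sc: SparkContext, n: Int, numSlices: Int) extends RDD[Int](sc, Nil) {

    // Called once on the driver; each Partition only carries the metadata needed to compute it later
    override protected def getPartitions: Array[Partition] =
      Array.tabulate[Partition](numSlices) { i =>
        new RangePartition(i, i * n / numSlices, (i + 1) * n / numSlices)
      }

    // Called on an executor, once per partition per task
    override def compute(split: Partition, context: TaskContext): Iterator[Int] = {
      val p = split.asInstanceOf[RangePartition]
      (p.start until p.end).iterator
    }
  }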

So at this point: MapPartitionsRDD.deps------>List(new OneToOneDependency(hadoopRDD)), and hadoopRDD.deps----->Nil (HadoopRDD is constructed with no dependencies).

Next, flatMap:

  /**
   *  Return a new RDD by first applying a function to all elements of this
   *  RDD, and then flattening the results.
   */
   // Same pattern as map
  def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U] = withScope {
    val cleanF = sc.clean(f)
    new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.flatMap(cleanF))
  }

So the RDD chain is now:

MapPartitionsRDD.deps------>List(new OneToOneDependency(MapPartitionsRDD))

MapPartitionsRDD.deps------>List(new OneToOneDependency(hadoopRDD))

hadoopRDD.deps----->Nil

Next comes another map, so the RDD chain becomes:

MapPartitionsRDD.deps------>List(new OneToOneDependency(MapPartitionsRDD))

MapPartitionsRDD.deps------>List(new OneToOneDependency(MapPartitionsRDD))

MapPartitionsRDD.deps------>List(new OneToOneDependency(hadoopRDD))

hadoopRDD.deps----->Nil

Next, reduceByKey:

First, note that neither MapPartitionsRDD nor RDD itself defines reduceByKey; the call only compiles because of an implicit conversion to PairRDDFunctions (sketched below).
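
The conversion lives in the RDD companion object in Spark 1.6; roughly, it wraps any RDD of pairs in PairRDDFunctions, which is where reduceByKey, combineByKey, etc. are defined. A sketch, paraphrased from that companion object (name RDDImplicitsSketch is mine, the signature is slightly simplified):

  import scala.reflect.ClassTag
  import org.apache.spark.rdd.{PairRDDFunctions, RDD}

  object RDDImplicitsSketch {
    // Wraps RDD[(K, V)] so pair-only operators become available
    implicit def rddToPairRDDFunctions[K, V](rdd: RDD[(K, V)])
        (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K] = null): PairRDDFunctions[K, V] =
      new PairRDDFunctions(rdd)
  }

  // So pairs.reduceByKey(_ + _) is compiled as (roughly) rddToPairRDDFunctions(pairs).reduceByKey(_ + _)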

  /**
   * Merge the values for each key using an associative reduce function. This will also perform
   * the merging locally on each mapper before sending results to a reducer, similarly to a
   * "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
   * parallelism level.
   */
   // The one-argument overload delegates to the two-argument one with the default partitioner
  def reduceByKey(func: (V, V) => V): RDD[(K, V)] = self.withScope {
    reduceByKey(defaultPartitioner(self), func)
  }
  // defaultPartitioner is implemented as follows
  /**
   * Choose a partitioner to use for a cogroup-like operation between a number of RDDs.
   *
   * If any of the RDDs already has a partitioner, choose that one.
   *
   * Otherwise, we use a default HashPartitioner. For the number of partitions, if
   * spark.default.parallelism is set, then we'll use the value from SparkContext
   * defaultParallelism, otherwise we'll use the max number of upstream partitions.
   *
   * Unless spark.default.parallelism is set, the number of partitions will be the
   * same as the number of partitions in the largest upstream RDD, as this should
   * be least likely to cause out-of-memory errors.
   *
   * We use two method parameters (rdd, others) to enforce callers passing at least 1 RDD.
   */
   // Translation: choose the partitioner to use for a cogroup-like operation across these RDDs.
   // If any of the RDDs already has a partitioner, use that one.
   // Otherwise use a default HashPartitioner; for the number of partitions, use
   // spark.default.parallelism if it is set, otherwise the largest partition count among the upstream RDDs.
  def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = {
    // partitions gives each RDD's partition array;
    // sort by size so the RDD with the most partitions comes first
    val bySize = (Seq(rdd) ++ others).sortBy(_.partitions.size).reverse
    for (r <- bySize if r.partitioner.isDefined && r.partitioner.get.numPartitions > 0) {
      return r.partitioner.get
    }
    if (rdd.context.conf.contains("spark.default.parallelism")) {
      new HashPartitioner(rdd.context.defaultParallelism)
    } else {
      new HashPartitioner(bySize.head.partitions.size)
    }
  }
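
For reference, the default HashPartitioner chosen here assigns a key to a partition by a non-negative modulo of its hashCode. A simplified sketch of that behavior (my own illustration, not the exact Spark source):

  // Simplified sketch of HashPartitioner's partition assignment
  def sketchGetPartition(key: Any, numPartitions: Int): Int = key match {
    case null => 0
    case _ =>
      val raw = key.hashCode % numPartitions
      if (raw < 0) raw + numPartitions else raw // keep the result in [0, numPartitions)
  }

  // e.g. sketchGetPartition("spark", 2) decides which reducer partition the counts for "spark" land in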
  
  /**
   * Get the array of partitions of this RDD, taking into account whether the
   * RDD is checkpointed or not.
   */
  final def partitions: Array[Partition] = {
    
    checkpointRDD.map(_.partitions).getOrElse {
      if (partitions_ == null) {
        // getPartitions computes this RDD's partitions
        partitions_ = getPartitions
      }
      partitions_
    }
  }
  // MapPartitionsRDD's implementation of getPartitions:
  // it asks its first parent RDD for partitions, which recurses up the dependency chain until it
  // reaches the HadoopRDD, whose getPartitions is ultimately what gets called
  override def getPartitions: Array[Partition] = firstParent[T].partitions  
  
  // Implementation of firstParent:
  // returns the first parent RDD this RDD depends on
  protected[spark] def firstParent[U: ClassTag]: RDD[U] = {
    dependencies.head.rdd.asInstanceOf[RDD[U]]
  }
  
  /**
   * Get the list of dependencies of this RDD, taking into account whether the
   * RDD is checkpointed or not.
   */
  // Get the dependency list, taking checkpointing into account
  final def dependencies: Seq[Dependency[_]] = {
    checkpointRDD.map(r => List(new OneToOneDependency(r))).getOrElse {
      if (dependencies_ == null) {
        // Compute this RDD's dependency list
        dependencies_ = getDependencies
      }
      dependencies_
    }
  }
  // The default implementation simply returns deps
  protected def getDependencies: Seq[Dependency[_]] = deps
  
  // HadoopRDD's getPartitions
  override def getPartitions: Array[Partition] = {
    val jobConf = getJobConf()
    // add the credentials here as this can be called before SparkContext initialized
    SparkHadoopUtil.get.addCredentials(jobConf)
    val inputFormat = getInputFormat(jobConf)
    // The partitions come from inputFormat.getSplits; minPartitions is only a hint
    val inputSplits = inputFormat.getSplits(jobConf, minPartitions)
    val array = new Array[Partition](inputSplits.size)
    for (i <- 0 until inputSplits.size) {
      array(i) = new HadoopPartition(id, i, inputSplits(i))
    }
    array
  }
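
In other words, the minPartitions we passed to textFile is only a hint handed to getSplits; the actual partition count depends on the InputFormat, file sizes and block boundaries. A quick way to check, assuming sc is the SparkContext (the path is illustrative):

  val words = sc.textFile("hdfs:///tmp/words", 4) // 4 is only a hint
  println(words.partitions.length)                // may end up more or fewer than 4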
  --------------------------
  Finally, the following method is called:
  /**
   * Merge the values for each key using an associative reduce function. This will also perform
   * the merging locally on each mapper before sending results to a reducer, similarly to a
   * "combiner" in MapReduce.
   */
  def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
    combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner)
  }
  
  
  /**
   * :: Experimental ::
   * Generic function to combine the elements for each key using a custom set of aggregation
   * functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined type" C
   * Note that V and C can be different -- for example, one might group an RDD of type
   * (Int, Int) into an RDD of type (Int, Seq[Int]). Users provide three functions:
   *
   *  - `createCombiner`, which turns a V into a C (e.g., creates a one-element list)
   *  - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list)
   *  - `mergeCombiners`, to combine two C's into a single one.
   *
   * In addition, users can control the partitioning of the output RDD, and whether to perform
   * map-side aggregation (if a mapper can produce multiple items with the same key).
   */
   // Translation:
   // A generic (lowest-level) function that combines the elements for each key with a
   // user-supplied set of aggregation functions. It turns an RDD[(K, V)] into an RDD[(K, C)],
   // where the combined type C may differ from V, e.g. (Int, Int) aggregated into (Int, Seq[Int]).
   // createCombiner   turns a V into a C
   // mergeValue       merges a V into a C
   // mergeCombiners   merges two Cs into one
  @Experimental
  def combineByKeyWithClassTag[C](
      createCombiner: V => C,
      mergeValue: (C, V) => C,
      mergeCombiners: (C, C) => C,
      partitioner: Partitioner,
      mapSideCombine: Boolean = true,
      serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
    require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
    if (keyClass.isArray) {
      if (mapSideCombine) {
        throw new SparkException("Cannot use map-side combining with array keys.")
      }
      if (partitioner.isInstanceOf[HashPartitioner]) {
        throw new SparkException("Default partitioner cannot partition array keys.")
      }
    }
    val aggregator = new Aggregator[K, V, C](
      self.context.clean(createCombiner),
      self.context.clean(mergeValue),
      self.context.clean(mergeCombiners))
    if (self.partitioner == Some(partitioner)) {
      self.mapPartitions(iter => {
        val context = TaskContext.get()
        new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
      }, preservesPartitioning = true)
    } else {
      // This is where the ShuffledRDD is created
      new ShuffledRDD[K, V, C](self, partitioner)
        .setSerializer(serializer)
        .setAggregator(aggregator)
        .setMapSideCombine(mapSideCombine)
    }
  }
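
For reduceByKey, all three functions collapse onto the same func: createCombiner is the identity (v => v) and both merge steps are func. A plain-Scala sketch of what the Aggregator conceptually does, first within one partition and then across partitions after the shuffle (my own illustration, not Spark's actual code):

  // Map-side combine within one partition: build a C value per key
  def combineValuesByKeySketch[K, V, C](records: Iterator[(K, V)],
                                        createCombiner: V => C,
                                        mergeValue: (C, V) => C): Map[K, C] =
    records.foldLeft(Map.empty[K, C]) { case (acc, (k, v)) =>
      acc.updated(k, acc.get(k).map(c => mergeValue(c, v)).getOrElse(createCombiner(v)))
    }

  // Reduce-side: merge the per-partition combiners after the shuffle
  def mergeCombinersSketch[K, C](partials: Seq[Map[K, C]],
                                 mergeCombiners: (C, C) => C): Map[K, C] =
    partials.flatten.groupBy(_._1).map { case (k, kvs) => k -> kvs.map(_._2).reduce(mergeCombiners) }

  // For reduceByKey(_ + _) on (word, 1) pairs: createCombiner = identity, mergeValue = mergeCombiners = _ + _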
  // ShuffledRDD implementation; prev in the constructor is the RDD that called reduceByKey, i.e. the last MapPartitionsRDD
  class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag](
    @transient var prev: RDD[_ <: Product2[K, V]],
    part: Partitioner)
  extends RDD[(K, C)](prev.context, Nil) {

  private var serializer: Option[Serializer] = None

  private var keyOrdering: Option[Ordering[K]] = None

  private var aggregator: Option[Aggregator[K, V, C]] = None

  private var mapSideCombine: Boolean = false

  /** Set a serializer for this RDD's shuffle, or null to use the default (spark.serializer) */
  def setSerializer(serializer: Serializer): ShuffledRDD[K, V, C] = {
    this.serializer = Option(serializer)
    this
  }

  /** Set key ordering for RDD's shuffle. */
  def setKeyOrdering(keyOrdering: Ordering[K]): ShuffledRDD[K, V, C] = {
    this.keyOrdering = Option(keyOrdering)
    this
  }

  /** Set aggregator for RDD's shuffle. */
  def setAggregator(aggregator: Aggregator[K, V, C]): ShuffledRDD[K, V, C] = {
    this.aggregator = Option(aggregator)
    this
  }

  /** Set mapSideCombine flag for RDD's shuffle. */
  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }
  // The ShuffleDependency is only built when this method is called (note ShuffledRDD is constructed with deps = Nil)
  override def getDependencies: Seq[Dependency[_]] = {
    // prev is the MapPartitionsRDD,
    // so this builds new ShuffleDependency(MapPartitionsRDD, part, serializer, keyOrdering, aggregator, mapSideCombine)
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override protected def getPreferredLocations(partition: Partition): Seq[String] = {
    val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    tracker.getPreferredLocationsForShuffle(dep, partition.index)
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
}
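
A quick check from the driver that the shapes described above actually hold (paths and variable names are illustrative): the RDD returned by reduceByKey is a ShuffledRDD whose single dependency is a ShuffleDependency, while the map-side RDDs carry OneToOneDependency:

  val pairs = sc.textFile("hdfs:///tmp/words").flatMap(_.split(" ")).map((_, 1))
  val counts = pairs.reduceByKey(_ + _)

  println(counts.getClass.getSimpleName)     // ShuffledRDD
  println(counts.dependencies.head.getClass) // class org.apache.spark.ShuffleDependency
  println(pairs.dependencies.head.getClass)  // class org.apache.spark.OneToOneDependency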

OK, so the final RDD chain is:

ShuffledRDD.deps-------------->List(new ShuffleDependency(MapPartitionsRDD))

MapPartitionsRDD.deps------>List(new OneToOneDependency(MapPartitionsRDD))

MapPartitionsRDD.deps------>List(new OneToOneDependency(MapPartitionsRDD))

MapPartitionsRDD.deps------>List(new OneToOneDependency(hadoopRDD))

hadoopRDD.deps----->Nil

This is the DAG (the RDD lineage graph) that everyone talks about.
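
You can print this lineage directly from the driver with toDebugString; the output looks roughly like the following (IDs and formatting vary by run and Spark version):

  val result = sc.textFile("C:\\Users\\pc\\PycharmProjects\\PyNews\\word*", 1)
    .flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
  println(result.toDebugString)
  // (1) ShuffledRDD[4] at reduceByKey ...
  //  +-(1) MapPartitionsRDD[3] at map ...
  //     |  MapPartitionsRDD[2] at flatMap ...
  //     |  MapPartitionsRDD[1] at textFile ...
  //     |  HadoopRDD[0] at textFile ...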

Reposted from blog.csdn.net/qq_33872191/article/details/85215335