DStream principle and source code

Demo

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object KafkaDirectDstream {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("KafkaDirectDstream")
    sparkConf.setMaster("local[*]")
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "1")
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val streamingContext = new StreamingContext(sparkConf, Seconds(2))
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "s1:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "p1",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("test_topic")

    val dstream = KafkaUtils.createDirectStream[String, String](
      streamingContext,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    dstream.map(record => (record.key, record.value, record.partition(), record.offset()))
      .foreachRDD(rdd => {
        // ... process each batch's RDD here
      })

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
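
Since the demo disables auto-commit (enable.auto.commit = false), the elided body of foreachRDD would typically read the batch's offset ranges and commit them after processing. A minimal sketch of that pattern, using the spark-streaming-kafka-0-10 HasOffsetRanges and CanCommitOffsets APIs; note that offset ranges must be taken from the stream's own RDD, before any transformation:

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

dstream.foreachRDD { rdd =>
  // Only the RDD produced directly by the direct stream carries offset ranges.
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process the batch here ...
  // Queue the consumed offsets for commit; the actual commit happens on the
  // driver during the next batch (see commitAll() in DirectKafkaInputDStream below).
  dstream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}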

Spark source code analysis

StreamingContext

Like RDDs, the whole DStream pipeline is "lazily loaded": nothing runs until streamingContext.start() is called, so that is our starting point.

For readability, I have removed the surrounding code and kept only the key logic.

def start(): Unit = synchronized {
  state match {
    case INITIALIZED =>
      StreamingContext.ACTIVATION_LOCK.synchronized {
        try {
          validate()

          // The crucial step: a new thread is spawned to start the scheduler
          ThreadUtils.runInNewThread("streaming-start") {
            sparkContext.setCallSite(startSite.get)
            sparkContext.clearJobGroup()
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            savedProperties.set(SerializationUtils.clone(sparkContext.localProperties.get()))
            scheduler.start()
          }
          state = StreamingContextState.ACTIVE
          scheduler.listenerBus.post(
            StreamingListenerStreamingStarted(System.currentTimeMillis()))
        } catch {
          // ... error handling elided
        }
        StreamingContext.setActiveContext(this)
      }
  }
}

JobScheduler

Stepping into scheduler.start() brings us to JobScheduler:

def start(): Unit = synchronized {
  if (eventLoop != null) return // scheduler has already been started

  logDebug("Starting JobScheduler")
  
  // Under the hood, EventLoop is just a message receiver/dispatcher built on a simple
  // BlockingQueue. This code runs on the Driver, so nothing crosses node boundaries
  // and no Netty communication is involved.
  eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
    override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)

    override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
  }
  // Key: start event dispatching. The abstract methods of EventLoop are implemented
  // above (there are three events: job started, job completed, and error).
  eventLoop.start()

  // attach rate controllers of input streams to receive batch completion updates
  for {
    inputDStream <- ssc.graph.getInputStreams
    rateController <- inputDStream.rateController
  } ssc.addStreamingListener(rateController)

  listenerBus.start()
  receiverTracker = new ReceiverTracker(ssc)
  inputInfoTracker = new InputInfoTracker(ssc)

  val executorAllocClient: ExecutorAllocationClient = ssc.sparkContext.schedulerBackend match {
    case b: ExecutorAllocationClient => b.asInstanceOf[ExecutorAllocationClient]
    case _ => null
  }

  executorAllocationManager = ExecutorAllocationManager.createIfEnabled(
    executorAllocClient,
    receiverTracker,
    ssc.conf,
    ssc.graph.batchDuration.milliseconds,
    clock)
  executorAllocationManager.foreach(ssc.addStreamingListener)
  receiverTracker.start()
  // Key: start the scheduling that generates jobs
  jobGenerator.start()
  executorAllocationManager.foreach(_.start())
  logInfo("Started JobScheduler")
}

 private def processEvent(event: JobSchedulerEvent) {
    try {
      event match {
        case JobStarted(job, startTime) => handleJobStart(job, startTime)
        case JobCompleted(job, completedTime) => handleJobCompletion(job, completedTime)
        case ErrorReported(m, e) => handleError(m, e)
      }
    } catch {
      case e: Throwable =>
        reportError("Error in job scheduler", e)
    }
  }
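
To make the EventLoop comment above concrete, here is a minimal standalone sketch of the pattern: a daemon thread draining a blocking queue. This is illustrative only, not Spark's actual EventLoop class:

import java.util.concurrent.LinkedBlockingQueue

// Minimal event loop: post() enqueues, a daemon thread dequeues and dispatches.
abstract class SimpleEventLoop[E](name: String) {
  private val queue = new LinkedBlockingQueue[E]()
  @volatile private var stopped = false

  private val thread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = {
      try {
        while (!stopped) {
          val event = queue.take() // blocks until an event arrives
          try onReceive(event) catch { case t: Throwable => onError(t) }
        }
      } catch {
        case _: InterruptedException => // stop() interrupts a blocked take()
      }
    }
  }

  def start(): Unit = thread.start()
  def post(event: E): Unit = queue.put(event)
  def stop(): Unit = { stopped = true; thread.interrupt() }

  protected def onReceive(event: E): Unit
  protected def onError(e: Throwable): Unit
}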

JobGenerator

def start(): Unit = synchronized {
  if (eventLoop != null) return // generator has already been started

  // Call checkpointWriter here to initialize it before eventLoop uses it to avoid a deadlock.
  // See SPARK-10125
  checkpointWriter
  // Another event loop
  eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") {
    override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event)

    override protected def onError(e: Throwable): Unit = {
      jobScheduler.reportError("Error in job generator", e)
    }
  }
  eventLoop.start()

  if (ssc.isCheckpointPresent) {
    restart()
  } else {
    // This is where batch generation is kicked off
    startFirstTime()
  }
}

// The events it handles
private def processEvent(event: JobGeneratorEvent) {
    logDebug("Got event " + event)
    event match {
      case GenerateJobs(time) => generateJobs(time)
      case ClearMetadata(time) => clearMetadata(time)
      case DoCheckpoint(time, clearCheckpointDataLater) =>
        doCheckpoint(time, clearCheckpointDataLater)
      case ClearCheckpointData(time) => clearCheckpointData(time)
    }
  }
// We focus on the main flow, so we only need to look at job generation
private def generateJobs(time: Time) {
    // Checkpoint all RDDs marked for checkpointing to ensure their lineages are
    // truncated periodically. Otherwise, we may run into stack overflows (SPARK-6847).
    ssc.sparkContext.setLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS, "true")
    Try {
      jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
      // Jobs are generated via the DStreamGraph
      graph.generateJobs(time) // generate jobs using allocated block
    } match {
      case Success(jobs) =>
        val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
        jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
      case Failure(e) =>
        jobScheduler.reportError("Error generating jobs for time " + time, e)
        PythonDStream.stopStreamingContextIfPythonProcessIsDead(e)
    }
    eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
  }
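
submitJobSet itself is not shown here; schematically, it records the JobSet and hands each Job to a fixed-size daemon thread pool (sized by spark.streaming.concurrentJobs, default 1), so by default jobs run one at a time, in batch order. The JobHandler wrapper is what posts the JobStarted/JobCompleted events handled by JobScheduler.processEvent above. A sketch, not the verbatim source:

// Schematic of JobScheduler.submitJobSet (abridged): queue each generated job
// on the "streaming-job-executor" thread pool.
def submitJobSet(jobSet: JobSet): Unit = {
  if (jobSet.jobs.nonEmpty) {
    jobSets.put(jobSet.time, jobSet)
    jobSet.jobs.foreach(job => jobExecutor.execute(new JobHandler(job)))
  }
}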

So when is the GenerateJobs event actually sent? startFirstTime() starts a RecurringTimer that posts one GenerateJobs event per batch interval.

ssc.graph.batchDuration.milliseconds is the batch interval that was passed to the StreamingContext constructor (Seconds(2) in the demo).

private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
  longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")

RecurringTimer

Starting the thread:
def start(startTime: Long): Long = synchronized {
  nextTime = startTime
  thread.start()
  logInfo("Started timer for " + name + " at time " + nextTime)
  nextTime
}

private val thread = new Thread("RecurringTimer - " + name) {
    setDaemon(true)
    override def run() { loop }
}


 private def loop() {
    try {
      while (!stopped) {
        triggerActionForNextInterval()
      }
      triggerActionForNextInterval()
    } catch {
      case e: InterruptedException =>
    }
  }
  
  
  private def triggerActionForNextInterval(): Unit = {
    // This is where it blocks. There are two kinds of clock: SystemClock, which
    // waits with Thread.sleep, and ManualClock, which waits with object.wait()
    clock.waitTillTime(nextTime)
    // This callback is longTime => eventLoop.post(GenerateJobs(new Time(longTime))),
    // i.e. it posts the GenerateJobs event
    callback(nextTime)
    prevTime = nextTime
    nextTime += period
    logDebug("Callback for " + name + " called at time " + prevTime)
  }
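
Note that nextTime += period gives drift-free, fixed-rate ticks: each trigger time is computed from the previous target, not from when the callback actually finished. A toy standalone version of the same scheme, assuming a SystemClock-style Thread.sleep wait:

// Toy fixed-rate timer mirroring RecurringTimer's nextTime += period scheme.
// Ticks stay on the t0, t0 + period, t0 + 2*period ... grid even if a callback runs late.
def runToyTimer(period: Long, callback: Long => Unit): Thread = {
  val t = new Thread("toy-recurring-timer") {
    override def run(): Unit = {
      try {
        // Align the first tick on the next multiple of the period.
        var nextTime = (System.currentTimeMillis() / period + 1) * period
        while (!isInterrupted) {
          val sleepMs = nextTime - System.currentTimeMillis()
          if (sleepMs > 0) Thread.sleep(sleepMs) // SystemClock.waitTillTime is essentially this
          callback(nextTime)
          nextTime += period // schedule relative to the target, not to "now"
        }
      } catch {
        case _: InterruptedException => // interrupted => stop ticking
      }
    }
  }
  t.setDaemon(true)
  t.start()
  t
}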

DStreamGraph

// First, find the generateJobs method
def generateJobs(time: Time): Seq[Job] = {
  logDebug("Generating jobs for time " + time)
  val jobs = this.synchronized {
    outputStreams.flatMap { outputStream =>
      val jobOption = outputStream.generateJob(time)
      jobOption.foreach(_.setCallSite(outputStream.creationSite))
      jobOption
    }
  }
  logDebug("Generated " + jobs.length + " jobs for time " + time)
  jobs
}


// Locate outputStreams
private val outputStreams = new ArrayBuffer[DStream[_]]()


// Locate the method that adds a DStream to this list
def addOutputStream(outputStream: DStream[_]) {
    this.synchronized {
      outputStream.setGraph(this)
      outputStreams += outputStream
    }
  }

DStream

private[streaming] def register(): DStream[T] = {
  ssc.graph.addOutputStream(this)
  this
}

// Finally, the stream is added here when foreachRDD is called
private def foreachRDD(
      foreachFunc: (RDD[T], Time) => Unit,
      displayInnerRDDOps: Boolean): Unit = {
    new ForEachDStream(this,
      context.sparkContext.clean(foreachFunc, false), displayInnerRDDOps).register()
  }

The output operations documented on the official website are the DStream equivalents of actions; each of them triggers a DStream job indirectly.

All of these methods call foreachRDD, which is what indirectly triggers the action:

def print(num: Int): Unit = ssc.withScope {
  def foreachFunc: (RDD[T], Time) => Unit = {
    (rdd: RDD[T], time: Time) => {
      val firstNum = rdd.take(num + 1)
      // scalastyle:off println
      println("-------------------------------------------")
      println(s"Time: $time")
      println("-------------------------------------------")
      firstNum.take(num).foreach(println)
      if (firstNum.length > num) println("...")
      println()
      // scalastyle:on println
    }
  }
  foreachRDD(context.sparkContext.clean(foreachFunc), displayInnerRDDOps = false)
}

def saveAsObjectFiles(prefix: String, suffix: String = ""): Unit = ssc.withScope {
  val saveFunc = (rdd: RDD[T], time: Time) => {
    val file = rddToFileName(prefix, suffix, time)
    rdd.saveAsObjectFile(file)
  }
  this.foreachRDD(saveFunc, displayInnerRDDOps = false)
}
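
A user-defined output follows exactly the same shape: wrap the per-batch logic in a function and hand it to foreachRDD, which registers a ForEachDStream on the graph. For example, a hypothetical helper that writes each batch to text files, mirroring saveAsObjectFiles:

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

// Hypothetical helper: foreachRDD is the public entry point that makes this
// an output (action) of the DStream graph.
def saveBatchesAsText[T](stream: DStream[T], prefix: String): Unit = {
  stream.foreachRDD { (rdd: RDD[T], time: Time) =>
    rdd.saveAsTextFile(s"$prefix-${time.milliseconds}")
  }
}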

DStreamGraph

Back in the generateJobs method: it traverses all the DStreams that were added via foreachRDD and calls generateJob on each.

def generateJobs(time: Time): Seq[Job] = {
  logDebug("Generating jobs for time " + time)
  val jobs = this.synchronized {
    outputStreams.flatMap { outputStream =>
      val jobOption = outputStream.generateJob(time)
      jobOption.foreach(_.setCallSite(outputStream.creationSite))
      jobOption
    }
  }
  logDebug("Generated " + jobs.length + " jobs for time " + time)
  jobs
}

DStream

generateJob finally calls getOrCompute to obtain the RDD. getOrCompute itself is inherited from DStream, while every subclass (for example, the MappedDStream produced by calling map) overrides compute; getOrCompute calls compute.

private[streaming] def generateJob(time: Time): Option[Job] = {
  getOrCompute(time) match {
    case Some(rdd) =>
      val jobFunc = () => {
        val emptyFunc = { (iterator: Iterator[T]) => {} }
        // The most critical point of all: the RDD generated for this batch is
        // submitted here, which starts the ordinary RDD execution flow
        context.sparkContext.runJob(rdd, emptyFunc)
      }
      Some(new Job(time, jobFunc))
    case None => None
  }
}
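
Note that this is the default DStream.generateJob, which submits the batch's RDD with a do-nothing function purely to force its materialization. Output streams get their behavior from ForEachDStream, which overrides generateJob so that the Job runs the user's foreachFunc instead; roughly (a sketch, not the verbatim source):

// Sketch of ForEachDStream.generateJob: the Job wraps the user's foreachFunc
// for this batch instead of submitting the RDD with an empty function.
override def generateJob(time: Time): Option[Job] = {
  parent.getOrCompute(time) match {
    case Some(rdd) => Some(new Job(time, () => foreachFunc(rdd, time)))
    case None      => None
  }
}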

private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
    // If RDD was already generated, then retrieve it from HashMap,
    // or else compute the RDD
    generatedRDDs.get(time).orElse {
      // Compute the RDD if time is valid (e.g. correct time in a sliding window)
      // of RDD generation, else generate nothing.
      if (isTimeValid(time)) {

        val rddOption = createRDDWithLocalProperties(time, displayInnerRDDOps = false) {
          // Disable checks for existing output directories in jobs launched by the streaming
          // scheduler, since we may need to write output to an existing directory during checkpoint
          // recovery; see SPARK-4835 for more details. We need to have this call here because
          // compute() might cause Spark jobs to be launched.
          SparkHadoopWriterUtils.disableOutputSpecValidation.withValue(true) {
            // The key method
            compute(time)
          }
        }

        rddOption.foreach { case newRDD =>
          // Register the generated RDD for caching and checkpointing
          if (storageLevel != StorageLevel.NONE) {
            newRDD.persist(storageLevel)
            logDebug(s"Persisting RDD ${newRDD.id} for time $time to $storageLevel")
          }
          if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) {
            newRDD.checkpoint()
            logInfo(s"Marking RDD ${newRDD.id} for time $time for checkpointing")
          }
          generatedRDDs.put(time, newRDD)
        }
        rddOption
      } else {
        None
      }
    }
  }
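
One detail worth noticing in getOrCompute: generatedRDDs caches the RDD per batch time, so if several output operations depend on the same DStream, compute runs only once per batch and all of them share the resulting RDD. As for how the chain is built, map simply wraps this DStream in a MappedDStream: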

def map[U: ClassTag](mapFunc: T => U): DStream[U] = ssc.withScope {
    new MappedDStream(this, context.sparkContext.clean(mapFunc))
}

MappedDStream

compute first calls getOrCompute on the parent DStream, chaining just like RDD dependencies. Since our stream was created with KafkaUtils, the first (root) DStream in the chain is DirectKafkaInputDStream.

private[streaming]
class MappedDStream[T: ClassTag, U: ClassTag] (
    parent: DStream[T],
    mapFunc: T => U
  ) extends DStream[U](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    // Here .map(_.map[U](mapFunc)) maps over the Option; the inner call is effectively rdd.map(mapFunc)
    parent.getOrCompute(validTime).map(_.map[U](mapFunc))
  }
}
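
You rarely subclass DStream yourself, since getOrCompute is private[streaming]; the public hook with the same "derive this batch's RDD from the parent's RDD" semantics is transform. For example, building on the demo stream:

// transform gives user code the same per-batch hook that compute provides
// internally: receive the parent's RDD for this batch, return a derived RDD.
val nonEmptyValues = dstream
  .map(record => record.value)
  .transform(rdd => rdd.filter(_.nonEmpty))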

DirectKafkaInputDStream

This class does not override getOrCompute; it only overrides compute, which finally returns Some(KafkaRDD). clamp is where the demo's spark.streaming.kafka.maxRatePerPartition setting takes effect: each partition's untilOffset is capped at roughly rate × batch interval records (with rate 1 and 2-second batches, about 2 records per partition per batch).

override def compute(validTime: Time): Option[KafkaRDD[K, V]] = {
  val untilOffsets = clamp(latestOffsets())
  val offsetRanges = untilOffsets.map { case (tp, uo) =>
    val fo = currentOffsets(tp)
    OffsetRange(tp.topic, tp.partition, fo, uo)
  }
  val useConsumerCache = context.conf.getBoolean("spark.streaming.kafka.consumer.cache.enabled",
    true)
  val rdd = new KafkaRDD[K, V](context.sparkContext, executorKafkaParams, offsetRanges.toArray,
    getPreferredHosts, useConsumerCache)

  // Report the record number and metadata of this batch interval to InputInfoTracker.
  val description = offsetRanges.filter { offsetRange =>
    // Don't display empty ranges.
    offsetRange.fromOffset != offsetRange.untilOffset
  }.map { offsetRange =>
    s"topic: ${offsetRange.topic}\tpartition: ${offsetRange.partition}\t" +
      s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}"
  }.mkString("\n")
  // Copy offsetRanges to immutable.List to prevent from being modified by the user
  val metadata = Map(
    "offsets" -> offsetRanges.toList,
    StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
  val inputInfo = StreamInputInfo(id, rdd.count, metadata)
  ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)

  currentOffsets = untilOffsets
  commitAll()
  Some(rdd)
}

Local job submission

At this point the job generated for the batch has been handed to SparkContext.runJob, and from there it follows the ordinary Spark job submission path.


Origin www.cnblogs.com/chouc/p/12341925.html