Reference post: https://blog.csdn.net/klordy_123/article/details/84109501
1. Spark Streaming's checkpoint mechanism
1.1 A Spark Streaming checkpoint example
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils
object SaveDataToMysql {
  def main(args: Array[String]): Unit = {
    // Suppress noisy logs so that only the logs we care about show up in the terminal
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.OFF)

    // Initialize Spark Streaming
    val conf = new SparkConf().setAppName("SaveDataToMysql").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(10))

    // Access key and secret needed to connect to S3
    ssc.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", "your AWS access key")
    ssc.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", "your AWS secret key")
    ssc.sparkContext.hadoopConfiguration.set("fs.s3a.endpoint", "s3.cn-north-1.amazonaws.com.cn")

    // Kafka connection settings
    val zkQuorum = "192.168.1.112:2181" // ZooKeeper quorum: IP:port,IP:port,IP:port
    val group = "testgroup"             // group.id configured in consumer.properties
    val topics = "myTopic1 myTopic2"    // Producers are distinguished by topic; multiple topics are separated by spaces
    val numThreads = 2                  // Number of consumer threads
    // Several topics may feed data at the same time, so split them on spaces,
    // map each one to (topic, numThreads), and convert the result to a Map
    val topicpMap = topics.split(" ").map((_, numThreads.toInt)).toMap
    ssc.checkpoint("hdfs://10.47.85.158:9000/checkpointDir/")
    val lines: DStream[String] = KafkaUtils.createStream(ssc, zkQuorum, group, topicpMap).map(_._2) // Create the input stream
    lines.print()

    // Save to MySQL
    lines.map(_.split(",")).foreachRDD(rdd => {
      rdd.foreachPartition(partition => {
        val conn = ConnectPoolUtil.getConnection // ConnectPoolUtil is a hand-rolled connection pool; getConnection is one of its methods
        conn.setAutoCommit(false) // commit manually
        val stmt = conn.createStatement()
        partition.foreach(word => {
          stmt.addBatch("insert into test_log2(time, ip, user_id, user_type, source, scene) values('" +
            word(0) + "','" + word(1) + "','" + word(2) + "','" + word(3) + "','" + word(4) + "','" + word(5) + "')")
        })
        stmt.executeBatch()
        conn.commit()
        conn.close()
      })
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
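The string-concatenated SQL above works, but it is open to SQL injection and re-parses the statement for every row. Below is a sketch of a safer variant of the same write path (it reuses the lines stream and the author's ConnectPoolUtil from the example above) using a PreparedStatement with bound parameters:
lines.map(_.split(",")).foreachRDD(rdd => {
  rdd.foreachPartition(partition => {
    val conn = ConnectPoolUtil.getConnection
    conn.setAutoCommit(false)
    // One parameterized statement, reused for every row in the partition
    val stmt = conn.prepareStatement(
      "insert into test_log2(time, ip, user_id, user_type, source, scene) values (?, ?, ?, ?, ?, ?)")
    partition.foreach(word => {
      (0 until 6).foreach(i => stmt.setString(i + 1, word(i)))
      stmt.addBatch()
    })
    stmt.executeBatch()
    conn.commit()
    conn.close()
  })
})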
1.2 Source code analysis
1.2.1 Initialization and setting the checkpoint directory
First, look at the StreamingContext constructor used in the example above:
def this(sparkContext: SparkContext, batchDuration: Duration) = {
this(sparkContext, null, batchDuration)
}
This simply stores the SparkContext and the batch interval as member variables of the StreamingContext; no checkpoint directory is set at this point.
The example code then calls the checkpoint() method of StreamingContext:
def checkpoint(directory: String) {
if (directory != null) {
val path = new Path(directory)
//By default the directory lives on HDFS
val fs = path.getFileSystem(sparkContext.hadoopConfiguration)
fs.mkdirs(path)
val fullPath = fs.getFileStatus(path).getPath().toString
//Same as in core Spark: set the checkpointDir variable on the SparkContext
sc.setCheckpointDir(fullPath)
checkpointDir = fullPath
} else {
checkpointDir = null
}
}
1.2.2 The start() method of StreamingContext
The entry point is the start() method in StreamingContext.scala:
//StreamingContext's start method
def start(): Unit = synchronized {
state match {
//On first start-up the state defaults to INITIALIZED
case INITIALIZED =>
startSite.set(DStream.getCreationSite())
StreamingContext.ACTIVATION_LOCK.synchronized {
StreamingContext.assertNoOtherContextIsActive()
try {
validate()
//Start a thread named "streaming-start"; the {...} block that follows is effectively the body of a Java Runnable's run method. The key call is JobScheduler's start()
ThreadUtils.runInNewThread("streaming-start") {
sparkContext.setCallSite(startSite.get)
sparkContext.clearJobGroup()
sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
savedProperties.set(SerializationUtils.clone(sparkContext.localProperties.get()))
/*The important call: scheduler.start()*/
scheduler.start()
}
//Update the running state
state = StreamingContextState.ACTIVE
//Post an event
scheduler.listenerBus.post(
StreamingListenerStreamingStarted(System.currentTimeMillis()))
} catch {
...
}
//Record the active context
StreamingContext.setActiveContext(this)
}
//Some remaining setup
logDebug("Adding shutdown hook") // force eager creation of logger
shutdownHookRef = ShutdownHookManager.addShutdownHook(
StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
// Registering Streaming Metrics at the start of the StreamingContext
assert(env.metricsSystem != null)
env.metricsSystem.registerSource(streamingSource)
uiTab.foreach(_.attach())
logInfo("StreamingContext started")
case ACTIVE =>
logWarning("StreamingContext has already been started")
case STOPPED =>
throw new IllegalStateException("StreamingContext has already been stopped")
}
}
The scheduler here is a JobScheduler object, the core class responsible for job generation.
When the JobScheduler is initialized it creates a thread pool (jobExecutor) and a jobGenerator:
- jobExecutor is used to submit jobs. The number of threads in the pool is the job concurrency, controlled by "spark.streaming.concurrentJobs" (default 1).
- The JobScheduler schedules events of type JobSchedulerEvent, while the jobGenerator carries out the commands it issues (events of type JobGeneratorEvent).
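As a side note, a minimal sketch of raising that concurrency (the value 2 is only an illustration); by default the jobs of different batches run one at a time:
import org.apache.spark.SparkConf

object ConcurrentJobsConf {
  // spark.streaming.concurrentJobs controls the size of the jobExecutor thread pool (default 1)
  val conf = new SparkConf()
    .setAppName("ConcurrentJobsExample")
    .set("spark.streaming.concurrentJobs", "2")
}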
1.2.3 The start() method of JobScheduler
Now let's look at the JobScheduler's start method:
//JobScheduler's start method
def start(): Unit = synchronized {
if (eventLoop != null) return // scheduler has already been started
logDebug("Starting JobScheduler")
//Instantiate the eventLoop and start it to handle events posted to the JobScheduler
eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
//Received events (of type JobSchedulerEvent) are handled by processEvent(event)
override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)
override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
}
//When the EventLoop starts, it spawns a daemon thread that processes events from its queue
eventLoop.start()
// attach rate controllers of input streams to receive batch completion updates
for {
inputDStream <- ssc.graph.getInputStreams
rateController <- inputDStream.rateController
} ssc.addStreamingListener(rateController)
listenerBus.start()
//Manages the execution of the receivers belonging to the ReceiverInputDStreams
receiverTracker = new ReceiverTracker(ssc)
inputInfoTracker = new InputInfoTracker(ssc)
val executorAllocClient: ExecutorAllocationClient = ssc.sparkContext.schedulerBackend match {
case b: ExecutorAllocationClient => b.asInstanceOf[ExecutorAllocationClient]
case _ => null
}
executorAllocationManager = ExecutorAllocationManager.createIfEnabled(
executorAllocClient,
receiverTracker,
ssc.conf,
ssc.graph.batchDuration.milliseconds,
clock)
executorAllocationManager.foreach(ssc.addStreamingListener)
receiverTracker.start()
/*The key call*/
jobGenerator.start()
executorAllocationManager.foreach(_.start())
logInfo("Started JobScheduler")
}
1.2.4 The start() method of JobGenerator
The JobScheduler schedules jobs to run on Spark. It uses the JobGenerator to generate jobs (one batch of jobs per batch interval) and then submits them with a thread pool so they can run in parallel. In the code above, the call we care about is jobGenerator.start(), which kicks off job generation:
//JobGenerator's start method
def start(): Unit = synchronized {
if (eventLoop != null) return // generator has already been started
// Call checkpointWriter here to initialize it before eventLoop uses it to avoid a deadlock.
// See SPARK-10125
checkpointWriter
//Same pattern as in JobScheduler above: events (of type JobGeneratorEvent) are handed to processEvent(event)
eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") {
override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event)
override protected def onError(e: Throwable): Unit = {
jobScheduler.reportError("Error in job generator", e)
}
}
eventLoop.start()
//If a checkpoint already exists, a restart recovers directly from the checkpointed files.
//Caveat: if the previous shutdown was caused by a bug and we have since changed the code logic,
//restarting from the checkpoint will still run the old serialized logic. To make the new code take
//effect the checkpoint files must be deleted, but then the consumption records stored there are lost
//as well, so we cannot resume from where the last run failed and some data may still be lost.
if (ssc.isCheckpointPresent) {
restart()
} else {
/*Calls graph.start and starts the timer (which triggers GenerateJobs events)*/
startFirstTime()
}
}
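For reference, the driver-side pattern that makes ssc.isCheckpointPresent true after a restart is StreamingContext.getOrCreate. A minimal sketch (the directory, host and port are assumptions carried over from the example above):
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CheckpointRecoverySketch {
  val checkpointDir = "hdfs://10.47.85.158:9000/checkpointDir/" // same directory as in the example above

  // Only called when no checkpoint exists yet; on restart the context is rebuilt from the checkpoint
  def createContext(): StreamingContext = {
    val conf = new SparkConf().setAppName("CheckpointRecoverySketch").setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(10))
    val lines = ssc.socketTextStream("localhost", 9999) // assumed source; define the real DStream lineage here
    lines.print()
    ssc.checkpoint(checkpointDir)
    ssc
  }

  def main(args: Array[String]): Unit = {
    val ssc = StreamingContext.getOrCreate(checkpointDir, createContext _)
    ssc.start()
    ssc.awaitTermination()
  }
}
This is also why the caveat in the comments above matters: on restart the DStream graph is deserialized from the checkpoint, so changed driver code only takes effect once the checkpoint directory is cleared.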
1.2.5 The startFirstTime() method
Inside JobGenerator's start method we focus on startFirstTime():
/** Starts the generator for the first time */
private def startFirstTime() {
val startTime = new Time(timer.getStartTime())
/*Start the static DStream graph (the DAG template)*/
graph.start(startTime - graph.batchDuration)
//Start the timer, which fires events periodically in a loop
timer.start(startTime.milliseconds)
logInfo("Started JobGenerator at " + startTime)
}
The timer here is a RecurringTimer object. It periodically posts a GenerateJobs event through the eventLoop, and it is through this timer that the JobGenerator keeps producing jobs, once per batch interval. Its initialization looks like this:
private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")
(The internals of the timer's start method are not covered here; see triggerActionForNextInterval() if interested.)
What matters is that this timer keeps posting GenerateJobs events to the eventLoop with a period of ssc.graph.batchDuration.milliseconds.
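To make the pattern concrete, here is a simplified sketch of the same idea (this is not Spark's actual RecurringTimer, just a scheduled task that posts a GenerateJobs-like event once per batch interval):
import java.util.concurrent.{Executors, TimeUnit}

object RecurringTimerSketch {
  def main(args: Array[String]): Unit = {
    val batchIntervalMs = 10000L // assumed batch interval, like ssc.graph.batchDuration.milliseconds
    val timer = Executors.newSingleThreadScheduledExecutor()
    timer.scheduleAtFixedRate(new Runnable {
      // Stand-in for eventLoop.post(GenerateJobs(new Time(longTime)))
      override def run(): Unit = println(s"GenerateJobs(${System.currentTimeMillis()})")
    }, batchIntervalMs, batchIntervalMs, TimeUnit.MILLISECONDS)
  }
}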
As noted above, the eventLoop inside JobGenerator handles events of type JobGeneratorEvent, so when the message arrives it is dispatched by processEvent():
/** Processes all events */
private def processEvent(event: JobGeneratorEvent) {
logDebug("Got event " + event)
event match {
//The event type that triggers job generation
case GenerateJobs(time) => generateJobs(time)
case ClearMetadata(time) => clearMetadata(time)
case DoCheckpoint(time, clearCheckpointDataLater) =>
doCheckpoint(time, clearCheckpointDataLater)
case ClearCheckpointData(time) => clearCheckpointData(time)
}
}
Following the pattern match, let's look at generateJobs:
/** Generate jobs and perform checkpointing for the given `time`. */
private def generateJobs(time: Time) {
// Checkpoint all RDDs marked for checkpointing to ensure their lineages are
// truncated periodically. Otherwise, we may run into stack overflows (SPARK-6847).
ssc.sparkContext.setLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS, "true")
Try {
jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
graph.generateJobs(time) // generate jobs using allocated block
} match {
case Success(jobs) =>
val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
case Failure(e) =>
jobScheduler.reportError("Error generating jobs for time " + time, e)
PythonDStream.stopStreamingContextIfPythonProcessIsDead(e)
}
/*Post the DoCheckpoint event here*/
eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}
At last we reach the place where the checkpoint is actually taken. Matching DoCheckpoint in the processEvent method above leads to doCheckpoint:
/** Perform checkpoint for the given `time`. */
private def doCheckpoint(time: Time, clearCheckpointDataLater: Boolean) {
if (shouldCheckpoint && (time - graph.zeroTime).isMultipleOf(ssc.checkpointDuration)) {
logInfo("Checkpointing graph for time " + time)
ssc.graph.updateCheckpointData(time)
checkpointWriter.write(new Checkpoint(ssc, time), clearCheckpointDataLater)
} else if (clearCheckpointDataLater) {
markBatchFullyProcessed(time)
}
}
1) The value of shouldCheckpoint (a usage sketch follows after item 5):
private lazy val shouldCheckpoint = ssc.checkpointDuration != null && ssc.checkpointDir != null
2) The value of graph.zeroTime:
def start(time: Time) {
this.synchronized {
require(zeroTime == null, "DStream graph computation already started")
zeroTime = time //The start time, set on the very first start
...
}
}
3) ssc.graph.updateCheckpointData(time) (see section 1.2.7.1 for details)
def updateCheckpointData(time: Time) {
logInfo("Updating checkpoint data for time " + time)
this.synchronized {
//Call updateCheckpointData on every DStream stored in the graph's outputStreams array
outputStreams.foreach(_.updateCheckpointData(time))
}
logInfo("Updated checkpoint data for time " + time)
}
updateCheckpointData is actually recursive: each child DStream calls updateCheckpointData on its parent DStreams:
private[streaming] def updateCheckpointData(currentTime: Time) {
logDebug(s"Updating checkpoint data for time $currentTime")
checkpointData.update(currentTime)
//dependencies holds the parent DStreams; recurse into them
dependencies.foreach(_.updateCheckpointData(currentTime))
logDebug(s"Updated checkpoint data for time $currentTime: $checkpointData")
}
The update method itself:
def update(time: Time) {
// Get the checkpointed RDDs from the generated RDDs
val checkpointFiles = dstream.generatedRDDs.filter(_._2.getCheckpointFile.isDefined)
.map(x => (x._1, x._2.getCheckpointFile.get))
logDebug("Current checkpoint files:\n" + checkpointFiles.toSeq.mkString("\n"))
// Add the checkpoint files to the data to be serialized
//If not empty, clear the old files and add the new ones
if (!checkpointFiles.isEmpty) {
currentCheckpointFiles.clear()
currentCheckpointFiles ++= checkpointFiles
// Add the current checkpoint files to the map of all checkpoint files
// This will be used to delete old checkpoint files
timeToCheckpointFile ++= currentCheckpointFiles
// Remember the time of the oldest checkpoint RDD in current state
timeToOldestCheckpointFileTime(time) = currentCheckpointFiles.keys.min(Time.ordering)
}
}
4)checkpointWriter.write(new Checkpoint(ssc, time), clearCheckpointDataLater)
def write(checkpoint: Checkpoint, clearCheckpointDataLater: Boolean) {
try {
val bytes = Checkpoint.serialize(checkpoint, conf)
executor.execute(new CheckpointWriteHandler(
checkpoint.checkpointTime, bytes, clearCheckpointDataLater))
logInfo(s"Submitted checkpoint of time ${checkpoint.checkpointTime} to writer queue")
} catch {
case rej: RejectedExecutionException =>
logError("Could not submit checkpoint task to the thread pool executor", rej)
}
}
This submits a CheckpointWriteHandler task to the thread pool, which writes the checkpoint data to HDFS.
5)markBatchFullyProcessed(time)
Otherwise, if the clearCheckpointDataLater flag is true, the batch is simply marked as fully processed:
private def markBatchFullyProcessed(time: Time) {
lastProcessedBatch = time
}
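Relating this back to item 1): ssc.checkpoint(dir) is what sets ssc.checkpointDir, and the checkpoint interval of an individual stateful DStream can additionally be tuned with DStream.checkpoint(interval). A minimal sketch (source, port and intervals are assumptions; a common rule of thumb is 5-10x the batch interval):
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CheckpointIntervalSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("CheckpointIntervalSketch").setMaster("local[*]"), Seconds(10))
    ssc.checkpoint("hdfs://10.47.85.158:9000/checkpointDir/") // sets ssc.checkpointDir
    val lines = ssc.socketTextStream("localhost", 9999)
    val counts = lines.map(word => (word, 1)).updateStateByKey[Int](
      (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)))
    counts.checkpoint(Seconds(50)) // checkpoint this stateful DStream every 5 batches
    counts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}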
Extensions
1.2.6 The start() method of DStreamGraph (see section 1.2.7 for details)
def start(time: Time) {
this.synchronized {
require(zeroTime == null, "DStream graph computation already started")
zeroTime = time
startTime = time
outputStreams.foreach(_.initialize(zeroTime))
outputStreams.foreach(_.remember(rememberDuration))
outputStreams.foreach(_.validateAtStart())
numReceivers = inputStreams.count(_.isInstanceOf[ReceiverInputDStream[_]])
inputStreamNameAndID = inputStreams.map(is => (is.name, is.id))
inputStreams.par.foreach(_.start())
}
}
This configures the outputStreams and inputStreams held by the DStreamGraph. Note that a DStream is itself a template, so what is being configured here is the DStream template: mainly things like how long generated RDDs are remembered.
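A minimal sketch of influencing that remember duration from user code (the durations are assumptions): StreamingContext.remember asks Spark Streaming to keep each batch's generated RDDs around longer than strictly required.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}

object RememberDurationSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("RememberDurationSketch").setMaster("local[*]"), Seconds(10))
    ssc.remember(Minutes(5)) // keep each batch's RDDs for at least 5 minutes
  }
}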
Summary
The code above covers the main members of JobGenerator and its start method. Its core task is to start the timer: once the timer thread is running, it posts a job-generation event to the eventLoop every batch interval; handling that event generates the corresponding jobs, which are added to the jobSets inside JobScheduler and then submitted to the thread pool to await execution.
1.2.7 How the DStreamGraph is created
We have described what the DStreamGraph does; next, let's look at when it is created.
First, the graph in the earlier call graph.start(startTime - graph.batchDuration) is a DStreamGraph object, defined in JobGenerator as follows:
private val graph = ssc.graph
Its definition in StreamingContext is:
private[streaming] val graph: DStreamGraph = {
if (isCheckpointPresent) {
_cp.graph.setContext(this)
_cp.graph.restoreCheckpointData()
_cp.graph
} else {
//On the first run (no checkpoint yet)
require(_batchDur != null, "Batch duration for StreamingContext cannot be null")
//An empty graph
val newGraph = new DStreamGraph()
//Set the batch interval
newGraph.setBatchDuration(_batchDur)
newGraph
}
}
So when the StreamingContext is created, an empty DStreamGraph object is created along with it, and only its batch interval is set.
The member variables of the DStreamGraph class are:
final private[streaming] class DStreamGraph extends Serializable with Logging {
/*The core is the two arrays inputStreams and outputStreams, which make up the DStreamGraph template.
When the StreamingContext first creates the DStreamGraph they are of course empty; the map, filter,
print, updateStateByKey and other operators we write are what fill them in.
*/
private val inputStreams = new ArrayBuffer[InputDStream[_]]()
private val outputStreams = new ArrayBuffer[DStream[_]]()
@volatile private var inputStreamNameAndID: Seq[(String, Int)] = Nil
var rememberDuration: Duration = null
var checkpointInProgress = false
var zeroTime: Time = null
var startTime: Time = null
var batchDuration: Duration = null
@volatile private var numReceivers: Int = 0
def start(time: Time) {
this.synchronized {
require(zeroTime == null, "DStream graph computation already started")
zeroTime = time
startTime = time
outputStreams.foreach(_.initialize(zeroTime))
outputStreams.foreach(_.remember(rememberDuration))
outputStreams.foreach(_.validateAtStart())
numReceivers = inputStreams.count(_.isInstanceOf[ReceiverInputDStream[_]])
inputStreamNameAndID = inputStreams.map(is => (is.name, is.id))
inputStreams.par.foreach(_.start())
}
}
......
1.2.7.1 Starting from DStream: how outputStreams is populated
Let's look at one operator as an example, the map method on DStream:
def map[U: ClassTag](mapFunc: T => U): DStream[U] = ssc.withScope {
new MappedDStream(this, context.sparkContext.clean(mapFunc))
}
Taking MappedDStream as the example, we continue:
private[streaming]
class MappedDStream[T: ClassTag, U: ClassTag] (
parent: DStream[T],
mapFunc: T => U
) extends DStream[U](parent.ssc) {
//The parent of this DStream
override def dependencies: List[DStream[_]] = List(parent)
override def slideDuration: Duration = parent.slideDuration
override def compute(validTime: Time): Option[RDD[U]] = {
parent.getOrCompute(validTime).map(_.map[U](mapFunc))
}
}
The parent here is the DStream on which the MappedDStream was created, i.e. the type of rdda in rdda.map(...). In other words, every time the map operator is called, the newly created DStream records who its parent is in its dependencies. All other operators follow the same structure: whenever one DStream produces a new DStream, the relationship between them is recorded, so all the operators in our processing logic end up forming one chain, from the data source all the way to an output operator such as foreachRDD.
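A small self-contained sketch of such a chain (stream names, host and port are assumptions); each derived DStream records its parent in dependencies, and only the last line registers an output stream:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object LineageSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("LineageSketch").setMaster("local[*]"), Seconds(10))
    val source = ssc.socketTextStream("localhost", 9999) // an InputDStream, added to inputStreams
    val words  = source.flatMap(_.split(" "))            // FlatMappedDStream, dependencies = List(source)
    val pairs  = words.map(w => (w, 1))                  // MappedDStream, dependencies = List(words)
    pairs.print()                                        // ForEachDStream, registered in outputStreams
    ssc.start()
    ssc.awaitTermination()
  }
}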
Next, the action-type operators (such as print and saveAsTextFiles) all end up calling foreachRDD, which looks like this:
def foreachRDD(foreachFunc: RDD[T] => Unit): Unit = ssc.withScope {
val cleanedF = context.sparkContext.clean(foreachFunc, false)
foreachRDD((r: RDD[T], _: Time) => cleanedF(r), displayInnerRDDOps = true)
}
/**
* Apply a function to each RDD in this DStream. This is an output operator, so
* 'this' DStream will be registered as an output stream and therefore materialized.
*/
def foreachRDD(foreachFunc: (RDD[T], Time) => Unit): Unit = ssc.withScope {
// because the DStream is reachable from the outer object here, and because
// DStreams can't be serialized with closures, we can't proactively check
// it for serializability and so we pass the optional false to SparkContext.clean
foreachRDD(foreachFunc, displayInnerRDDOps = true)
}
//Both overloads above eventually call this method
private def foreachRDD(
foreachFunc: (RDD[T], Time) => Unit,
displayInnerRDDOps: Boolean): Unit = {
new ForEachDStream(this,
context.sparkContext.clean(foreachFunc, false), displayInnerRDDOps).register()
}
The register() call at the end is what matters here (everything before it just reuses SparkContext utilities). What does it do internally? Let's look:
private[streaming] def register(): DStream[T] = {
ssc.graph.addOutputStream(this)
this
}
def addOutputStream(outputStream: DStream[_]) {
this.synchronized {
outputStream.setGraph(this)
outputStreams += outputStream
}
}
Here graph.addOutputStream(this) inserts the DStream produced by foreachRDD into the outputStreams array. If you inspect the other action-type and transformation-type operators you will find the same logic; in short, whenever an action operator is called, the resulting DStream is added to the DStreamGraph as an output stream.
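This also explains a familiar failure mode: if no action-type operator is ever called, outputStreams stays empty and start() refuses to run. A minimal sketch (the exact error text may vary by version):
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NoOutputOpSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("NoOutputOpSketch").setMaster("local[*]"), Seconds(10))
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.map(_.length) // transformation only: nothing is registered in outputStreams
    // ssc.start() would fail validation here, e.g. "No output operations registered, so nothing to execute"
  }
}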
1.2.7.2 How inputStreams is populated
Classes such as DirectKafkaInputDStream, FileInputDStream and SocketInputDStream call addInputStream, the counterpart of addOutputStream. These classes are all subclasses of InputDStream, i.e. the data sources, so the moment such a class is instantiated it is automatically added to the DStreamGraph's inputStreams. For example:
abstract class InputDStream[T: ClassTag](_ssc: StreamingContext)
extends DStream[T](_ssc) {
private[streaming] var lastValidTime: Time = null
ssc.graph.addInputStream(this)
...
}
1.2.7.3 Summary
The input streams and output streams are the entry point of the data and the final output of our processing logic. The DStreamGraph records both, and as described earlier every DStream is already linked to its parents through dependencies. Knowing the beginning, the end, and the relationships in between, we have a complete graph.
2. WAL (Write-Ahead Logs) [partially incomplete]
Received data that only lives in memory is at risk of being lost: because it is still held in the Executor's memory, it disappears if the Executor fails. To avoid this kind of data loss, Spark 1.2 introduced write-ahead logs (WAL).
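A minimal sketch of enabling the receiver write-ahead log (the directory is an assumption; a checkpoint directory is required because the WAL files are written under it):
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WalConfigSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("WalConfigSketch")
      .set("spark.streaming.receiver.writeAheadLog.enable", "true")
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint("hdfs://10.47.85.158:9000/checkpointDir/") // WAL files live under this directory
  }
}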
2.1 ReceiverTracker.start()
We start from the JobScheduler.start() step of the call chain described above, StreamingContext.start() => JobScheduler.start() => JobGenerator.start():
var receiverTracker: ReceiverTracker = null
def start(): Unit = synchronized {
/*eventLoop setup*/
...
//Manages all the data Receivers running on the Executors
receiverTracker = new ReceiverTracker(ssc)
...
receiverTracker.start()
...
}
Here we have a ReceiverTracker object; let's look at its construction and its start method:
class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false) extends Logging {
/*Provides the Receivers*/
private val receiverInputStreams = ssc.graph.getReceiverInputStreams()
private val receiverInputStreamIds = receiverInputStreams.map { _.id }
/*Manages the received blocks*/
private val receivedBlockTracker = new ReceivedBlockTracker(
ssc.sparkContext.conf,
ssc.sparkContext.hadoopConfiguration,
receiverInputStreamIds,
ssc.scheduler.clock,
ssc.isCheckpointPresent,
Option(ssc.checkpointDir)
)
......
/*Decides the scheduling (distribution) policy*/
private val schedulingPolicy = new ReceiverSchedulingPolicy()
/** Start the endpoint and receiver execution thread. */
def start(): Unit = synchronized {
if (isTrackerStarted) {
throw new SparkException("ReceiverTracker already started")
}
if (!receiverInputStreams.isEmpty) {
endpoint = ssc.env.rpcEnv.setupEndpoint(
"ReceiverTracker", new ReceiverTrackerEndpoint(ssc.env.rpcEnv))
if (!skipReceiverLaunch)
launchReceivers()
logInfo("ReceiverTracker started")
trackerState = Started
}
}
...
}
2.2 The ReceivedBlockTracker object
private[streaming] class ReceivedBlockTracker(
conf: SparkConf,
hadoopConf: Configuration,
streamIds: Seq[Int],
clock: Clock,
recoverFromWriteAheadLog: Boolean,
checkpointDirOption: Option[String])
extends Logging {
...
private val writeAheadLogOption = createWriteAheadLog()
...
/** Optionally create the write ahead log manager only if the feature is enabled */
private def createWriteAheadLog(): Option[WriteAheadLog] = {
checkpointDirOption.map { checkpointDir =>
val logDir = ReceivedBlockTracker.checkpointDirToLogDir(checkpointDirOption.get)
WriteAheadLogUtils.createLogForDriver(conf, logDir, hadoopConf)
}
}
...
}
Among these members, receiverInputStreams provides the Receivers, receivedBlockTracker manages the received blocks, and schedulingPolicy decides how receivers are distributed.
receiverTracker.start() is invoked from JobScheduler's start() method:
- ReceiverTracker's start method mainly creates, on the driver side, the ReceiverTrackerEndpoint used to communicate with each Receiver.
- It then calls launchReceivers, which sends itself a StartAllReceivers message so that all Receivers get distributed to their Executors (the code is shown below).
private def launchReceivers(): Unit = {
  // receiverInputStreams was mentioned above: it comes from the DStreamGraph, and it is easy to see
  // that the number of receivers generated for one DStreamGraph equals the number of input streams.
  val receivers = receiverInputStreams.map { nis =>
    val rcvr = nis.getReceiver()
    rcvr.setReceiverId(nis.id)
    rcvr
  }
  /* Confirms that slave nodes have registered, to avoid placing all receivers on the same node.
     It runs a trivial job through Spark Core and then checks whether executors other than the driver exist. */
  runDummySparkJob()
  logInfo("Starting " + receivers.length + " receivers")
  // Send the StartAllReceivers event message to itself
  endpoint.send(StartAllReceivers(receivers))
}

private def runDummySparkJob(): Unit = {
  if (!ssc.sparkContext.isLocal) {
    // A trivial Spark computation
    ssc.sparkContext.makeRDD(1 to 50, 50).map(x => (x, 1)).reduceByKey(_ + _, 20).collect()
  }
  assert(getExecutors.nonEmpty)
}
2.3 Where the receivers come from
Receivers are obtained through the getReceiver method of the ReceiverInputDStream abstract class.
2.4 The ReceiverTrackerEndpoint receives the StartAllReceivers message
The endpoint variable is a ReceiverTrackerEndpoint object; its receive method describes how incoming messages are handled:
override def receive: PartialFunction[Any, Unit] = {
// Local messages
case StartAllReceivers(receivers) =>
//Find a suitable Executor for every Receiver
val scheduledLocations = schedulingPolicy.scheduleReceivers(receivers, getExecutors)
//Iterate over the receivers and actually distribute and start them
for (receiver <- receivers) {
val executors = scheduledLocations(receiver.streamId)
updateReceiverScheduledExecutors(receiver.streamId, executors)
receiverPreferredLocations(receiver.streamId) = receiver.preferredLocation
startReceiver(receiver, executors)
}
case RestartReceiver(receiver) =>
...
case c: CleanupOldBlocks =>
...
case UpdateReceiverRateLimit(streamUID, newRate) =>
...
case ReportError(streamId, message, error) =>
...
}
The strategy for starting Receivers is the schedulingPolicy mentioned earlier: its scheduleReceivers method finds a suitable Executor for every Receiver. Each Receiver has a preferredLocation method; when assigning an Executor, the result of preferredLocation is honoured, and if preferredLocation specifies nothing, the Receiver is assigned evenly at random across all Executors.
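A sketch of how a custom Receiver can supply such a hint (the class name and host are assumptions); the ReceiverSchedulingPolicy takes preferredLocation into account when choosing an Executor:
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

class PinnedReceiver extends Receiver[String](StorageLevel.MEMORY_AND_DISK_SER) {
  // Ask the scheduler to place this receiver on a specific host (assumed value)
  override def preferredLocation: Option[String] = Some("192.168.1.112")
  override def onStart(): Unit = {
    // start a thread here that reads data and calls store(...)
  }
  override def onStop(): Unit = {}
}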
2.5 Distributing and starting the receivers
Continuing with the startReceiver call at the end of section 2.4:
private def startReceiver(
receiver: Receiver[_],
scheduledLocations: Seq[TaskLocation]): Unit = {
def shouldStartReceiver: Boolean = {
// It's okay to start when trackerState is Initialized or Started
!(isTrackerStopping || isTrackerStopped)
}
val receiverId = receiver.streamId
//If for some reason the ReceiverTracker has stopped or is stopping, skip starting the receiver and just mark the job as finished
if (!shouldStartReceiver) {
onReceiverJobFinish(receiverId)
return
}
val checkpointDirOption = Option(ssc.checkpointDir)
//Make the Hadoop configuration serializable
val serializableHadoopConf =
new SerializableConfiguration(ssc.sparkContext.hadoopConfiguration)
// Function to start the receiver on the worker node
// Function that starts the Receiver on the worker; it is passed to the job submitted below
val startReceiverFunc: Iterator[Receiver[_]] => Unit =
(iterator: Iterator[Receiver[_]]) => {
if (!iterator.hasNext) {
throw new SparkException(
"Could not start receiver as object not found.")
}
//Check whether this is the first attempt: only the first attempt has attemptNumber == 0; each retry increments it
if (TaskContext.get().attemptNumber() == 0) {
val receiver = iterator.next()
assert(iterator.hasNext == false)
//The supervisor created on the Executor side; it is responsible for storing the data the Receiver receives
val supervisor = new ReceiverSupervisorImpl(
receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption)
supervisor.start()
supervisor.awaitTermination()
} else {
// It's restarted by TaskScheduler, but we want to reschedule it again. So exit it.
}
}
// Create the RDD using the scheduledLocations to run the receiver in a Spark job
// Create a single-element RDD whose only purpose is to ship the receiver to the chosen location(s)
val receiverRDD: RDD[Receiver[_]] =
if (scheduledLocations.isEmpty) {
ssc.sc.makeRDD(Seq(receiver), 1)
} else {
val preferredLocations = scheduledLocations.map(_.toString).distinct
ssc.sc.makeRDD(Seq(receiver -> preferredLocations))
}
receiverRDD.setName(s"Receiver $receiverId")
ssc.sparkContext.setJobDescription(s"Streaming job running receiver $receiverId")
ssc.sparkContext.setCallSite(Option(ssc.getStartSite()).getOrElse(Utils.getCallSite()))
//Submit the job for this RDD through Spark Core's sparkContext. When the task runs it starts a ReceiverSupervisorImpl as the supervisor,
//and the supervisor then sets up the Receiver it manages so that it can start receiving data.
val future = ssc.sparkContext.submitJob[Receiver[_], Unit, Unit](
receiverRDD, startReceiverFunc, Seq(0), (_, _) => Unit, ())
// We will keep restarting the receiver job until ReceiverTracker is stopped
future.onComplete {
case Success(_) =>
if (!shouldStartReceiver) {
onReceiverJobFinish(receiverId)
} else {
logInfo(s"Restarting Receiver $receiverId")
self.send(RestartReceiver(receiver))
}
case Failure(e) =>
if (!shouldStartReceiver) {
onReceiverJobFinish(receiverId)
} else {
logError("Receiver has been stopped. Try to restart it.", e)
logInfo(s"Restarting Receiver $receiverId")
self.send(RestartReceiver(receiver))
}
}(ThreadUtils.sameThread)
logInfo(s"Receiver ${receiver.streamId} started")
}
In other words, the actual start-up of each Receiver relies on the ReceiverSupervisorImpl.