Flink 任务调度源码分析2 （JobGraph 构建和提交源码解析）

JobGraph：StreamGraph 经过优化后生成了 JobGraph，提交给 JobManager 的数据结构
它包含的主要抽象概念有
1、JobVertex：经过优化后符合条件的多个 StreamNode 可能会 chain 在一起生成一个
JobVertex，即一个JobVertex 包含一个或多个 operator，JobVertex 的输入是 JobEdge，输出是
IntermediateDataSet。
2、IntermediateDataSet：表示 JobVertex 的输出，即经过 operator 处理产生的数据集。
producer 是JobVertex，consumer 是 JobEdge。
3、JobEdge：代表了job graph中的一条数据传输通道。source 是 IntermediateDataSet，target
是 JobVertex。即数据通过JobEdge由IntermediateDataSet传递给目标JobVertex。

StreamGraph 转变成 JobGraph 也是在 Client 完成，主要作了三件事：
⚫ StreamNode 转成 JobVertex。
⚫ StreamEdge 转成 JobEdge。
⚫ JobEdge 和 JobVertex 之间创建 IntermediateDataSet 来连接

入口：AbstractJobClusterExecutor.execute
final JobGraph jobGraph = PipelineExecutorUtils.getJobGraph(pipeline, configuration);
-> getJobGraph -> pipelineTranslator.translateToJobGraph -> StreamGraphTranslator.translateToJobGraph
 -> streamGraph.getJobGraph ->  StreamingJobGraphGenerator.createJobGraph
   -> new StreamingJobGraphGenerator(streamGraph, jobID).createJobGraph();
	/**
	 * Populates and returns the {@code jobGraph} from the {@code streamGraph} held by this generator.
	 *
	 * <p>The steps run in a fixed order: pre-validation, scheduling and recovery flags, node hash
	 * generation, chaining, physical edges, slot sharing / co-location, managed memory fractions,
	 * checkpointing, savepoint restore settings, user artifacts, and finally the ExecutionConfig.
	 *
	 * @return the populated job graph
	 * @throws IllegalConfigurationException if the ExecutionConfig cannot be serialized; the
	 *         underlying {@link IOException} is attached as the cause
	 */
	private JobGraph createJobGraph() {
		preValidate();

		// make sure that all vertices start immediately
		// In streaming mode all vertices are started together (the "eager" schedule mode).
		jobGraph.setScheduleMode(streamGraph.getScheduleMode());
		jobGraph.enableApproximateLocalRecovery(streamGraph.getCheckpointConfig().isApproximateLocalRecoveryEnabled());

		// Generate deterministic hashes for the nodes in order to identify them across
		// submission iff they didn't change.
		// The StreamGraph is traversed and a hash id is produced for every StreamNode, so that
		// an unchanged topology yields the same hashes on every submission.
		Map<Integer, byte[]> hashes = defaultStreamGraphHasher.traverseStreamGraphAndGenerateHashes(streamGraph);

		// Generate legacy version hashes for backwards compatibility
		List<Map<Integer, byte[]>> legacyHashes = new ArrayList<>(legacyStreamGraphHashers.size());
		for (StreamGraphHasher hasher : legacyStreamGraphHashers) {
			legacyHashes.add(hasher.traverseStreamGraphAndGenerateHashes(streamGraph));
		}

		// The most important step: creates the JobVertex / JobEdge objects and chains as many
		// nodes together as possible.
		setChaining(hashes, legacyHashes);

		// Serializes the in-edges of every JobVertex into its StreamConfig
		// (the out-edges were already written during setChaining).
		setPhysicalEdges();

		// Assigns every JobVertex to its SlotSharingGroup based on the group name, and sets the
		// CoLocationGroup for iteration heads and tails.
		setSlotSharingAndCoLocation();

		setManagedMemoryFraction(
			Collections.unmodifiableMap(jobVertices),
			Collections.unmodifiableMap(vertexConfigs),
			Collections.unmodifiableMap(chainedConfigs),
			id -> streamGraph.getStreamNode(id).getManagedMemoryOperatorScopeUseCaseWeights(),
			id -> streamGraph.getStreamNode(id).getManagedMemorySlotScopeUseCases());

		// Configure checkpointing.
		configureCheckpointing();

		jobGraph.setSavepointRestoreSettings(streamGraph.getSavepointRestoreSettings());

		JobGraphUtils.addUserArtifactEntries(streamGraph.getUserArtifacts(), jobGraph);

		// set the ExecutionConfig last when it has been finalized
		try {
			// Serializes the StreamGraph's ExecutionConfig into the JobGraph configuration.
			jobGraph.setExecutionConfig(streamGraph.getExecutionConfig());
		}
		catch (IOException e) {
			// Keep the original IOException as the cause so the failing type can be diagnosed.
			throw new IllegalConfigurationException("Could not serialize the ExecutionConfig. " +
					"This indicates that non-serializable types (like custom serializers) were registered", e);
		}

		return jobGraph;
	}
  -> setChaining() 
		/*TODO Build the node chains, starting from the sources*/
		for (OperatorChainInfo info : initialEntryPoints) {
    
    
			/*TODO Builds the node chain and returns the physical out-edges of the current node; when startNodeId != currentNodeId, the current node is a child node inside the chain*/
			createChain(
					info.getStartNodeId(),
					1,  // operators start at position 1 because 0 is for chained source inputs
					info,
					chainEntryPoints);
		}
  --> createChain
  	/**
	 * Recursively builds the operator chain that {@code currentNodeId} belongs to.
	 *
	 * <p>The out-edges of the current node are split into chainable and non-chainable ones.
	 * Chainable targets are visited with the same {@code chainInfo} and the next chain index;
	 * non-chainable targets start a chain of their own at index 1. Afterwards the node's
	 * {@link StreamConfig} is filled in: the chain head gets a JobVertex and is connected to all
	 * edges leaving the chain, while every other node is stored in the head's chained configs.
	 *
	 * @param currentNodeId id of the node being processed
	 * @param chainIndex position of the node inside its chain
	 * @param chainInfo bookkeeping for the chain being built; supplies the chain's start node id
	 * @param chainEntryPoints chain info per chain start node id, extended for new chains
	 * @return the edges leaving the chain as seen from this node, or an empty list when
	 *         {@code builtVertices} already contains the chain's start node
	 */
	private List<StreamEdge> createChain(
			final Integer currentNodeId,
			final int chainIndex,
			final OperatorChainInfo chainInfo,
			final Map<Integer, OperatorChainInfo> chainEntryPoints) {

		final Integer startNodeId = chainInfo.getStartNodeId();

		// Nothing to do when the vertex for this chain head is already registered as built.
		if (builtVertices.contains(startNodeId)) {
			return new ArrayList<>();
		}

		final StreamNode currentNode = streamGraph.getStreamNode(currentNodeId);

		// Split the out-edges of the current node into chainable and non-chainable ones.
		final List<StreamEdge> chainableOutputs = new ArrayList<>();
		final List<StreamEdge> nonChainableOutputs = new ArrayList<>();
		for (StreamEdge outEdge : currentNode.getOutEdges()) {
			final List<StreamEdge> bucket =
					isChainable(outEdge, streamGraph) ? chainableOutputs : nonChainableOutputs;
			bucket.add(outEdge);
		}

		// Intermediate collection of the edges that leave the chain; the final JobEdges are
		// created from it. Edges inside the chain are not part of it.
		final List<StreamEdge> transitiveOutEdges = new ArrayList<>();

		// Chainable targets stay in the current chain and occupy the next position.
		for (StreamEdge chainable : chainableOutputs) {
			final List<StreamEdge> downstreamEdges =
					createChain(chainable.getTargetId(), chainIndex + 1, chainInfo, chainEntryPoints);
			transitiveOutEdges.addAll(downstreamEdges);
		}

		// Non-chainable targets leave the chain: record the edge and recurse into a separate chain.
		for (StreamEdge nonChainable : nonChainableOutputs) {
			transitiveOutEdges.add(nonChainable);

			final Integer targetId = nonChainable.getTargetId();
			final OperatorChainInfo targetChain =
					chainEntryPoints.computeIfAbsent(targetId, k -> chainInfo.newChain(targetId));
			createChain(
					targetId,
					1, // operators start at position 1 because 0 is for chained source inputs
					targetChain,
					chainEntryPoints);
		}

		// Display name of the current node, e.g. "Keyed Aggregation -> Sink: Unnamed".
		chainedNames.put(
				currentNodeId,
				createChainedName(
						currentNodeId,
						chainableOutputs,
						Optional.ofNullable(chainEntryPoints.get(currentNodeId))));
		chainedMinResources.put(currentNodeId, createChainedMinResources(currentNodeId, chainableOutputs));
		chainedPreferredResources.put(currentNodeId, createChainedPreferredResources(currentNodeId, chainableOutputs));

		final OperatorID currentOperatorId =
				chainInfo.addNodeToChain(currentNodeId, chainedNames.get(currentNodeId));

		// Input / output formats are registered in the format container of the chain head.
		if (currentNode.getInputFormat() != null) {
			getOrCreateFormatContainer(startNodeId).addInputFormat(currentOperatorId, currentNode.getInputFormat());
		}
		if (currentNode.getOutputFormat() != null) {
			getOrCreateFormatContainer(startNodeId).addOutputFormat(currentOperatorId, currentNode.getOutputFormat());
		}

		final boolean isChainHead = currentNodeId.equals(startNodeId);

		// The chain head creates the JobVertex and uses the StreamConfig it returns;
		// any other node starts with an empty StreamConfig.
		final StreamConfig config;
		if (isChainHead) {
			config = createJobVertex(startNodeId, chainInfo);
		} else {
			config = new StreamConfig(new Configuration());
		}

		// Writes the settings of the StreamNode into the StreamConfig.
		setVertexConfig(currentNodeId, config, chainableOutputs, nonChainableOutputs, chainInfo.getChainedSources());

		if (isChainHead) {
			// The head of a chain is marked as chain start; a node that is not chained with
			// anything is its own head and is marked as well.
			config.setChainStart();
			config.setChainIndex(chainIndex);
			config.setOperatorName(streamGraph.getStreamNode(currentNodeId).getOperatorName());

			// Connect the head of the chain to every edge leaving the chain: each StreamEdge
			// yields a JobEdge plus the IntermediateDataSet linking JobVertex and JobEdge.
			for (StreamEdge edge : transitiveOutEdges) {
				connect(startNodeId, edge);
			}

			// The physical out-edges are stored in the config; they are needed at deployment.
			config.setOutEdgesInOrder(transitiveOutEdges);
			// The StreamConfigs of all chained child nodes go into the head's chained task configs.
			config.setTransitiveChainedTaskConfigs(chainedConfigs.get(startNodeId));
		} else {
			// Child node inside a chain: its config is collected under the chain head.
			final Map<Integer, StreamConfig> chainMembers =
					chainedConfigs.computeIfAbsent(startNodeId, k -> new HashMap<>());

			config.setChainIndex(chainIndex);
			final StreamNode node = streamGraph.getStreamNode(currentNodeId);
			config.setOperatorName(node.getOperatorName());
			chainMembers.put(currentNodeId, config);
		}

		config.setOperatorID(currentOperatorId);

		// A node without chainable out-edges terminates its chain.
		if (chainableOutputs.isEmpty()) {
			config.setChainEnd();
		}

		// Hand the edges that leave the chain back to the caller.
		return transitiveOutEdges;
	}

	--> isChainable() 可以chain条件 
	// 1. The downstream node has an in-degree of 1 (it receives no input from any other node)
	downStreamVertex.getInEdges().size() == 1;
	// 2. The upstream and downstream nodes are in the same slot sharing group
	upStreamVertex.isSameSlotSharingGroup(downStreamVertex);
	// 3. Neither the upstream nor the downstream operator is null
	!(downStreamOperator == null || upStreamOperator == null);
	// 4. The upstream chaining strategy is not NEVER, i.e. it is ALWAYS or HEAD
	//    (HEAD can only chain with downstream, not with upstream; sources default to HEAD)
	upStreamOperator.getChainingStrategy() != ChainingStrategy.NEVER;
	// 5. The downstream chaining strategy is ALWAYS
	//    (can chain with both sides; map, flatMap, filter etc. default to ALWAYS)
	downStreamOperator.getChainingStrategy() == ChainingStrategy.ALWAYS;
	// 6. The partitioner between the two nodes is a ForwardPartitioner
	(edge.getPartitioner() instanceof ForwardPartitioner);
	// 7. The shuffle mode between the two operators is not BATCH
	edge.getShuffleMode() != ShuffleMode.BATCH;
	// 8. The upstream and downstream parallelism are equal
	upStreamVertex.getParallelism() == downStreamVertex.getParallelism();
	// 9. The user has not disabled chaining
	streamGraph.isChainingEnabled();

总结
每个 JobVertex 都会对应一个可序列化的 StreamConfig, 用来发送给 JobManager 和
TaskManager。最后在 TaskManager 中起 Task 时,需要从这里面反序列化出所需要的配置信
息, 其中就包括了含有用户代码的 StreamOperator。
setChaining 会对 source 调用 createChain 方法，该方法会递归调用下游节点，从而构建
出 node chains。 createChain 会分析当前节点的出边，根据 Operator Chains 中的 chainable 条
件，将出边分成 chainable 和 nonChainable 两类，并分别递归调用自身方法。之后会将
StreamNode 中的配置信息序列化到 StreamConfig 中。如果当前不是 chain 中的子节点，则会
构建 JobVertex 和 JobEdge 相连。如果是 chain 中的子节点，则会将 StreamConfig 添加到该
chain 的 config 集合中。一个 node chains，除了 headOfChain node 会生成对应的 JobVertex，
其余的 nodes 都是以序列化的形式写入到 StreamConfig 中，并保存到 headOfChain 的
CHAINED_TASK_CONFIG 配置项中。直到部署时，才会取出并生成对应的 ChainOperators。

Flink 任务调度源码分析2 （JobGraph 构建和提交源码解析）

猜你喜欢