1.1 wordcount的案例代码追溯
1.1.1 wordcount案例
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.examples.java.wordcount;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.core.fs.FileSystem.WriteMode;
import org.apache.flink.examples.java.wordcount.util.WordCountData;
import org.apache.flink.util.Collector;

import java.util.Locale;
import java.util.regex.Pattern;
/**
 * WordCount example that counts words using a custom POJO ({@link Word}) instead of
 * the {@code Tuple2} type.
 *
 * <p>The input is the text file named by the {@code --input} parameter, or the default
 * data set from {@link WordCountData} when that parameter is absent. The result is
 * written to the path named by {@code --output}, or printed to stdout otherwise.
 */
@SuppressWarnings("serial")
public class WordCountPojo {

    /**
     * This is the POJO (Plain Old Java Object) that is being used
     * for all the operations.
     * As long as all fields are public or have a getter/setter, the system can handle them.
     */
    public static class Word {

        // fields
        private String word;
        private int frequency;

        // constructors

        /** No-argument constructor; leaves both fields at their Java defaults. */
        public Word() {}

        /**
         * Creates a word with an initial count.
         *
         * @param word the token text
         * @param frequency the number of occurrences this instance represents
         */
        public Word(String word, int frequency) {
            this.word = word;
            this.frequency = frequency;
        }

        // getters setters

        public String getWord() {
            return word;
        }

        public void setWord(String word) {
            this.word = word;
        }

        public int getFrequency() {
            return frequency;
        }

        public void setFrequency(int frequency) {
            this.frequency = frequency;
        }

        @Override
        public String toString() {
            return "Word=" + word + " freq=" + frequency;
        }
    }

    /**
     * Builds and runs the word-count program.
     *
     * @param args command-line arguments; {@code --input} and {@code --output} are the
     *     two keys this method looks up
     * @throws Exception declared because the Flink calls used here may throw
     */
    public static void main(String[] args) throws Exception {

        final ParameterTool params = ParameterTool.fromArgs(args);

        // set up the execution environment
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // make parameters available in the web interface
        env.getConfig().setGlobalJobParameters(params);

        // get input data
        DataSet<String> text;
        if (params.has("input")) {
            // read the text file from given input path
            text = env.readTextFile(params.get("input"));
        } else {
            // get default test text data
            System.out.println("Executing WordCount example with default input data set.");
            System.out.println("Use --input to specify file input.");
            text = WordCountData.getDefaultTextLineDataSet(env);
        }

        DataSet<Word> counts =
            // split up the lines into Word objects (with frequency = 1)
            text.flatMap(new Tokenizer())
                // group by the field word and sum up the frequency
                .groupBy("word")
                .reduce(new ReduceFunction<Word>() {
                    @Override
                    public Word reduce(Word value1, Word value2) throws Exception {
                        // both inputs belong to the same group, so either one's word can be reused
                        return new Word(
                            value1.getWord(),
                            value1.getFrequency() + value2.getFrequency());
                    }
                });

        if (params.has("output")) {
            counts.writeAsText(params.get("output"), WriteMode.OVERWRITE);
            // execute program
            env.execute("WordCount-Pojo Example");
        } else {
            // no explicit env.execute() call is made in this branch
            System.out.println("Printing result to stdout. Use --output to specify output path.");
            counts.print();
        }
    }

    // *************************************************************************
    //     USER FUNCTIONS
    // *************************************************************************

    /**
     * Implements the string tokenizer that splits sentences into words as a user-defined
     * FlatMapFunction. The function takes a line (String) and splits it into
     * multiple Word objects.
     */
    public static final class Tokenizer implements FlatMapFunction<String, Word> {

        /** Separator between tokens: one or more non-word characters. Compiled once, not per record. */
        private static final Pattern NON_WORD = Pattern.compile("\\W+");

        /**
         * Emits one {@link Word} with frequency 1 for every non-empty token of the line.
         *
         * @param value the input line
         * @param out collector receiving the emitted words
         */
        @Override
        public void flatMap(String value, Collector<Word> out) {
            // normalize and split the line; Locale.ROOT keeps the lower-casing independent of
            // the JVM default locale (e.g. a Turkish locale maps 'I' to a dotless i, which
            // "\\W+" would then treat as a separator)
            String[] tokens = NON_WORD.split(value.toLowerCase(Locale.ROOT));

            // emit the words
            for (String token : tokens) {
                // a line that starts with a separator yields an empty first token
                if (token.length() > 0) {
                    out.collect(new Word(token, 1));
                }
            }
        }
    }
}
final ParameterTool params = ParameterTool.fromArgs(args); 支持以 - 或者 -- 开头的参数,解析成 key-value 形式(内部以 Map 保存)
1.1.2 job执行的命令
./flink run -m yarn-cluster -d -yqu flinkqu -yst -yn 4 -ys 2 -c flinkdemoclass flinkdemo.jar args1 args2 ...
## Session模式:
## 先启动session:./bin/yarn-session.sh
## 后提交job:./bin/flink run ./path/to/job.jar
detached模式:上面job模式的-d代表detached,这种情况下flink yarn client将会只提交任务到集群然后关闭自己。这样就不能从 env.execute() 中获得 accumulator results 或 exceptions。而在session模式下使用detached时,则无法使用flink停止yarn session,需用yarn工具来停止:yarn application -kill <applicationId>
main -> cliFrontend.parseParameters -> run() -> runProgram ->{
//获取yarnClusterDescriptor:customCommandLine.createClusterDescriptor()
// Body of runProgram: obtains a ClusterClient for the program, runs the program through it,
// and cleans up. The names program, configuration, defaultParallelism, customCommandLine,
// commandLine, runOptions, clusterDescriptor, LOG and MAX_SLOTS_UNKNOWN are declared
// outside this excerpt.
try {
// a null cluster id is treated below as "no existing cluster to attach to"
final T clusterId = customCommandLine.getClusterId(commandLine);
final ClusterClient<T> client;
// directly deploy the job if the cluster is started in job mode and detached
if (clusterId == null && runOptions.getDetachedMode()) {
// -1 from the run options means "not set"; fall back to defaultParallelism
int parallelism = runOptions.getParallelism() == -1 ? defaultParallelism : runOptions.getParallelism();
final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(program, configuration, parallelism);
final ClusterSpecification clusterSpecification = customCommandLine.getClusterSpecification(commandLine);
client = clusterDescriptor.deployJobCluster(
clusterSpecification,
jobGraph,
runOptions.getDetachedMode());
logAndSysout("Job has been submitted with JobID " + jobGraph.getJobID());
// the client is shut down right after submission; a failure here is only logged
try {
client.shutdown();
} catch (Exception e) {
LOG.info("Could not properly shut down the client.", e);
}
} else {
// assigned exactly once on every path below; null means no hook was registered
final Thread shutdownHook;
if (clusterId != null) {
// a cluster id was given: obtain a client for that cluster, no shutdown hook
client = clusterDescriptor.retrieve(clusterId);
shutdownHook = null;
} else {
// also in job mode we have to deploy a session cluster because the job
// might consist of multiple parts (e.g. when using collect)
final ClusterSpecification clusterSpecification = customCommandLine.getClusterSpecification(commandLine);
client = clusterDescriptor.deploySessionCluster(clusterSpecification);
// if not running in detached mode, add a shutdown hook to shut down cluster if client exits
// there's a race-condition here if cli is killed before shutdown hook is installed
if (!runOptions.getDetachedMode() && runOptions.isShutdownOnAttachedExit()) {
shutdownHook = ShutdownHookUtil.addShutdownHook(client::shutDownCluster, client.getClass().getSimpleName(), LOG);
} else {
shutdownHook = null;
}
}
try {
client.setPrintStatusDuringExecution(runOptions.getStdoutLogging());
client.setDetached(runOptions.getDetachedMode());
LOG.debug("Client slots is set to {}", client.getMaxSlots());
LOG.debug("{}", runOptions.getSavepointRestoreSettings());
int userParallelism = runOptions.getParallelism();
LOG.debug("User parallelism is set to {}", userParallelism);
// parallelism resolution: if the client reports a known slot count and the user left
// parallelism at -1, use the slot count; otherwise, if the user value equals
// ExecutionConfig.PARALLELISM_DEFAULT, use defaultParallelism
if (client.getMaxSlots() != MAX_SLOTS_UNKNOWN && userParallelism == -1) {
logAndSysout("Using the parallelism provided by the remote cluster ("
+ client.getMaxSlots() + "). "
+ "To use another parallelism, set it at the ./bin/flink client.");
userParallelism = client.getMaxSlots();
} else if (ExecutionConfig.PARALLELISM_DEFAULT == userParallelism) {
userParallelism = defaultParallelism;
}
executeProgram(program, client, userParallelism);
} finally {
// runs whether or not executeProgram threw
if (clusterId == null && !client.isDetached()) {
// terminate the cluster only if we have started it before and if it's not detached
try {
client.shutDownCluster();
} catch (final Exception e) {
LOG.info("Could not properly terminate the Flink cluster.", e);
}
if (shutdownHook != null) {
// we do not need the hook anymore as we have just tried to shutdown the cluster.
ShutdownHookUtil.removeShutdownHook(shutdownHook, client.getClass().getSimpleName(), LOG);
}
}
// the client itself is always shut down on this path; a failure is only logged
try {
client.shutdown();
} catch (Exception e) {
LOG.info("Could not properly shut down the client.", e);
}
}
}
} finally {
// the cluster descriptor is closed on every path; a failure is only logged
try {
clusterDescriptor.close();
} catch (Exception e) {
LOG.info("Could not properly close the cluster descriptor.", e);
}
}
}
我们将上面的类简化下:
main -> cliFrontend.parseParameters -> run() -> runProgram -> {
获取yarnClusterDescriptor:customCommandLine.createClusterDescriptor()
if (clusterId == null && runOptions.getDetachedMode()) { // job + DetachedMode模式
// 从jar包中获取jobGraph
// 新建一个RestClusterClient:AbstractYarnClusterDescriptor.deployJobCluster(clusterSpecification, jobGraph, detached); -> {
在yarn集群中启动应用:deployInternal -> yarnClusterDescriptor.startAppMaster -> yarnClient.submitApplication() // flink便在yarn集群中启动 ClusterEntrypoint,这个类的介绍看下面。
}
}
else {
if (clusterId != null) { // session模式
clusterDescriptor.retrieve(clusterId);
}
else { // job + non-DetachedMode模式
针对非DetachedMode的job模式,由于 job might consist of multiple parts (e.g. when using collect),这里调用 clusterDescriptor.deploySessionCluster(clusterSpecification)。同样是新建一个RestClusterClient,只是不需要jobGraph和DetachedMode
}
}
executeProgram -> 这里是父类ClusterClient的run方法 -> {
if 非交互模式 {
RestClusterClient的run方法,这里阻塞,直到执行完成 -> {
getOptimizedPlan // 获取 optPlan 用于转化为JobGraph,后续图结构分析
run // 这里接下面的restClusterClient.run()
}
}
// 下面伪代码可忽略
else 交互模式 {
prog.invokeInteractiveModeForExecution()真正进入用户的flink代码 -> env.execute() 这里的env以及后面的指代StreamContextEnvironment -> {
获取streamGraph:this.getStreamGraph();
if DetachedMode模式,则setDetachedPlan(streamGraph)
else 执行ContextEnvironment.getClient().run()
}
if detached mode {
((DetachedEnvironment) factory.getLastEnvCreated()).finalizeExecute()这里调用DetachedEnvironment的finalizeExecute,里面调用RestClusterClient的run方法
}
else { // blocking mode
return this.lastJobExecutionResult;
}
}
}
}
Dispatcher 的启动
Dispatcher.start() -> leaderElectionService.start()(这里是 ZooKeeperLeaderElectionService 的 start 方法)
Dispatcher 接收 client 的 submitjob
RedirectHandler.channelRead0(),一个netty对象-> AbstractHandler.respondAsLeader() -> respondToRequest -> JobSubmitHandler.handleRequest() -> gateway.submitJob(),即 Dispatcher 的方法 -> persistAndRunJob() -> runJob -> createJobManagerRunner(jobGraph){
jobManagerRunnerFactory.createJobManagerRunner -> {
创建DefaultJobMasterServiceFactory
登记libraryCacheManager.registerJob
获取(尚未start)haServices.getJobManagerLeaderElectionService
启动jobMasterFactory.createJobMasterService{
实例化JobMaster{ JobMaster.createAndRestoreExecutionGraph },JM负责一个jobGraph的执行
}
}
dispatcher.startJobManagerRunner -> {
jobManagerRunner.start() -> ZooKeeperLeaderElectionService.start -> isLeader -> JobManagerRunner.grantLeadership -> verifyJobSchedulingStatusAndStartJobManager -> startJobMaster -> JobMaster.start -> startJobExecution -> {
startJobMasterServices:包括slotPool和scheduler的启动,告知flinkresourceManager leader的地址,当FlinkRM和JM建立好连接后,slotPool就可以开始requesting slots
// 执行job
resetAndScheduleExecutionGraph -> {
createAndRestoreExecutionGraph -> scheduleExecutionGraph -> executionGraph.scheduleForExecution() -> scheduleEager -> {
给 Execution 分配 slots: ExecutionJobVertex.allocateResourcesForAll -> Execution.allocateAndAssignSlotForExecution -> ProviderAndOwner.allocateSlot -> SlotPool.allocateSlot -> {
if task.getSlotSharingGroupId() == null {
return allocateSingleSlot() -> return SingleLogicalSlot
} else {
return allocateSharedSlot() -> {
if (task.getCoLocationConstraint() != null) {
return allocateCoLocatedMultiTaskSlot()
}
else {
return allocateMultiTaskSlot() -> {
有已经处理完的(被分配后完成job执行的) slot:multiTaskSlotLocality
if multiTaskSlotLocality != null && slot 是本地的 {
return multiTaskSlotLocality
}
available slots: polledSlotAndLocality
if polledSlotAndLocality != null && (polledSlotAndLocality.getLocality() == Locality.LOCAL || multiTaskSlotLocality == null) {
allocatedSlot.tryAssignPayload(multiTaskSlot) // 尝试,成功就返回,失败就往下走
}
if multiTaskSlotLocality != null {
return multiTaskSlotLocality
}
if (allowQueuedScheduling) { // 如果允许排队等候
检查所有未处理完的 slot 是否可用
如果没有,向 RM 申请: requestNewAllocatedSlot -> requestSlotFromResourceManager -> resourceManagerGateway.requestSlot -> slotManager.registerSlotRequest() -> internalRequestSlot() -> allocateResource -> resourceActions.allocateResource(),ResourceActionsImpl的 -> YarnResourceManager.startNewWorker -> requestYarnContainer(),即申请 TM
}
else {
不限 locality,只要 SlotSharingManager 有 slot 就返回它
}
}
}
}
}
}
遍历 execution,调用其 deploy 方法 -> {
检查被分配的 Target slot (TaskManager) 是否还存活等一些检查
vertex.createDeploymentDescriptor()
slot.getTaskManagerGateway(); // 实际上是 RpcTaskManagerGateway -> taskExecutorGateway
taskManagerGateway.submitTask(); -> taskExecutorGateway.submitTask() -> {
检查 JM连接、JM id、slots
通过 BlobServer 下载 用户jar文件
new Task
task.startTaskThread(); // 至此,任务真正执行
}
}
}
}
}
}
笔名:凌萧子