3.Yarn应用_API

一.写一个简单的Yarn应用

1.Yarn Client

// The client obtains the application id from the RM over RPC.
// Build the configuration first, then initialize and start the YarnClient with it.
Configuration conf = new Configuration();
YarnClient yarnClient = YarnClient.createYarnClient();
yarnClient.init(conf);
yarnClient.start();

// Ask the YarnClient for a new application.
YarnClientApplication app = yarnClient.createApplication();
// The application id is read from the submission context of the new application.
ApplicationSubmissionContext appContext = app.getApplicationSubmissionContext();
ApplicationId appId = appContext.getApplicationId();
// appResponse carries information about the cluster, e.g. its minimum/maximum resource values.
GetNewApplicationResponse appResponse = app.getNewApplicationResponse();


/** 1.使用Client的一大难点就是设置ApplicationSubmissionContext对象。
 *      该对象包含了RM启动AM需要的所有信息。包括：app id,app name,queue,priority,user info。
 *  2.ContainerLaunchContext对象包含用来运行AM的容器信息。
 *      包括：本机资源(如:jar,输入文件等),环境设置(如:ClassPath等),待执行的命令和安全Tokens。
 */
// Set basic application information such as its name.
// appContext is the submission context already obtained from
// app.getApplicationSubmissionContext() earlier in this flow; declaring it a
// second time in the same scope would not compile, so it is simply reused here.
appContext.setKeepContainersAcrossApplicationAttempts(keepContainers);
appContext.setApplicationName(appName);

// Local resources and log configuration for the AM container.
FileSystem fs = FileSystem.get(conf);
Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();

// The AM jar is always registered as a local resource.
addToLocalResources(fs, appMasterJar, appMasterJarPath, appId.toString(), localResources, null);

// The log4j properties file is registered only when one was supplied.
if (!log4jPropFile.isEmpty()) {
  addToLocalResources(fs, log4jPropFile, log4jPath, appId.toString(), localResources, null);
}

// The shell script has to be available on the containers, so a copy is placed on
// the file system where the YARN framework can see it. To be reachable from the
// task containers it should be set up as a YARN cluster resource. It is not
// registered as a local resource of the AM because the AM does not need it.
String hdfsShellScriptLocation = "";
long hdfsShellScriptLen = 0;
long hdfsShellScriptTimestamp = 0;
if (!shellScriptPath.isEmpty()) {
  Path localScript = new Path(shellScriptPath);
  // Destination: <home dir>/<appName>/<appId>/<SCRIPT_PATH>
  Path stagedScript = new Path(fs.getHomeDirectory(),
      appName + "/" + appId.toString() + "/" + SCRIPT_PATH);
  fs.copyFromLocalFile(false, true, localScript, stagedScript);
  hdfsShellScriptLocation = stagedScript.toUri().toString();
  // Length and modification time are read back from the staged copy.
  FileStatus stagedStatus = fs.getFileStatus(stagedScript);
  hdfsShellScriptLen = stagedStatus.getLen();
  hdfsShellScriptTimestamp = stagedStatus.getModificationTime();
}

// The shell command, when given, is passed as content rather than as a source file.
if (!shellCommand.isEmpty()) {
  addToLocalResources(fs, null, shellCommandPath, appId.toString(), localResources, shellCommand);
}

// The shell arguments, when given, are joined with single spaces and passed as content.
if (shellArgs.length > 0) {
  addToLocalResources(fs, null, shellArgsPath, appId.toString(), localResources,
      StringUtils.join(shellArgs, " "));
}
// Environment variables for the environment in which the Application Master runs.
Map<String, String> env = new HashMap<String, String>();

// Publish the shell script's location, timestamp and length through the environment;
// the AM uses them to create the local resource for the containers that run the script.
env.put(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION, hdfsShellScriptLocation);
env.put(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP, Long.toString(hdfsShellScriptTimestamp));
env.put(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN, Long.toString(hdfsShellScriptLen));

// Assemble the classpath: the expanded CLASSPATH variable, "./*", every configured
// YARN application classpath entry (trimmed), and finally "./log4j.properties".
StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$$())
  .append(ApplicationConstants.CLASS_PATH_SEPARATOR).append("./*");
for (String c : conf.getStrings(
    YarnConfiguration.YARN_APPLICATION_CLASSPATH,
    YarnConfiguration.DEFAULT_YARN_CROSS_PLATFORM_APPLICATION_CLASSPATH)) {
  classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR);
  classPathEnv.append(c.trim());
}
classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR).append(
  "./log4j.properties");

// Export the assembled classpath; without this put the value built above is
// never used and the AM container would not receive it.
env.put("CLASSPATH", classPathEnv.toString());

// Pieces of the command line that launches the Application Master.
Vector<CharSequence> vargs = new Vector<CharSequence>(30);

LOG.info("Setting up app master command");
// Java executable
vargs.add(Environment.JAVA_HOME.$$() + "/bin/java");
// Maximum heap size of the AM JVM, in megabytes
vargs.add("-Xmx" + amMemory + "m");
// Main class of the AM
vargs.add(appMasterMainClass);
// Arguments handed to the AM
vargs.add("--container_memory " + containerMemory);
vargs.add("--container_vcores " + containerVirtualCores);
vargs.add("--num_containers " + numContainers);
vargs.add("--priority " + shellCmdPriority);

// One --shell_env KEY=VALUE argument per entry of shellEnv
for (Map.Entry<String, String> envVar : shellEnv.entrySet()) {
  vargs.add("--shell_env " + envVar.getKey() + "=" + envVar.getValue());
}
if (debugFlag) {
  vargs.add("--debug");
}

// Redirect stdout and stderr of the AM into the container log directory
vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout");
vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr");

// Flatten the pieces into the final command; every piece is followed by one space
StringBuilder command = new StringBuilder();
for (CharSequence piece : vargs) {
  command.append(piece).append(" ");
}
String amCommandLine = command.toString();

LOG.info("Completed setting up app master command " + amCommandLine);
List<String> commands = new ArrayList<String>();
commands.add(amCommandLine);

// Container launch context of the AM: local resources, environment and commands.
ContainerLaunchContext amContainer = ContainerLaunchContext.newInstance(
  localResources, env, commands, null, null, null);

// Resource requirement of the AM, built from amMemory and amVCores.
Resource capability = Resource.newInstance(amMemory, amVCores);
appContext.setResource(capability);

// Service data is a binary blob that is passed on to the application.
// amContainer.setServiceData(serviceData);

// Set up security tokens (only when security is enabled).
if (UserGroupInformation.isSecurityEnabled()) {
  // Note: Credentials class is marked as LimitedPrivate for HDFS and MapReduce
  Credentials credentials = new Credentials();
  String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
  // Logical OR: a missing or empty RM principal cannot be used as the renewer.
  if (tokenRenewer == null || tokenRenewer.length() == 0) {
    throw new IOException(
      "Can't get Master Kerberos principal for the RM to use as renewer");
  }

  // Only the tokens of the default file system are obtained here.
  final Token<?>[] tokens =
      fs.addDelegationTokens(tokenRenewer, credentials);
  if (tokens != null) {
    for (Token<?> token : tokens) {
      LOG.info("Got dt for " + fs.getUri() + "; " + token);
    }
  }
  // Serialize the credentials and attach them to the AM container.
  DataOutputBuffer dob = new DataOutputBuffer();
  credentials.writeTokenStorageToStream(dob);
  ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
  amContainer.setTokens(fsTokens);
}
// Attach the AM container spec to the submission context.
appContext.setAMContainerSpec(amContainer);

// Priority of the application.
Priority pri = Priority.newInstance(amPriority);
appContext.setPriority(pri);

// YARN queue the application is submitted to.
appContext.setQueue(amQueue);

// Submit the application.
yarnClient.submitApplication(appContext);

在提交app之后，RM接收Job，并为AM分配Container，然后设置环境并在Container中启动AM。客户端可以通过app id获得应用运行信息。

// Look up the report of the submitted application by its application id.
ApplicationReport report = yarnClient.getApplicationReport(appId);

当然，如果某个应用太消耗资源或运行时间过长，希望杀死它。可以通过YarnClient的killApplication方法杀死应用。实际上是通知RM给AM发送kill命令。

// Kill the application identified by appId through the YarnClient.
yarnClient.killApplication(appId);

2.Yarn AM

AM是Job实际拥有者，通过YarnClient设置各种参数，由RM启动它。
由于AM也运行在Container中，不能对可使用的端口做任何假设。例如不能假定预定义端口8888一定可用，该端口可能已被其他应用占用。
当AM启动后，AM可以查询到自身所在的NodeManager信息和其他环境参数。
AM和RM通信需要携带ApplicationAttemptId参数。

// Derive the ApplicationAttemptId from the AM container id found in the process environment.
Map<String, String> envs = System.getenv();
String amContainerIdText = envs.get(ApplicationConstants.AM_CONTAINER_ID_ENV);
if (amContainerIdText == null) {
  // The framework is expected to always set the container id in the environment.
  throw new IllegalArgumentException("ContainerId not set in the environment");
}
// Parse the container id, then take the attempt id it belongs to.
ContainerId containerId = ConverterUtils.toContainerId(amContainerIdText);
ApplicationAttemptId appAttemptID = containerId.getApplicationAttemptId();


// Once the AM is initialized it starts two clients, one talking to the RM and one
// talking to the NMs. Each client is given a callback handler that processes its events.

// RM side. NOTE(review): 1000 is presumably the heartbeat interval in milliseconds —
// confirm against the AMRMClientAsync documentation.
AMRMClientAsync.CallbackHandler rmCallbacks = new RMCallbackHandler();
amRMClient = AMRMClientAsync.createAMRMClientAsync(1000, rmCallbacks);
amRMClient.init(conf);
amRMClient.start();

// NM side.
containerListener = createNMCallbackHandler();
nmClientAsync = new NMClientAsyncImpl(containerListener);
nmClientAsync.init(conf);
nmClientAsync.start();

// The AM has to tell the RM through heartbeats that it is still alive,
// so it registers itself with the RM first.
appMasterHostname = NetUtils.getHostname();
RegisterApplicationMasterResponse response =
    amRMClient.registerApplicationMaster(
        appMasterHostname, appMasterRpcPort, appMasterTrackingUrl);

// The response from the RM carries information about the current state of the cluster:
// the maximum resource capability (memory, virtual cores) and the containers that are
// still running from previous AM attempts.
int maxMem = response.getMaximumResourceCapability().getMemory();
int maxVCores = response.getMaximumResourceCapability().getVirtualCores();

// Declared exactly once; a second declaration of the same local would not compile.
List<Container> previousAMRunningContainers =
    response.getContainersFromPreviousAttempts();
LOG.info("Received " + previousAMRunningContainers.size()
    + " previous AM's running containers on AM registration.");

// The AM may request several containers to run its job. Containers that survived
// from previous attempts are subtracted from the total still to be requested.
int numTotalContainersToRequest =
    numTotalContainers - previousAMRunningContainers.size();

for (int i = 0; i < numTotalContainersToRequest; ++i) {
  // setupContainerAskForRM is where memory, vcores and priority of the request are set.
  ContainerRequest containerAsk = setupContainerAskForRM();
  amRMClient.addContainerRequest(containerAsk);
}

AM向RM申请资源后，可通过AMRMClientAsync对象来异步启动Container。在AM处理onContainersAllocated回调函数时，需要初始化设置ContainerLaunchContext 。设置过程繁琐，为了不阻塞AM，我们另启动一个线程处理设置参数和启动Container。

/**
 * Callback invoked with the containers that have been allocated.
 * Counts them and starts one launcher thread per container.
 *
 * @param allocatedContainers the newly allocated containers
 */
@Override
public void onContainersAllocated(List<Container> allocatedContainers) {
  int allocatedCount = allocatedContainers.size();
  LOG.info("Got response from RM for container ask, allocatedCnt=" + allocatedCount);
  numAllocatedContainers.addAndGet(allocatedCount);

  for (Container granted : allocatedContainers) {
    // Each container is launched on its own thread so that this callback is not
    // blocked; the containers may not all be allocated in one go.
    Thread launcher =
        new Thread(new LaunchContainerRunnable(granted, containerListener));
    // Track the thread before starting it.
    launchThreads.add(launcher);
    launcher.start();
  }
}


/** Body of the launch thread: the AM talks to the NodeManager through NMClientAsync to start the container. **/
// Pieces of the command that is executed on the container.
Vector<CharSequence> launchArgs = new Vector<CharSequence>(5);

// The executable command comes first.
launchArgs.add(shellCommand);
// Then the script path, when a script was supplied; the name depends on the platform.
if (!scriptPath.isEmpty()) {
  launchArgs.add(Shell.WINDOWS ? ExecBatScripStringtPath : ExecShellStringPath);
}
// Then the shell arguments.
launchArgs.add(shellArgs);
// Finally redirect stdout and stderr into the container log directory.
launchArgs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout");
launchArgs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr");

// Flatten the pieces into the final command; every piece is followed by one space.
StringBuilder commandLine = new StringBuilder();
for (CharSequence piece : launchArgs) {
  commandLine.append(piece).append(" ");
}

List<String> commands = new ArrayList<String>();
commands.add(commandLine.toString());

// Launch context of the container: local resources, environment, commands and tokens.
ContainerLaunchContext ctx = ContainerLaunchContext.newInstance(
    localResources, shellEnv, commands, null, allTokens.duplicate(), null);
// Register the container with the listener before asking the NM to start it.
containerListener.addContainer(container.getId(), container);
nmClientAsync.startContainerAsync(container, ctx);


/**
 * Reports the progress that is delivered to the RM on the next AM heartbeat.
 *
 * @return completed containers divided by the total number of containers
 */
@Override
public float getProgress() {
  // The cast applies to the numerator, so this is a floating-point division.
  return (float) numCompletedContainers.get() / numTotalContainers;
}


/** Once all tasks have finished, the AM unregisters itself from YARN and stops every client object it started. **/
// Stop the NodeManager client too: it was started together with amRMClient and
// would otherwise be left running after the AM has unregistered.
nmClientAsync.stop();

try {
  amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
} catch (YarnException ex) {
  // Failure to unregister is logged and shutdown continues.
  LOG.error("Failed to unregister application", ex);
} catch (IOException e) {
  LOG.error("Failed to unregister application", e);
}

// Stop the ResourceManager client last, after the unregister call has been made.
amRMClient.stop();

二.Yarn Shell命令

1.常用命令

1.1 yarn jar
描述：运行一个jar文件。用户可以将Yarn代码打包到一个jar文件中，然后使用这个命令执行。
用法：yarn jar <jar> [mainClass] args...
1.2 yarn application
描述：打印应用程序报告，或kill掉应用程序。
用法：yarn application <options>
命令选项   描述
-applicationId ApplicationId   指定一个应用程序ID（application id）
-appOwner AppOwner   指定一个程序拥有者（application owner）
-containerId ContainerId   指定一个容器ID（container id）
-nodeAddress NodeAddress   指定一个节点地址（ node address）
1.3 yarn node
描述：打印节点报告
用法：yarn node <options>
命令选项   描述
-status NodeId   指定一个节点ID（node id）
-list   列出所有节点列表信息
1.4 yarn logs
描述：转储容器日志
用法：yarn logs <options>
命令选项   描述
-applicationId ApplicationId   指定一个应用程序id（ application id）
-appOwner AppOwner   指定一个程序拥有者（ application owner）
-containerId ContainerId   指定一个容器id（container id）
-nodeAddress NodeAddress   指定一个节点地址（ node address）
1.5 yarn classpath
描述：打印获取Hadoop的jar及其所需库时需要用到的类路径
用法：yarn classpath
1.6 yarn version
描述：打印版本信息
用法：yarn version

2.管理命令

2.1 启动资源管理

用法：yarn resourcemanager

2.2 启动节点管理

用法：yarn nodemanager

2.3 启动代理服务

用法：yarn proxyserver