Hadoop 2.7.5 MapReduce Recovery

MapReduce can recover from the last application attempt: map and reduce tasks that succeeded in a previous attempt of the application are not executed again.
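
Whether a second attempt happens at all, and whether it then tries to recover, is driven by a couple of job settings. A minimal sketch, assuming the standard MRJobConfig constants (the class and method names below are just illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MRJobConfig;

public class RecoverySettings {
  // Sketch: the two knobs that gate AM recovery for a job.
  static void enableAmRecovery(Configuration conf) {
    // yarn.app.mapreduce.am.job.recovery.enable -- true by default, read by processRecovery below
    conf.setBoolean(MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true);
    // mapreduce.am.max-attempts -- default 2; a value > 1 is what allows a second AM attempt at all
    conf.setInt(MRJobConfig.MR_AM_MAX_ATTEMPTS, 2);
  }
}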

MRAppMaster.processRecovery

MRAppMaster.serviceInit calls processRecovery to recover from the previous attempt.

private void processRecovery() throws IOException {
    if (appAttemptID.getAttemptId() == 1) {
      return;  // no need to recover on the first attempt
    }

    boolean recoveryEnabled = getConfig().getBoolean(
        MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE,
        MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE_DEFAULT);

    boolean recoverySupportedByCommitter = isRecoverySupported(); // true when the committer is a FileOutputCommitter (see below)

    // If a shuffle secret was not provided by the job client then this app
    // attempt will generate one.  However that disables recovery if there
    // are reducers as the shuffle secret would be app attempt specific.
    int numReduceTasks = getConfig().getInt(MRJobConfig.NUM_REDUCES, 0);
    boolean shuffleKeyValidForRecovery =
        TokenCache.getShuffleSecretKey(jobCredentials) != null;

    if (recoveryEnabled && recoverySupportedByCommitter
        && (numReduceTasks <= 0 || shuffleKeyValidForRecovery)) {
      LOG.info("Recovery is enabled. "
          + "Will try to recover from previous life on best effort basis.");
      try {
        parsePreviousJobHistory();
      } catch (IOException e) {
        LOG.warn("Unable to parse prior job history, aborting recovery", e);
        // try to get just the AMInfos
        amInfos.addAll(readJustAMInfos());
      }
    } else {
      LOG.info("Will not try to recover. recoveryEnabled: "
            + recoveryEnabled + " recoverySupportedByCommitter: "
            + recoverySupportedByCommitter + " numReduceTasks: "
            + numReduceTasks + " shuffleKeyValidForRecovery: "
            + shuffleKeyValidForRecovery + " ApplicationAttemptID: "
            + appAttemptID.getAttemptId());
      // Get the amInfos anyways whether recovery is enabled or not
      amInfos.addAll(readJustAMInfos());
    }
  }
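
Note the shuffle-key condition: for a job with reducers, recovery stays possible only if a shuffle secret is already in the job credentials at submission time, because a secret generated by the AM itself would be attempt specific. Newer 2.x job clients already attach one during submission; the sketch below only mirrors that step to make the condition concrete (the class and method name and the key algorithm/length are illustrative, the TokenCache calls are the real API):

import javax.crypto.KeyGenerator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.security.TokenCache;

public class ShuffleSecret {
  // Attach a shuffle secret to the job credentials before job.submit(),
  // so that a later AM attempt can still pass shuffleKeyValidForRecovery.
  static void ensureShuffleSecret(Job job) throws Exception {
    if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
      KeyGenerator keyGen = KeyGenerator.getInstance("HmacSHA1"); // algorithm/length illustrative
      keyGen.init(64);
      TokenCache.setShuffleSecretKey(keyGen.generateKey().getEncoded(), job.getCredentials());
    }
  }
}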

MRAppMaster.isRecoverySupported

The result is determined by committer.isRecoverySupported(_jobContext), which always returns true when the output committer is a FileOutputCommitter.

private boolean isRecoverySupported() throws IOException {
    boolean isSupported = false;
    Configuration conf = getConfig();
    if (committer != null) {
      final JobContext _jobContext = getJobContextFromConf(conf);
      isSupported = callWithJobClassLoader(conf,
          new ExceptionAction<Boolean>() {
            public Boolean call(Configuration conf) throws IOException {
              return committer.isRecoverySupported(_jobContext);
            }
      });
    }
    return isSupported;
  }
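
FileOutputCommitter opts in to recovery, but the decision belongs to the committer. A hypothetical committer that does the same, shown only to make the contract visible (the class name and the output-handling comments are assumptions; the overridden methods are the real org.apache.hadoop.mapreduce.OutputCommitter API):

import java.io.IOException;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Hypothetical committer whose committed task output survives an AM restart.
public class RecoverableCommitter extends OutputCommitter {
  @Override public void setupJob(JobContext jobContext) throws IOException { /* create output dirs */ }
  @Override public void setupTask(TaskAttemptContext taskContext) throws IOException { }
  @Override public boolean needsTaskCommit(TaskAttemptContext taskContext) throws IOException { return true; }
  @Override public void commitTask(TaskAttemptContext taskContext) throws IOException { /* promote attempt output */ }
  @Override public void abortTask(TaskAttemptContext taskContext) throws IOException { /* discard attempt output */ }

  // Opting in here is what makes MRAppMaster.isRecoverySupported() return true.
  @Override public boolean isRecoverySupported(JobContext jobContext) { return true; }

  // Called for each task taken over from the previous attempt.
  @Override public void recoverTask(TaskAttemptContext taskContext) throws IOException { /* re-establish committed output */ }
}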

MRAppMaster.parsePreviousJobHistory

parsePreviousJobHistory reads the previous attempt's job history (.jhist) file, keeps only the tasks that succeeded (and, within them, only the task attempts that completed), and also recovers the AMInfos of earlier attempts.

private void parsePreviousJobHistory() throws IOException {
    FSDataInputStream in = getPreviousJobHistoryStream(getConfig(),
        appAttemptID);
    JobHistoryParser parser = new JobHistoryParser(in);
    JobInfo jobInfo = parser.parse();
    Exception parseException = parser.getParseException();
    if (parseException != null) {
      LOG.info("Got an error parsing job-history file" +
          ", ignoring incomplete events.", parseException);
    }
    Map<org.apache.hadoop.mapreduce.TaskID, TaskInfo> taskInfos = jobInfo
        .getAllTasks();
    for (TaskInfo taskInfo : taskInfos.values()) {
      if (TaskState.SUCCEEDED.toString().equals(taskInfo.getTaskStatus())) {
        Iterator<Entry<TaskAttemptID, TaskAttemptInfo>> taskAttemptIterator =
            taskInfo.getAllTaskAttempts().entrySet().iterator();
        while (taskAttemptIterator.hasNext()) {
          Map.Entry<TaskAttemptID, TaskAttemptInfo> currentEntry = taskAttemptIterator.next();
          if (!jobInfo.getAllCompletedTaskAttempts().containsKey(currentEntry.getKey())) {
            taskAttemptIterator.remove();
          }
        }
        completedTasksFromPreviousRun
            .put(TypeConverter.toYarn(taskInfo.getTaskId()), taskInfo);
        LOG.info("Read from history task "
            + TypeConverter.toYarn(taskInfo.getTaskId()));
      }
    }
    LOG.info("Read completed tasks from history "
        + completedTasksFromPreviousRun.size());
    recoveredJobStartTime = jobInfo.getLaunchTime();

    // recover AMInfos
    List<JobHistoryParser.AMInfo> jhAmInfoList = jobInfo.getAMInfos();
    if (jhAmInfoList != null) {
      for (JobHistoryParser.AMInfo jhAmInfo : jhAmInfoList) {
        AMInfo amInfo = MRBuilderUtils.newAMInfo(jhAmInfo.getAppAttemptId(),
            jhAmInfo.getStartTime(), jhAmInfo.getContainerId(),
            jhAmInfo.getNodeManagerHost(), jhAmInfo.getNodeManagerPort(),
            jhAmInfo.getNodeManagerHttpPort());
        amInfos.add(amInfo);
      }
    }
  }

MRAppMaster.getPreviousJobHistoryStream

  private static FSDataInputStream getPreviousJobHistoryStream(
      Configuration conf, ApplicationAttemptId appAttemptId)
      throws IOException {
    Path historyFile = JobHistoryUtils.getPreviousJobHistoryPath(conf,
        appAttemptId);
    LOG.info("Previous history file is at " + historyFile);
    //historyFile: hdfs://localhost:8020/tmp/hadoop-yarn/staging/houzhizhen/.staging/job_1523876398612_0014/job_1523876398612_0014_1.jhist
    return historyFile.getFileSystem(conf).open(historyFile);
  }
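
The same parser can be pointed at a .jhist file outside the AM, which is handy for checking what a second attempt would actually recover. A minimal sketch that takes the .jhist path as its argument (the class name is a placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;

public class JhistDump {
  public static void main(String[] args) throws Exception {
    Path historyFile = new Path(args[0]); // e.g. the *_1.jhist path logged above
    Configuration conf = new Configuration();
    JobHistoryParser parser =
        new JobHistoryParser(historyFile.getFileSystem(conf), historyFile);
    JobInfo jobInfo = parser.parse();
    // Print each task with its final status, as the AM would see it during recovery.
    for (TaskInfo taskInfo : jobInfo.getAllTasks().values()) {
      System.out.println(taskInfo.getTaskId() + " " + taskInfo.getTaskStatus());
    }
  }
}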

Reposted from blog.csdn.net/houzhizhen/article/details/79990004