reduce task启动后的第一阶段是shuffle(向map端fetch数据)。每次fetch数据时都可能因为connect timeout、read timeout、checksum error等原因而失败,因此reduce task为每个map设置了一个计数器,用以记录fetch该map输出时失败的次数。当失败次数达到一定阈值时,会通知MRAppMaster从该map fetch数据时失败的次数太多了,并打印相应的log;
该阈值的计算方式:
org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl.java
// Ratio of fetch failures to running reduce tasks; fixed at 1.0 when no
// reduce task is running, which also avoids a division by zero.
float failureRate = runningReduceTasks == 0 ? 1.0f
    : (float) fetchFailures / runningReduceTasks;
// declare faulty if fetch-failures >= max-allowed-failures
boolean isMapFaulty = (failureRate >= MAX_ALLOWED_FETCH_FAILURES_FRACTION);
// Both conditions must hold: the failure count has reached the notification
// threshold AND the failure ratio has reached the allowed fraction.
if (fetchFailures >= MAX_FETCH_FAILURES_NOTIFICATIONS && isMapFaulty) {
  LOG.info("Too many fetch-failures for output of task attempt: " + mapId + " ... raising fetch failure to map");
  // Send TA_TOO_MANY_FETCH_FAILURE for this map attempt.
  job.eventHandler.handle(new TaskAttemptEvent(mapId, TaskAttemptEventType.TA_TOO_MANY_FETCH_FAILURE));
  // Drop the bookkeeping entry kept for this map attempt.
  job.fetchFailuresMapping.remove(mapId);
}
默认的阈值是3(同时fetch失败的比例还需不低于0.5),对应的常量定义如下:
//The maximum fraction of fetch failures allowed for a map private static final double MAX_ALLOWED_FETCH_FAILURES_FRACTION = 0.5; // Maximum no. of fetch-failure notifications after which map task is failed private static final int MAX_FETCH_FAILURES_NOTIFICATIONS = 3;
最终的诊断信息("Too Many fetch failures.Failing the attempt")是在
org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl.TooManyFetchFailureTransition类中通过addDiagnosticInfo记录下来的:
private static class TooManyFetchFailureTransition implements SingleArcTransition<TaskAttemptImpl, TaskAttemptEvent> { @SuppressWarnings("unchecked") @Override public void transition(TaskAttemptImpl taskAttempt, TaskAttemptEvent event) { //add to diagnostic taskAttempt.addDiagnosticInfo("Too Many fetch failures.Failing the attempt"); //set the finish time taskAttempt.setFinishTime(); if (taskAttempt.getLaunchTime() != 0) { taskAttempt.eventHandler .handle(createJobCounterUpdateEventTAFailed(taskAttempt)); TaskAttemptUnsuccessfulCompletionEvent tauce = createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt, TaskAttemptState.FAILED); taskAttempt.eventHandler.handle(new JobHistoryEvent( taskAttempt.attemptId.getTaskId().getJobId(), tauce)); }else { LOG.debug("Not generating HistoryFinish event since start event not " + "generated for taskAttempt: " + taskAttempt.getID()); } taskAttempt.eventHandler.handle(new TaskTAttemptEvent( taskAttempt.attemptId, TaskEventType.T_ATTEMPT_FAILED)); } }