Hadoop分布式文件缓存(DistributedCache)

如果在Hadoop程序中,我们需要将一大堆中间结果集提供给其他的MR任务使用,那么这些数据怎么传递呢?
如果只是少量的参数,我们可以通过Configuration来传递,但是如果是大量的结果集怎么处理呢?
这个时候,就需要引入MR任务的分布式缓存文件系统了.
如果要使用缓存文件,首先需要在Driver层,将中间结果的文件路径添加到MR的job中

package net.icsoc.cti.report.call;

import net.icsoc.cti.report.utils.EMapReduceOSSUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.net.URI;
import java.util.List;

/*******************************************************************************
 * 版权信息：北京中通天鸿武汉分公司
 * @author xuchang
 * Copyright: Copyright (c) 2007北京中通天鸿武汉分公司,Inc.All Rights Reserved.
 * Description:
 ******************************************************************************/
public class CallDriver {
    /**
     * Configures the "cti-call-message-handler" MapReduce job, runs it and blocks until it finishes.
     *
     * @param fileSystem  file system used to probe the input paths and to remove a stale output directory
     * @param conf        Hadoop configuration the job is created from
     * @param data_input  location of the source data; resolved via EMapReduceOSSUtil.buildOSSCompleteUri
     * @param temp_input  location of the temporary data produced by the previous day's run
     *                    (per the original author's note); resolved the same way
     * @param data_output location the job writes to; an existing directory there is deleted recursively first
     * @param cacheFiles  files to register in the job's distributed cache; may be null
     * @return the resolved output path when the job completed successfully; null when neither input
     *         path exists, when the job reports failure, or when an exception occurred
     */
    public static Path submitJob(FileSystem fileSystem, Configuration conf, String data_input, String temp_input, String data_output, List<Path> cacheFiles) {
        try {
            // Create the job instance under its display name
            Job job = Job.getInstance(conf, "cti-call-message-handler");
            // Let Hadoop locate the job jar through this class
            job.setJarByClass(CallDriver.class);
            // Mapper and reducer implementations; no combiner is configured
            job.setMapperClass(CallMapper.class);
            job.setReducerClass(CallReducer.class);
            // Output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            // Reducer count is fixed at 6; the original note says to size it according to the mapred-site settings
            job.setNumReduceTasks(6);
            // Register the distributed cache files
            if (cacheFiles != null) {
                for (Path path : cacheFiles) {
                    System.out.println("加入到mapReduce 配置文件中重启消息路径为: " + path.toString());
                    job.addCacheFile(path.toUri());
                }
            }
            // Allow input directories to be traversed recursively
            FileInputFormat.setInputDirRecursive(job, true);
            // Source data directory
            Path inputPath1 = new Path(EMapReduceOSSUtil.buildOSSCompleteUri(data_input, conf));
            // Temporary data directory generated by the previous day's run
            Path inputPath2 = new Path(EMapReduceOSSUtil.buildOSSCompleteUri(temp_input, conf));
            // Probe each input once instead of repeating the file system call
            boolean sourceExists = fileSystem.exists(inputPath1);
            boolean tempExists = fileSystem.exists(inputPath2);
            if (!sourceExists && !tempExists) {
                System.out.println("没有任何输入数据......");
                return null;
            }
            if (tempExists) {
                FileInputFormat.addInputPath(job, inputPath2);
            }
            if (sourceExists) {
                FileInputFormat.addInputPath(job, inputPath1);
            }
            // Register the two named outputs ("json" and "temp") used with MultipleOutputs
            MultipleOutputs.addNamedOutput(job, "json", TextOutputFormat.class, Text.class, Text.class);
            MultipleOutputs.addNamedOutput(job, "temp", TextOutputFormat.class, Text.class, Text.class);

            // Clear any previous output so the job can write to a fresh directory
            Path outputPath = new Path(EMapReduceOSSUtil.buildOSSCompleteUri(data_output, conf));
            if (fileSystem.exists(outputPath)) {
                fileSystem.delete(outputPath, true);
            }
            FileOutputFormat.setOutputPath(job, outputPath);

            // Original note: suppress output under the default file name.
            // NOTE(review): presumably all records go through the named outputs — confirm against CallReducer.
            LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

            // Block until the job finishes; a false result means the job failed,
            // so the output path must not be reported as usable.
            if (!job.waitForCompletion(true)) {
                System.out.println("submit call mission failed!");
                return null;
            }
            return outputPath;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it
            Thread.currentThread().interrupt();
            e.printStackTrace();
            System.out.println("submit call mission exception!" + e);
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("submit call mission exception!" + e);
        }
        return null;
    }
}

上面的Driver中,将文件路径添加到job任务的分布式文件缓存中的代码为:
遍历任务中需要使用的缓存文件,添加到CacheFile中.

 // Register every supplied file in the job's distributed cache; skipped entirely when cacheFiles is null
            if (cacheFiles != null) {
                for (Path path : cacheFiles) {
                    // Log each path as it is registered
                    System.out.println("加入到mapReduce 配置文件中重启消息路径为: " + path.toString());
                    // addCacheFile is given the Path converted to a URI
                    job.addCacheFile(path.toUri());
                }
            }

在map任务或者在reduce任务中获取该文件结果集,扫描文件,读取文件数据,以供程序使用.

import com.alibaba.fastjson.JSON;
import net.icsoc.cti.report.restart.model.MainMessage;
import net.icsoc.cti.report.restart.model.SubMessage;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/*******************************************************************************
 * 版权信息：北京中通天鸿武汉分公司
 * @author xuchang
 * Copyright: Copyright (c) 2007北京中通天鸿武汉分公司,Inc.All Rights Reserved.
 * Description:
 ******************************************************************************/
public class DistributedCache {
    /**
     * Reads every file registered in the job's distributed cache and builds a lookup table from it.
     *
     * <p>Each non-blank line is parsed as a JSON {@code MainMessage}. The map key is the
     * {@code telId} of its {@code MSG} body, described as the rack id in the original note.
     * The value is {@code LongUtils.default0(opTime) * 1000}, described as the restart time.
     * NOTE(review): the factor 1000 looks like a seconds-to-milliseconds conversion — confirm.
     * When the same telId occurs more than once, the last line read wins.
     *
     * @param context reducer context that supplies the configuration and the cache file URIs
     * @return the lookup table, or null when the context reports no cache files
     * @throws IOException if a cache file cannot be opened or read
     */
    public static Map<String, Long> loadCache(Reducer.Context context) throws IOException {
        Configuration conf = context.getConfiguration();
        URI[] localFiles = context.getCacheFiles();
        if (localFiles == null) {
            // Callers receive null (not an empty map) when nothing was cached; kept for compatibility
            return null;
        }
        Map<String, Long> map = new HashMap<>();
        for (URI file : localFiles) {
            System.out.println("重启消息所在文件路径为 :" + file);
            Path file_path = new Path(EMapReduceOSSUtil.buildOSSCompleteUri(file.toString(), conf));
            // try-with-resources closes the stream even when parsing fails part-way through a file.
            // The deprecated DataInputStream.readLine() is replaced by a reader with an explicit charset;
            // assumes the cache files are UTF-8 encoded — TODO confirm.
            try (FSDataInputStream inputStream = FileSystem.get(file, conf).open(file_path);
                 BufferedReader reader = new BufferedReader(
                         new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.trim().isEmpty()) {
                        // Skip blank lines (e.g. a trailing newline) instead of failing on them
                        continue;
                    }
                    MainMessage message = JSON.parseObject(line, MainMessage.class);
                    SubMessage subMessage = message == null ? null : message.getMSG();
                    if (subMessage == null) {
                        // A line without a MSG body carries nothing to store
                        continue;
                    }
                    System.out.println("机架ID : " + subMessage.getTelId() + "  操作时间:" + subMessage.getOpTime());
                    // Layout: key = rack id, value = restart time
                    map.put(subMessage.getTelId(), LongUtils.default0(subMessage.getOpTime()) * 1000);
                }
            }
        }
        return map;
    }
}

从context中读取MR任务中缓存文件的URI,遍历这些URI,读取文件内容,即可在MR任务的各个节点上使用同一份中间结果集.

Hadoop分布式文件缓存(DistributedCache)

猜你喜欢