Hadoop RecordReader small-file handling: a custom InputFormat model to merge text/image/video files

Use a RecordReader to merge all files under the current folder into a single file.

Merge multiple small files into one SequenceFile. Each small file would otherwise occupy its own block metadata on the NameNode and spawn its own map task, so packing them into a single SequenceFile keeps HDFS and MapReduce efficient.

Input data

1.FileCombineRecordReader




 
  
package com.cevent.hadoop.mapreduce.inputformat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Concrete RecordReader implementation: reads one whole file per split.
 * @author cevent
 * @date 2020-04-14
 */
public class WholeFileCombineRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private BytesWritable bytesWritableValues = new BytesWritable();
    private FileSplit fileSplit;
    private Configuration configuration;
    // Set to true once nextKeyValue() has consumed the split; false means not read yet
    private boolean isProcess = false;

    // 1. Initialization: obtains the split and the configuration, must be called first
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // 1.1 Get the split information (cast to FileSplit)
        this.fileSplit = (FileSplit) split;
        // 1.2 Get the configuration
        configuration = context.getConfiguration();
    }

    // 2. Read the data: the whole file becomes a single key/value pair
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // Only read once per split
        if (!isProcess) {
            FSDataInputStream dataInputStream = null;
            try {
                // 2.1 Treat the file as a whole: get the file system
                FileSystem fileSystem = FileSystem.get(configuration);
                // 2.2 Get the split's path
                Path path = fileSplit.getPath();
                // 2.3 Open an input stream on the split's file
                dataInputStream = fileSystem.open(path);
                // 2.4 Read the split fully (stream, byte[] buffer, start offset, length);
                //     the buffer is sized to the split length
                byte[] bufferArr = new byte[(int) fileSplit.getLength()];
                IOUtils.readFully(dataInputStream, bufferArr, 0, bufferArr.length);
                // 2.5 Copy the buffer into the output value
                bytesWritableValues.set(bufferArr, 0, bufferArr.length);
            } finally {
                // Close the stream
                IOUtils.closeStream(dataInputStream);
            }

            // The split has been consumed; report one record
            isProcess = true;
            return true;
        }

        return false;
    }

    // 3. Current key: NullWritable here (the file path is attached in the Mapper)
    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    // 4. Current value: the whole file content held in a BytesWritable
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return bytesWritableValues;
    }

    // 5. Progress of this reader
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return 0;
    }

    // 6. Release resources (the stream is already closed in nextKeyValue)
    @Override
    public void close() throws IOException {
    }

}
   
  
 


2.FileCombineInputFormat




 
  
package com.cevent.hadoop.mapreduce.inputformat;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

/**
 * FileInputFormat subclass: only exists to hand out the custom RecordReader.
 * key   = path + file name (not produced here, so NullWritable for now)
 * value = file content as a byte stream (BytesWritable)
 * @author cevent
 * @date 2020-04-14
 */
public class WholeFileCombineInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    // Small files must not be split: each file is read as a whole
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    // Return the custom RecordReader (the framework also calls initialize() on it,
    // so initializing here is only an early setup)
    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        // 1. Create and initialize the reader
        WholeFileCombineRecordReader combineRecordReader = new WholeFileCombineRecordReader();
        combineRecordReader.initialize(split, context);

        return combineRecordReader;
    }

}
   
  
 


3.FileCombineMapper




 
  
package com.cevent.hadoop.mapreduce.inputformat;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * KEYIN    = NullWritable  (the RecordReader's key)
 * VALUEIN  = BytesWritable (the RecordReader's value)
 * KEYOUT   = Text          (path of the merged file)
 * VALUEOUT = BytesWritable (content of the merged file as bytes)
 * @author cevent
 * @date 2020-04-14
 */
public class WholeFileCombineMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

    // setup() stores the file path, which becomes the output key
    Text keyText = new Text();

    // Get the path of the current split
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // 1. Get the FileSplit
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        // 2. Get its path
        Path path = fileSplit.getPath();
        // 3. Save the path as the key
        keyText.set(path.toString());
    }

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // 1. Emit (file path, file bytes)
        context.write(keyText, value);
    }

}
   
  
 


4.FileCombineDriver




 
  
package com.cevent.hadoop.mapreduce.inputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

/**
 * Driver: wires the custom InputFormat, the Mapper and the SequenceFile output together.
 * @author cevent
 * @date 2020-04-14
 */
public class WholeFileCombineDriver {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(WholeFileCombineDriver.class);

        // Use the custom InputFormat
        job.setInputFormatClass(WholeFileCombineInputFormat.class);
        // The output format must be SequenceFile so all small files land in one container file
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        job.setMapperClass(WholeFileCombineMapper.class);
        // No Reducer class is set, so the default Reducer passes the
        // (Text, BytesWritable) pairs straight through into a single SequenceFile

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
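Once the job has run, downstream jobs no longer need to open many small files; they can read the single merged SequenceFile with the built-in SequenceFileInputFormat. A minimal sketch of such a consumer (this Mapper and its name are illustrative additions, not part of the original post):

package com.cevent.hadoop.mapreduce.inputformat;

import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical downstream Mapper: consumes the (file path, file bytes) records
// of the merged SequenceFile instead of opening each small file separately.
public class MergedFileSizeMapper extends Mapper<Text, BytesWritable, Text, IntWritable> {

    @Override
    protected void map(Text path, BytesWritable bytes, Context context)
            throws IOException, InterruptedException {
        // Emit the original file path together with its size in bytes
        context.write(path, new IntWritable(bytes.getLength()));
    }
}

In the corresponding driver the only change would be job.setInputFormatClass(SequenceFileInputFormat.class), with the merged output directory as the input path.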


 


Running the merge

The input and output paths are supplied as program arguments, and the merged result is written to the output path.
The merge succeeds: the small files are packed into one SequenceFile.
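To verify the result, the output SequenceFile can be read back and each stored file path printed with its byte length. A minimal sketch, assuming the merged file is the part-r-00000 file inside the output directory passed as the argument (the file name and argument layout are assumptions, not from the original post):

package com.cevent.hadoop.mapreduce.inputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Sketch: dump the keys (original file paths) and value sizes of the merged SequenceFile
public class CombinedSequenceFileDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumed location: the single reduce output file inside the job's output directory
        Path merged = new Path(args[0] + "/part-r-00000");

        try (SequenceFile.Reader reader =
                     new SequenceFile.Reader(conf, SequenceFile.Reader.file(merged))) {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            // next() returns false once the end of the SequenceFile is reached
            while (reader.next(key, value)) {
                System.out.println(key + " -> " + value.getLength() + " bytes");
            }
        }
    }
}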


Reposted from blog.csdn.net/weixin_37056888/article/details/105522823