Hadoop MapReduce: merging data from two tables

Use MapReduce to merge (join) the data of the order table order.txt and the product table product.txt. The two files are joined on product_id, and the join itself is done in the reducer (a reduce-side join).

Basic data

1. order.txt (columns: order_id, product_id, product_amount; tab-separated)

1001	01	1
1002	02	2
1003	03	3
1001	01	1
1002	02	2
1003	03	3

2. product.txt (columns: product_id, product_name; tab-separated)

01	Xiaomi
02	Huawei
03	Gree
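
The goal of the merge: for every order row, look up the product name by product_id and append it. For the sample data above, the joined output is expected to look roughly like this (tab-separated, grouped by the join keys 01, 02, 03):

1001	01	1	Xiaomi
1001	01	1	Xiaomi
1002	02	2	Huawei
1002	02	2	Huawei
1003	03	3	Gree
1003	03	3	Gree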

  1. TableBean



 
  
  package com.cevent.hadoop.mapreduce.table;

  import java.io.DataInput;
  import java.io.DataOutput;
  import java.io.IOException;

  import org.apache.hadoop.io.Writable;

  public class TableBean implements Writable {

      private String order_id;      // order id
      private String product_id;    // product id
      private int product_amount;   // product quantity
      private String product_name;  // product name
      private String order_flag;    // flag marking which table the record comes from

      public String getOrder_id() {
          return order_id;
      }

      public void setOrder_id(String order_id) {
          this.order_id = order_id;
      }

      public String getProduct_id() {
          return product_id;
      }

      public void setProduct_id(String product_id) {
          this.product_id = product_id;
      }

      public int getProduct_amount() {
          return product_amount;
      }

      public void setProduct_amount(int product_amount) {
          this.product_amount = product_amount;
      }

      public String getProduct_name() {
          return product_name;
      }

      public void setProduct_name(String product_name) {
          this.product_name = product_name;
      }

      public String getOrder_flag() {
          return order_flag;
      }

      public void setOrder_flag(String order_flag) {
          this.order_flag = order_flag;
      }

      public TableBean() {
          super();
      }

      public TableBean(String order_id, String product_id, int product_amount,
              String product_name, String order_flag) {
          super();
          this.order_id = order_id;
          this.product_id = product_id;
          this.product_amount = product_amount;
          this.product_name = product_name;
          this.order_flag = order_flag;
      }

      @Override
      public String toString() {
          return order_id + "\t" + product_id + "\t" + product_amount + "\t" + product_name;
      }

      // Serialization: write the fields in a fixed order
      @Override
      public void write(DataOutput output) throws IOException {
          output.writeUTF(order_id);
          output.writeUTF(product_id);
          output.writeInt(product_amount);
          output.writeUTF(product_name);
          output.writeUTF(order_flag);
      }

      // Deserialization: read the fields in exactly the same order they were written
      @Override
      public void readFields(DataInput input) throws IOException {
          this.order_id = input.readUTF();
          this.product_id = input.readUTF();
          this.product_amount = input.readInt();
          this.product_name = input.readUTF();
          this.order_flag = input.readUTF();
      }
  }
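
As a quick local sanity check (a hypothetical snippet, not part of the job, assumed to sit in the same package as TableBean), the Writable contract can be verified by serializing a bean to a byte array and reading it back; write() and readFields() must list the fields in the same order:

  import java.io.ByteArrayInputStream;
  import java.io.ByteArrayOutputStream;
  import java.io.DataInputStream;
  import java.io.DataOutputStream;
  import java.io.IOException;

  public class TableBeanRoundTrip {
      public static void main(String[] args) throws IOException {
          TableBean original = new TableBean("1001", "01", 1, "", "0");

          // Serialize the bean the same way Hadoop would
          ByteArrayOutputStream buffer = new ByteArrayOutputStream();
          original.write(new DataOutputStream(buffer));

          // Deserialize into a fresh bean
          TableBean copy = new TableBean();
          copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

          // Prints: 1001	01	1	 (product_name is empty)
          System.out.println(copy);
      }
  }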
  
  2. TableMapper

  
  package com.cevent.hadoop.mapreduce.table;

  import java.io.IOException;

  import org.apache.hadoop.io.LongWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Mapper;
  import org.apache.hadoop.mapreduce.lib.input.FileSplit;

  /**
   * Mapper<LongWritable, Text, Text, TableBean>
   * input: byte offset (LongWritable) and line (Text); output key: product_id; output value: the bean
   * @author cevent
   * @date 2020-04-12
   */
  public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

      TableBean tableBean = new TableBean();
      Text keyID = new Text();

      @Override
      protected void map(LongWritable key, Text value, Context context)
              throws IOException, InterruptedException {
          // 1. Get the input split; FileSplit is the split type for the text files
          FileSplit inputSplit = (FileSplit) context.getInputSplit();
          // 1.1 Get the name of the input file from its path
          String txtName = inputSplit.getPath().getName();
          // 2. Get the input line
          String splitLine = value.toString();
          // 3. Handle the two files differently
          if (txtName.startsWith("order")) { // order table
              // 3.1 Split the line
              String[] fields = splitLine.split("\t");
              // 3.2 Fill the bean, e.g. 1002 02 2
              tableBean.setOrder_id(fields[0]);
              tableBean.setProduct_id(fields[1]);
              tableBean.setProduct_amount(Integer.parseInt(fields[2]));
              // 3.3 Default values for the product-table fields
              tableBean.setProduct_name("");
              tableBean.setOrder_flag("0"); // lets the reducer tell the tables apart
              // 3.4 Set the key = product_id
              keyID.set(fields[1]);

          } else { // product table
              String[] fields = splitLine.split("\t");
              tableBean.setOrder_id("");
              tableBean.setProduct_id(fields[0]);
              tableBean.setProduct_amount(0);
              tableBean.setProduct_name(fields[1]);
              tableBean.setOrder_flag("1"); // lets the reducer tell the tables apart

              keyID.set(fields[0]);
          }

          // 4. Emit the key and the bean
          context.write(keyID, tableBean);
      }
  }
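
For the sample files, the mapper therefore emits pairs like the following (an illustration; the bean is shown in field order order_id, product_id, product_amount, product_name, order_flag, and each order pair actually appears twice because order.txt contains every row twice):

  01 -> (1001, 01, 1, "",     "0")   from order.txt
  01 -> ("",   01, 0, Xiaomi, "1")   from product.txt
  02 -> (1002, 02, 2, "",     "0")
  02 -> ("",   02, 0, Huawei, "1")
  03 -> (1003, 03, 3, "",     "0")
  03 -> ("",   03, 0, Gree,   "1")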
   
  
 



  3. TableReducer

  
  package com.cevent.hadoop.mapreduce.table;

  import java.io.IOException;
  import java.util.ArrayList;

  import org.apache.commons.beanutils.BeanUtils;
  import org.apache.hadoop.io.NullWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Reducer;

  /**
   * The reducer's input = the mapper's output; the reducer's output key = the bean,
   * and its output value = NullWritable (nothing extra is written).
   * Text, TableBean, TableBean, NullWritable
   * = mapper output key, mapper output value, reducer output key, reducer output value
   * @author cevent
   * @date 2020-04-12
   */
  public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {

      @Override
      protected void reduce(Text key, Iterable<TableBean> values, Context context)
              throws IOException, InterruptedException {
          // 1. Separate the records by source table (order vs. product)
          TableBean productBean = new TableBean();
          // Cache list for the order records of this key
          ArrayList<TableBean> orderBeans = new ArrayList<>();

          // 1.1 Iterate over the values
          for (TableBean bean : values) {
              // order_flag = 0 for order records, 1 for product records
              if ("0".equals(bean.getOrder_flag())) {
                  // Order record, e.g. 1002 02 2 or 1003 03 3
                  TableBean orderBean = new TableBean();

                  try {
                      // 2. Copy the order record out of the reused iterator object
                      BeanUtils.copyProperties(orderBean, bean);
                      // 3. Add the copy to the cache list
                      orderBeans.add(orderBean);

                  } catch (Exception e) {
                      e.printStackTrace();
                  }

              } else {
                  // Product record, e.g. 02 Huawei
                  // Reading product_name straight from the iterator object risks mixing up data,
                  // so copy it with BeanUtils to break the link to the reused object
                  // productBean.setProduct_name(productBean.getProduct_name());
                  try {
                      BeanUtils.copyProperties(productBean, bean);

                  } catch (Exception e) {
                      e.printStackTrace();
                  }

              }
          }

          // Join: attach the product name to every cached order record
          for (TableBean bean : orderBeans) {
              // Update the product_name field
              bean.setProduct_name(productBean.getProduct_name());
              // Write out the joined record
              context.write(bean, NullWritable.get());
          }
      }
  }
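
BeanUtils.copyProperties is used here because the reducer's value iterator reuses a single TableBean instance; without a copy, every element cached in orderBeans would end up pointing at the last record read. If the commons-beanutils dependency is unwelcome, a manual copy via the existing constructor does the same job (a sketch):

      // Manual copy of the current order record (alternative to BeanUtils.copyProperties)
      TableBean orderBean = new TableBean(
              bean.getOrder_id(),
              bean.getProduct_id(),
              bean.getProduct_amount(),
              bean.getProduct_name(),
              bean.getOrder_flag());
      orderBeans.add(orderBean);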
   
  
 



  4. TableDriver

  
  package com.cevent.hadoop.mapreduce.table;

  import java.io.IOException;

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.NullWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

  /**
   * Wires the mapper and reducer together
   * @author cevent
   * @date 2020-04-12
   */
  public class TableDriver {

      public static void main(String[] args) throws Exception {
          // 1. Get the configuration and a Job instance
          Configuration configuration = new Configuration();
          Job job = Job.getInstance(configuration);

          // 2. Set the local path of the jar via a class contained in it
          job.setJarByClass(TableBean.class);

          // 3. Set the mapper and reducer used by the job
          job.setMapperClass(TableMapper.class);
          job.setReducerClass(TableReducer.class);

          // 4. Set the mapper output key/value types (mapper output = reducer input)
          job.setMapOutputKeyClass(Text.class);
          job.setMapOutputValueClass(TableBean.class);

          // 5. Set the final output key/value types (= reducer output)
          job.setOutputKeyClass(TableBean.class);
          job.setOutputValueClass(NullWritable.class);

          // 6. Set the job's input directory and output directory
          FileInputFormat.setInputPaths(job, new Path(args[0]));
          FileOutputFormat.setOutputPath(job, new Path(args[1]));

          // 7. Submit the job configuration and jar (to YARN) and wait for completion
          boolean result = job.waitForCompletion(true);
          System.exit(result ? 0 : 1);
      }
  }
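
Besides running locally from Eclipse (next section), the job could be submitted to a cluster roughly like this (a sketch; the jar name and the HDFS paths are placeholders):

  hadoop jar mapreduce-table-join.jar com.cevent.hadoop.mapreduce.table.TableDriver \
      /input/order_product /output/order_product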
   
  
 



  5. Run the job

Input:  D:\xxx\eclipse_code\hadoopTMP\inputOrderProduct

Output: D:\xxx\eclipse_code\hadoopTMP\outputOrderProduct

(Screenshots: run configuration and run result.)
6. Modify TableBean

Change toString() so that product_name is no longer appended; after the reducer change in the next step, the product_id field will carry the name instead.




 
  
     @Override
     public String toString() {
        return order_id + "\t" + product_id + "\t" + product_amount + "\t";
     }
     
  
 



  7. Modify TableReducer

In the join loop, write the product name into the product_id field so that the output shows the name instead of the id.



 
  
        // Join and output
        for (TableBean bean : orderBeans) {
           // Put the product name into the product_id field
           bean.setProduct_id(productBean.getProduct_name());
           // Write out the joined record
           context.write(bean, NullWritable.get());
        }
  
 



  8. Result: the product_id → product_name conversion

(Screenshot: run result.)
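
With the two modifications above, each output line should now carry the product name in place of the product id, i.e. (an expectation derived from the modified code, tab-separated):

1001	Xiaomi	1
1001	Xiaomi	1
1002	Huawei	2
1002	Huawei	2
1003	Gree	3
1003	Gree	3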

Origin blog.csdn.net/weixin_37056888/article/details/105477271