MapReduce Framework Principles (4): Join

Join

Like a join in SQL, two or more tables are combined according to some relationship (here, a shared pid column) and the result is written out in a single formatted output.

1 Reduce Join

How Reduce Join works

  1. Map side: for the key/value pairs coming from different tables or files, add a tag that identifies the data source, use the join field (pid in this example) as the sort and grouping key, put the remaining fields together with the tag into the value, and emit the result.
  2. Reduce side: by the time records reach the reducer, grouping on the join field is already done and each group triggers one reduce() call; all that remains is to separate the records by the tag added on the map side and merge them into the desired output format (see the illustration right after this list).
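
As an illustration with the sample data below, every record whose pid is 01 arrives in a single reduce call, each record carrying its source tag:

(id=1001, pid=01, amount=1, name=,   tag=order.txt)
(id=1004, pid=01, amount=4, name=,   tag=order.txt)
(id=,     pid=01, amount=0, name=mi, tag=pd.txt)

and the reducer joins them into:

1001 mi 1
1004 mi 4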

Requirements:
(1) Order data table (order.txt)

id pid amount
1001 01 1
1002 02 2
1003 03 3
1004 01 4
1005 02 5
1006 03 6

(2) Product information table (pd.txt)

pid pname
01 mi
02 iphone
03 blackberry

(3) Expected final output (ascending order by pid is guaranteed; within each pid group the ids may come out in reverse or mixed order, adjust it yourself if you need a specific order)

id pname amount
1001 mi 1
1004 mi 4
1002 iphone 2
1005 iphone 5
1003 blackberry 3
1006 blackberry 6

Custom Bean

package join;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class TableBean implements WritableComparable<TableBean> {
    private String id;
    private String pid;
    private int amount;
    private String name;
    private String tblName;
    @Override
    public String toString() {
        return this.id + "\t" + this.name + "\t" + this.amount;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getPid() {
        return pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getTblName() {
        return tblName;
    }

    public void setTblName(String tblName) {
        this.tblName = tblName;
    }

    // Note: writeUTF() throws a NullPointerException on a null field, so every
    // String field must be set (use "" when a field is not needed) before the bean is written.
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.id);
        dataOutput.writeUTF(this.pid);
        dataOutput.writeInt(this.amount);
        dataOutput.writeUTF(this.name);
        dataOutput.writeUTF(this.tblName);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.id = dataInput.readUTF();
        this.pid = dataInput.readUTF();
        this.amount = dataInput.readInt();
        this.name = dataInput.readUTF();
        this.tblName = dataInput.readUTF();
    }

    @Override
    public int compareTo(TableBean o) {
        // Sort (and, with the default grouping, group) records by pid only, so that
        // the order records and the matching pd record arrive in the same reduce call.
        return this.pid.compareTo(o.pid);
    }
}
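
Because the serialization logic in write()/readFields() is easy to get wrong (the field order must match exactly), it can be checked in isolation. The following is a minimal round-trip sketch of my own (the class name TableBeanRoundTrip is made up, not part of the original post): it writes a bean to a byte buffer and reads it back.

package join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class TableBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        TableBean in = new TableBean();
        in.setId("1001");
        in.setPid("01");
        in.setAmount(1);
        in.setName("");            // String fields must be non-null for writeUTF
        in.setTblName("order.txt");

        // Serialize the bean into an in-memory buffer.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buffer));

        // Deserialize it back and print it via toString(): id, name, amount.
        TableBean out = new TableBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(out);   // expected: 1001<TAB><TAB>1
    }
}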

Mapper class

package join;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TableMapper extends Mapper<LongWritable, Text, TableBean, NullWritable> {
    private String fileName;
    private TableBean bean = new TableBean();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Record which input file this split comes from; it is used to tag every record in map().
        FileSplit split = (FileSplit) context.getInputSplit();
        fileName = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");

        bean.setTblName(fileName);
        if("order.txt".equals(fileName)) {
            // order表
            bean.setId(fields[0]);
            bean.setPid(fields[1]);
            bean.setAmount(Integer.parseInt(fields[2]));
            bean.setName("");
        } else {
            // pd表
            bean.setId("");
            bean.setPid(fields[0]);
            bean.setAmount(0);
            bean.setName(fields[1]);
        }
        context.write(bean, NullWritable.get());
    }
}

Reducer class

package join;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;

public class TableReducer extends Reducer<TableBean, NullWritable, TableBean, NullWritable> {
    private List<TableBean> list = new ArrayList<TableBean>();
    private TableBean pBean = new TableBean();
    @Override
    protected void reduce(TableBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        for (NullWritable value : values) {
            if("order.txt".equals(key.getTblName())) {
                // Hadoop reuses the key object while iterating over the values,
                // so each order record must be deep-copied before it is stored.
                TableBean bean = new TableBean();
                try {
                    BeanUtils.copyProperties(bean, key);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
                list.add(bean);
            } else {
                try {
                    BeanUtils.copyProperties(pBean, key);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }
        // pBean now holds the pd record for this pid; attach its product name to every order record.
        for (TableBean bean : list) {
            bean.setName(pBean.getName());
            context.write(bean, NullWritable.get());
        }
        list.clear();
    }
}

Driver class

package join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TableDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path oPath = new Path("i:\\tbl_output");
        // Delete the output directory if it already exists; Hadoop refuses to write into an existing one.
        if (fs.exists(oPath)) {
            fs.delete(oPath, true);
        }
        Job job = Job.getInstance(conf);
        job.setJarByClass(TableDriver.class);
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);

        job.setMapOutputKeyClass(TableBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path("i:\\tbl_input"));
        FileOutputFormat.setOutputPath(job, oPath);

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

Drawbacks:
The join is performed entirely on the Reduce side, so the reducers carry most of the load while the mappers do very little, and data skew is very likely.
Optimization:
Use a Map Join.

2 Map Join

  1. Usage scenario
    Map Join fits the case of one small table (roughly 10-15 MB) joined with one large table.
  2. Advantages
    Small tables can be cached and the join performed on the map side, which reduces the amount of data sent to the reducers, eases the pressure on the reduce side, and minimizes data skew.
  3. Concrete approach
    (1) In the Driver class, register the table to be added to the cache.
    (2) In the Mapper's setup method, read the file from the cache and load it into a data structure (e.g. a Map).
  4. Code demonstration (the Bean from above is reused)

Mapper class

package join.map;

import join.reduce.TableBean;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.*;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class TblMapper extends Mapper<LongWritable, Text, TableBean, NullWritable> {
    private Map<String, String> map = new HashMap<String, String>();
    private TableBean bean = new TableBean();
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Fetch the cached pd file registered in the Driver and build a pid -> pname lookup map.
        URI[] files = context.getCacheFiles();
        URI uri = files[0];
        String p = uri.getPath();
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(p)));
        String line = "";
        while(StringUtils.isNotEmpty(line = reader.readLine())){
            String[] fields = line.split("\t");
            map.put(fields[0], fields[1]);
        }
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        bean.setId(fields[0]);
        bean.setPid(fields[1]);
        bean.setAmount(Integer.parseInt(fields[2]));
        bean.setName(map.get(fields[1]));
        // Note: every field touched by the Bean's readFields()/write() methods must be non-null,
        // otherwise writeUTF throws a NullPointerException.
        // So even though tblName is not needed here, it still has to be set to a non-null value.
        bean.setTblName("");
        context.write(bean, NullWritable.get());
    }
}

Reducer class

package join.map;

import join.reduce.TableBean;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class TblReducer extends Reducer<TableBean, NullWritable, TableBean, NullWritable> {
    private TableBean bean = new TableBean();
    @Override
    protected void reduce(TableBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // The join is already finished on the map side; this reducer just copies each joined record to the output.
        for (NullWritable value : values) {
            bean.setId(key.getId());
            bean.setPid(key.getPid());
            bean.setName(key.getName());
            bean.setAmount(key.getAmount());
            context.write(bean ,NullWritable.get());
        }
    }
}

Driver class

package join.map;

import join.reduce.TableBean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class TblDriver {
    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path oPath = new Path("i:\\tbl_output");
        if (fs.exists(oPath)) {
            fs.delete(oPath, true);
        }
        Job job = Job.getInstance(conf);
        // Put the smaller pd table into the distributed cache.
        job.addCacheFile(new URI("file:///i:/tbl_input/pd.txt"));
        job.setJarByClass(TblDriver.class);
        job.setMapperClass(TblMapper.class);
        job.setReducerClass(TblReducer.class);

        job.setMapOutputKeyClass(TableBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        // There are only two tables and pd is already loaded into the cache,
        // so the input format only needs to process the order table.
        FileInputFormat.addInputPath(job, new Path("i:\\tbl_input\\order.txt"));
        FileOutputFormat.setOutputPath(job, oPath);

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
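
Since the join is already complete when records leave the mapper, the reducer above is only a pass-through. As a minimal sketch of my own (not part of the original post; the class name TblMapOnlyDriver and the output path are made up), the reduce phase could be skipped entirely by setting the number of reduce tasks to 0:

package join.map;

import join.reduce.TableBean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

public class TblMapOnlyDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Same cache setup as in TblDriver above.
        job.addCacheFile(new URI("file:///i:/tbl_input/pd.txt"));
        job.setJarByClass(TblMapOnlyDriver.class);
        job.setMapperClass(TblMapper.class);

        // No reducer: each mapper's output is written directly to the output files.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path("i:\\tbl_input\\order.txt"));
        FileOutputFormat.setOutputPath(job, new Path("i:\\tbl_output_maponly"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}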