hadoop-day05

1 Join case

1.1 POJO type

An MR program may need to process several types of input data in one job; the records can be told apart by the name of the file they come from.

Define a POJO that holds the attributes of both record types, plus an identification field (the table name) that marks which source each encapsulated record came from.
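
Based on the sample records quoted in the mapper code below, the two inputs are assumed to look like this: user.txt holds comma-separated user records such as u003,jiajia,16,female,chunge (uid, name, age, gender, friend), while the order file (any file other than user.txt is treated as order data) holds whitespace-separated records such as order033 u005 (oid, uid).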

/**
 * @Classname UserAndOrdersWritable
 * @Date 2020/9/24 0024 9:43
 * @Created by 多易教育-DOIT18
 * @Description: holds both user data and order data
 */
public class UserAndOrdersWritable implements Writable {
    // user fields
    private String uid;
    private String name;
    private int age;
    private String gender;
    private String friend;
    // order fields
    private String oid;
    /**
     * record identifier: "tb_user" for a user record, "tb_orders" for an order record
     */
    private String tbName;

    public String getUid() {
        return uid;
    }
    public void set(String uid, String name, int age, String gender, String friend, String oid, String tbName) {
        this.uid = uid;
        this.name = name;
        this.age = age;
        this.gender = gender;
        this.friend = friend;
        this.oid = oid;
        this.tbName = tbName;
    }
    public void setUid(String uid) {
        this.uid = uid;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public String getGender() {
        return gender;
    }

    public void setGender(String gender) {
        this.gender = gender;
    }

    public String getFriend() {
        return friend;
    }

    public void setFriend(String friend) {
        this.friend = friend;
    }

    public String getOid() {
        return oid;
    }

    public void setOid(String oid) {
        this.oid = oid;
    }

    public String getTbName() {
        return tbName;
    }

    public void setTbName(String tbName) {
        this.tbName = tbName;
    }

    @Override
    public String toString() {
        return  uid+","+name+","+age+","+gender+","+friend ;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(uid);
        dataOutput.writeUTF(name);
        dataOutput.writeInt(age);
        dataOutput.writeUTF(gender);
        dataOutput.writeUTF(friend);
        dataOutput.writeUTF(oid);
        dataOutput.writeUTF(tbName);

    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.uid = dataInput.readUTF();
        this.name = dataInput.readUTF();
        this.age = dataInput.readInt() ;
        this.gender = dataInput.readUTF();
        this.friend = dataInput.readUTF();
        this.oid = dataInput.readUTF(); // always read: user records store "" here (writeUTF cannot handle null)
        this.tbName = dataInput.readUTF();
    }
}
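
As a quick sanity check of the Writable contract, the following sketch (a made-up WritableRoundTrip class, not part of the lesson code) serializes an instance with write() and reads it back with readFields() locally:

package com._51doit.mr.day05.beans;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class WritableRoundTrip {
    public static void main(String[] args) throws Exception {
        // fill every field; the unused order field gets "" because writeUTF cannot serialize null
        UserAndOrdersWritable out = new UserAndOrdersWritable();
        out.set("u003", "jiajia", 16, "female", "chunge", "", "tb_user");

        // serialize the object the same way Hadoop does between map and reduce
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        out.write(new DataOutputStream(bos));

        // read the bytes back into a fresh object
        UserAndOrdersWritable in = new UserAndOrdersWritable();
        in.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(in); // prints: u003,jiajia,16,female,chunge
    }
}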

1.2 Step analysis

1. In the mapper, read the file name of the current input split; parse each line as a user record or an order record accordingly, wrap it in a UserAndOrdersWritable tagged with the table name, and emit it with the uid as the key.
2. The shuffle groups all records that share the same uid: at most one user record plus any number of order records.
3. In the reducer, separate the user record from the order records and concatenate them into the joined output rows; a user with no orders is still written out, with "null" in place of the order id.

1.3 Code implementation

package com._51doit.mr.day05.join;

import com._51doit.mr.day05.beans.UserAndOrdersWritable;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;

/**
 * @Classname Join
 * @Date 2020/9/24 0024 9:40
 * @Created by 多易教育-DOIT18
 * @Description:
 * 1 The input path of an MR program can be a folder.
 *   The folder may contain data files of different types; the file name is used to tell them apart.
 */
public class Join {
    static class JoinMapper extends Mapper<LongWritable, Text , Text , UserAndOrdersWritable> {
        String fileName = null ;
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            FileSplit f = (FileSplit) context.getInputSplit();
            fileName = f.getPath().getName();
        }
        Text k = new Text() ;
        UserAndOrdersWritable v = new UserAndOrdersWritable() ;
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                String line = value.toString();
                // use the file name to tell which kind of record this line is
                if("user.txt".equals(fileName)){ // user record, e.g. u003,jiajia,16,female,chunge
                    String[] split = line.split(",");
                    String uid = split[0] ;
                    String name = split[1] ;
                    int age = Integer.parseInt(split[2]) ;
                    String gender = split[3] ;
                    String friend = split[4] ;
                    String tbName = "tb_user" ;
                    k.set(uid);
                    // every field of the custom Writable must be set ("" instead of null, since writeUTF cannot serialize null)
                    v.set(uid,name,age,gender,friend,"",tbName);

                }else{ // order record, e.g. order033 u005
                    String[] split = line.split("\\s+");
                    String oid = split[0] ;
                    String uid = split[1] ;
                    k.set(uid);
                    v.set(uid,"",-1 , "","",oid,"tb_orders");
                }
                context.write(k,v);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    static class JoinReducer extends Reducer<Text, UserAndOrdersWritable ,Text, NullWritable>{
        Text k = new Text() ;

        /**
         * Records that share the same user id are grouped together:
         *   one uid maps to at most one user record and possibly many order records
         * @param key
         * @param values
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<UserAndOrdersWritable> values, Context context) throws IOException, InterruptedException {
            try {
                // the single user record for this uid
                UserAndOrdersWritable user = new UserAndOrdersWritable() ;
                // all of the order records for this uid
                List<UserAndOrdersWritable> ordersList = new ArrayList<>() ;
                // split the grouped values: the user record goes into user, every order record into the list
                for (UserAndOrdersWritable value : values) {
                    String tbName = value.getTbName();
                    if("tb_user".equals(tbName)){  // user record
                        // copy its properties into user
                        BeanUtils.copyProperties(user , value);
                    }else{ // order record
                        // copy into a new object (Hadoop reuses the value instance) and store it in the list
                        UserAndOrdersWritable orders = new UserAndOrdersWritable() ;
                        BeanUtils.copyProperties(orders , value);
                        // add it to the list
                        ordersList.add(orders);
                    }
                }

                /*
                  inner join variant: emit one row per order, dropping users that have no orders
                for (UserAndOrdersWritable ordersWritable : ordersList) {
                        String oid = ordersWritable.getOid();
                        //oid,uid,name,age,gender,friend
                        String  kk = oid+","+user ;
                        k.set(kk) ;
                        context.write(k,NullWritable.get());
                    }*/
                if(ordersList!=null && ordersList.size()>0){   // this user has at least one order
                    // loop over the orders, prepend each order id to the user fields, and write the row out
                    for (UserAndOrdersWritable ordersWritable : ordersList) {
                        String oid = ordersWritable.getOid();
                        //oid,uid,name,age,gender,friend
                        String  kk = oid+","+user ;
                        k.set(kk) ;
                        context.write(k,NullWritable.get());
                    }

                }else{ // this user has no orders: still emit it, with "null" as the order id (outer join on the user side)
                    String  kk = "null"+","+user ;
                    k.set(kk) ;
                    context.write(k,NullWritable.get());
                }



            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static void main(String[] args) throws Exception {


        // 1 create a Configuration
        Configuration conf = new Configuration();
        // 2 get a Job instance
        Job job = Job.getInstance(conf, "join");
        // 3 set the mapper and reducer classes
        job.setMapperClass(JoinMapper.class);
        job.setReducerClass(JoinReducer.class);
        // 4 set the output key/value types of the map task and the reduce task
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(UserAndOrdersWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 5 set the number of reduce tasks
        // job.setNumReduceTasks(2);
        // 6 set the input path (local test)
        FileInputFormat.setInputPaths(job,new Path("D:\\data\\join\\input"));
        // 7 set the output path
        FileOutputFormat.setOutputPath(job,new Path("D:\\data\\join\\output3"));
        // 8 submit the job and wait for it to finish
        job.waitForCompletion(true) ;

    }
}
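
A note on the reducer above: BeanUtils.copyProperties(dest, src) copies every property from the source bean into the destination through the getters and setters, which is why the POJO exposes a full set of accessors. To avoid the reflection overhead, one alternative (a sketch only, not part of the original code) is a plain copy method added to UserAndOrdersWritable that reuses the existing set(...):

    // hypothetical helper on UserAndOrdersWritable, a reflection-free
    // replacement for BeanUtils.copyProperties(this, other)
    public void copyFrom(UserAndOrdersWritable other) {
        this.set(other.getUid(), other.getName(), other.getAge(),
                 other.getGender(), other.getFriend(),
                 other.getOid(), other.getTbName());
    }

In the reducer, BeanUtils.copyProperties(user, value) would then become user.copyFrom(value).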

1.4 An MR program can run without a reduce stage

When a job only needs to transform each input record and no grouping or aggregation is required, the reduce stage can be skipped by setting the number of reduce tasks to 0; the map output is then written directly to the output path.

package com._51doit.mr.day05.noreducer;

import com._51doit.mr.day05.beans.UserAndOrdersWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @Classname Demo
 * @Date 2020/9/24 0024 11:09
 * @Created by 多易教育-DOIT18
 * @Description: map-only example that converts every input line to upper case
 */
public class Demo {

    static class DemoMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        Text k = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String upstr = value.toString().toUpperCase();
            k.set(upstr);
            context.write(k, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        // 1 create a Configuration
        Configuration conf = new Configuration();
        // 2 get a Job instance
        Job job = Job.getInstance(conf, "demo");
        // 3 set the mapper class (there is no reducer in this job)
        job.setMapperClass(DemoMapper.class);

        // 4 with no reduce stage, these are the output types of the map task
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 5 set the number of reduce tasks to 0 to skip the reduce stage
        job.setNumReduceTasks(0);
        // 6 set the input path (local test)
        FileInputFormat.setInputPaths(job,new Path("D://word.txt"));
        // 7 set the output path
        FileOutputFormat.setOutputPath(job,new Path("D://demo/"));
        // 8 submit the job and wait for it to finish
        job.waitForCompletion(true) ;

    }
}

2 Data skew case

When the data is skewed, some reduce tasks receive a very large share of the records while others receive very little, so the work is distributed unevenly and the overloaded tasks slow the whole job down. The approach used here runs two jobs: the first appends a random suffix to each word so that a hot key is spread over all the reduce tasks and partially counted, and the second strips the suffix and merges the partial counts into the final result.
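
For example, with two reduce tasks, a hot word a is rewritten by the first job into the keys a-0 and a-1, which land on different reduce tasks and produce partial counts (846 and 1770 appear in the comments of the code below; 924 is the implied remainder):

a-0   846
a-1   924

The second job strips the -0/-1 suffix and sums the partial counts, 846 + 924 = 1770, giving the final line a 1770.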

package com._51doit.mr.day05.skew;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Random;

/**
 * @Classname Skew2
 * @Date 2020/9/24 0024 14:39
 * @Created by 多易教育-DOIT18
 * @Description: stage 1 of the data skew handling: append a random suffix to each word so hot keys are spread across the reduce tasks, then count the suffixed keys
 */
public class Skew2 {

    static  class  Skew2Mapper extends Mapper<LongWritable , Text , Text, IntWritable> {
        int reduceTasks = 0 ;
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
             reduceTasks = context.getNumReduceTasks();
        }

        Random random = new Random();
        Text k = new Text() ;
        IntWritable v = new IntWritable(1) ;
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String line = value.toString();
            String[] words = line.split("\\s+");

            for (String word : words) {
                // generate a random number so a hot word like "a" is spread over the keys a-0, a-1, ...
                int i = random.nextInt(reduceTasks); // a value in [0, number of reduce tasks)
                // append the random number to the word
                String kk = word+"-"+i ; // e.g. a-1  a-0  a-0  a-1
                k.set(kk);
                context.write(k,v);
            }
        }
    }

    static  class  Skew2Reducer extends Reducer<Text, IntWritable, Text , IntWritable>{

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0 ;
            for (IntWritable value : values) { // count the occurrences of this suffixed key
                count++ ;
            }
            context.write(key ,new IntWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {

        // 1 create a Configuration
        Configuration conf = new Configuration();
        // 2 get a Job instance
        Job job = Job.getInstance(conf, "wordcount");
        // 3 set the mapper and reducer classes
        job.setMapperClass(Skew2Mapper.class);
        job.setReducerClass(Skew2Reducer.class);
        // 4 set the output key/value types of the map task and the reduce task
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 5 set the number of reduce tasks
        job.setNumReduceTasks(2);
        // 6 set the input path (local test)
        FileInputFormat.setInputPaths(job,new Path("D:\\data\\skew\\input"));
        // 7 set the output path
        FileOutputFormat.setOutputPath(job,new Path("D:\\data\\skew\\output1"));
        // 8 submit the job and wait for it to finish
        job.waitForCompletion(true) ;
    }

}

The second job reads the output of the first one, strips the random suffix from each key, and adds up the partial counts for each word:

package com._51doit.mr.day05.skew;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @Classname Skew3
 * @Date 2020/9/24 0024 14:50
 * @Created by 多易教育-DOIT18
 * @Description: stage 2 of the data skew handling: strip the random suffix and merge the partial counts into one total per word
 */
public class Skew3 {

    static class Skew3Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        Text k = new Text();
        IntWritable v = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();            // a line from stage 1, e.g. "a-0\t846"
            String[] split = line.split("-");          // ["a", "0\t846"]
            String word = split[0];                    // the original word, suffix stripped
            String[] split1 = split[1].split("\\s+");  // ["0", "846"]
            int count = Integer.parseInt(split1[1]);   // the partial count, e.g. 846
            k.set(word);
            v.set(count);
            context.write(k,v);
        }
    }

    static class Skew3Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable v = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            v.set(count);
            context.write(key, v);
        }
    }

    public static void main(String[] args) throws Exception {
        // 1 create a Configuration
        Configuration conf = new Configuration();
        // 2 get a Job instance
        Job job = Job.getInstance(conf, "wordcount");
        // 3 set the mapper and reducer classes
        job.setMapperClass(Skew3Mapper.class);
        job.setReducerClass(Skew3Reducer.class);
        // 4 set the output key/value types of the map task and the reduce task
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 5 set the number of reduce tasks
        job.setNumReduceTasks(1);
        // 6 the input path is the output of the first job
        FileInputFormat.setInputPaths(job,new Path("D:\\data\\skew\\output1"));
        // 7 set the output path of the final result
        FileOutputFormat.setOutputPath(job,new Path("D:\\data\\skew\\res"));
        // 8 submit the job and wait for it to finish
        job.waitForCompletion(true) ;



    }

}
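
The two stages are run here as separate programs; the second must only start after the first has finished writing D:\data\skew\output1. A minimal driver that chains them (a hypothetical SkewDriver class, not part of the original code) can simply call the two mains in order:

package com._51doit.mr.day05.skew;

public class SkewDriver {
    public static void main(String[] args) throws Exception {
        // stage 1: random-suffix partial counts; main() blocks until the job finishes
        Skew2.main(args);
        // stage 2: strip the suffix and merge the partial counts
        // (a production driver would also check the boolean returned by waitForCompletion)
        Skew3.main(args);
    }
}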

Origin blog.csdn.net/qq_37933018/article/details/108769186