Day 18 – MapReduce Custom Data Types
1. Multiple File Output
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/**
* Multiple file output: route each word's count into a different output file
* depending on the first character of the word.
* @author lyd
*
* Input data:
hello qianfeng qianfeng world heloo
Hello Hi Hello World
QF QQ
163.com
15900001111 17900001111
@163.com
@189.com
$1100000
*[a-z]
* Expected output (one file per named output, assuming a single reducer):
az-r-00000
hello 1
heloo 1
qianfeng 2
world 1
AZ-r-00000
Hello 2
Hi 1
QF 1
QQ 1
World 1
09-r-00000
163.com 1
15900001111 1
17900001111 1
others-r-00000
@163.com 1
@189.com 1
$1100000 1
*[a-z] 1
*
*/
public class MultiOutputDemo {
/**
* The map phase: tokenize each line and emit (word, "1").
* @author lyd
*
*/
public static class MyMapper extends Mapper<Object, Text, Text, Text>{
@Override
protected void map(Object key, Text value,Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer st = new StringTokenizer(line);
while (st.hasMoreElements()) {
context.write(new Text(st.nextToken()), new Text(1+""));
}
}
}
/**
* The reduce phase: sum the counts for each word and write the result to a
* named output chosen by the word's first character.
* @author lyd
*
*/
public static class MyReducer extends Reducer<Text, Text, Text, Text>{
// MultipleOutputs instance, created once per reducer in setup()
MultipleOutputs<Text, Text> mos = null;
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
mos = new MultipleOutputs<Text, Text>(context);
}
@Override
protected void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {
int counter = 0;
for (Text t : values) {
counter += Integer.parseInt(t.toString());
}
// route by the first character of the key
String firstchar = key.toString().substring(0, 1);
if(firstchar.matches("^[a-z]")){
mos.write("az", key, new Text(counter+""));
} else if(firstchar.matches("^[A-Z]")){
mos.write("AZ", key, new Text(counter+""));
} else if(firstchar.matches("^[0-9]")){
mos.write("09", key, new Text(counter+""));
} else {
mos.write("others", key, new Text(counter+""));
}
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
// close the MultipleOutputs so the named output files are flushed
mos.close();
}
}
/**
* Driver: configures and submits the job.
* @param args
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
Job job = Job.getInstance(conf, "multipleoutput");
job.setJarByClass(MultiOutputDemo.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// register the named outputs used by the reducer
MultipleOutputs.addNamedOutput(job, "az", TextOutputFormat.class, Text.class, Text.class);
MultipleOutputs.addNamedOutput(job, "AZ", TextOutputFormat.class, Text.class, Text.class);
MultipleOutputs.addNamedOutput(job, "09", TextOutputFormat.class, Text.class, Text.class);
MultipleOutputs.addNamedOutput(job, "others", TextOutputFormat.class, Text.class, Text.class);
job.setReducerClass(MyReducer.class);
/*job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);*/
setArgs(job,args);
// submit the job and wait for completion
int issuccessed = job.waitForCompletion(true) ? 0 : 1;
// exit with the job status
System.exit(issuccessed);
}
/**
* Handle the input/output path arguments.
* @param job
* @param args
*/
public static void setArgs(Job job , String[] args){
try {
if(args.length != 2){
System.out.println("wrong number of arguments!");
System.out.println("Usage: yarn jar *.jar MultiOutputDemo /inputdata /outputdata");
System.exit(-1);
}
// set the input path
FileInputFormat.addInputPath(job, new Path(args[0]));
// delete the output directory if it already exists, so the job can be rerun
FileSystem fs = FileSystem.get(job.getConfiguration());
Path op = new Path(args[1]);
if(fs.exists(op)){
fs.delete(op, true);
}
// set the output path
FileOutputFormat.setOutputPath(job, new Path(args[1]));
} catch (Exception e) {
e.printStackTrace();
}
}
}
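A possible refinement, not part of the original demo: because the reducer writes only through MultipleOutputs, the default output format still creates an empty part-r-xxxxx file. Hadoop's LazyOutputFormat defers creating the base output until something is actually written to it; a minimal sketch of the extra driver call (the helper class name here is only illustrative):

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class LazyOutputConfig {
    // Call this in main() after the named outputs have been registered;
    // it suppresses the empty part-r-* files left by the default output format.
    public static void configure(Job job) {
        LazyOutputFormat.setOutputFormat(job, TextOutputFormat.class);
    }
}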
2. Secondary Sort – In-Memory Sorting
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* Secondary sort done in memory: the reducer buffers all values of a key,
* sorts them with Collections.sort(), and emits them in ascending order.
* @author lyd
*
* Input data (tab separated):
778 89
8 55
768 88
768 68
778 90
798 68
8 99
8 12
768 78
* Expected output (a separator line after each key group):
8 12
8 55
8 99
---------------------
768 68
768 78
768 88
---------------------
778 89
778 90
---------------------
798 68
---------------------
*/
public class SecondarySort implements Tool{
/**
* The map phase: split each line on the tab and emit (first, second) as IntWritables.
* @author lyd
*
*/
public static class MyMapper extends Mapper<Object, Text, IntWritable, IntWritable>{
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
}
@Override
protected void map(Object key, Text value,Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
context.write(new IntWritable(Integer.parseInt(fields[0])), new IntWritable(Integer.parseInt(fields[1])));
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
}
}
/**
* The reduce phase: buffer, sort and emit all values of each key.
* @author lyd
*
*/
public static class MyReducer extends Reducer<IntWritable, IntWritable, Text, Text>{
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
}
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values,Context context)
throws IOException, InterruptedException {
/**
* e.g. key 8 arrives with the values (55, 99, 12)
*/
List<Integer> li = new ArrayList<Integer>();
for (IntWritable t : values) {
li.add(t.get());
}
// sort the buffered values in memory
Collections.sort(li);
// emit every value of the key in ascending order
// (writing only li.get(li.size()-1) would emit just the per-key maximum)
for (Integer i : li) {
context.write(new Text(key.toString()), new Text(i.toString()));
}
// group separator
context.write(new Text("---------------------"), new Text(""));
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
}
}
private Configuration conf;
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return (conf == null) ? new Configuration() : conf;
}
/**
* Driver method, invoked by ToolRunner.
*/
public int run(String[] args) throws Exception {
Configuration conf = getConf();
conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
Job job = Job.getInstance(conf, "SecondarySort");
job.setJarByClass(SecondarySort.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
setArgs(job,args);
// submit the job and wait for completion
int issuccessed = job.waitForCompletion(true) ? 0 : 1;
return issuccessed;
}
/**
* Handle the input/output path arguments.
* @param job
* @param args
*/
public static void setArgs(Job job , String[] args){
try {
if(args.length != 2){
System.out.println("wrong number of arguments!");
System.out.println("Usage: yarn jar *.jar SecondarySort /inputdata /outputdata");
System.exit(-1);
}
// set the input path
FileInputFormat.addInputPath(job, new Path(args[0]));
// delete the output directory if it already exists, so the job can be rerun
FileSystem fs = FileSystem.get(job.getConfiguration());
Path op = new Path(args[1]);
if(fs.exists(op)){
fs.delete(op, true);
}
// set the output path
FileOutputFormat.setOutputPath(job, new Path(args[1]));
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* Main entry point.
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
int isok = ToolRunner.run(new Configuration(), new SecondarySort(), args);
System.exit(isok);
}
}
3. Secondary Sort – Sorting via the Key (KV Sort)
The in-memory version above buffers every value of a key in a list before sorting, which breaks down when a single key carries a very large number of values; here the sort is instead pushed into the shuffle by making the (first, second) pair itself the map output key.
SecondarySort_kv.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
/**
* Secondary sort via the key: the (first, second) pair itself is the map
* output key, so the shuffle sorts the records and the reducer only has to
* emit each key.
* @author lyd
*
* Input data (tab separated):
778 89
8 55
768 88
768 68
778 90
798 68
8 99
8 12
768 78
* Expected output (sorted by the composite key, ascending):
8 12
8 55
8 99
768 68
768 78
768 88
778 89
778 90
798 68
*/
public class SecondarySort_kv implements Tool{
/**
* The map phase: wrap (first, second) into a SecondarySortWritable composite key.
* @author lyd
*
*/
public static class MyMapper extends Mapper<Object, Text, SecondarySortWritable, IntWritable>{
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
}
@Override
protected void map(Object key, Text value,Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
SecondarySortWritable ssw = new SecondarySortWritable(Integer.parseInt(fields[0]), Integer.parseInt(fields[1]));
context.write(ssw, new IntWritable(Integer.parseInt(fields[1])));
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
}
}
/**
* The reduce phase: the composite keys arrive already sorted; just emit them.
* @author lyd
*
*/
public static class MyReducer extends Reducer<SecondarySortWritable, IntWritable, SecondarySortWritable, Text>{
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
}
@Override
protected void reduce(SecondarySortWritable key, Iterable<IntWritable> values,Context context)
throws IOException, InterruptedException {
context.write(key, new Text(""));
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
}
}
private Configuration conf;
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return (conf == null) ? new Configuration() : conf;
}
/**
* Driver method, invoked by ToolRunner.
*/
public int run(String[] args) throws Exception {
Configuration conf = getConf();
conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
Job job = Job.getInstance(conf, "SecondarySort");
job.setJarByClass(SecondarySort_kv.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(SecondarySortWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(SecondarySortWritable.class);
job.setOutputValueClass(Text.class);
setArgs(job,args);
// submit the job and wait for completion
int issuccessed = job.waitForCompletion(true) ? 0 : 1;
return issuccessed;
}
/**
* Handle the input/output path arguments.
* @param job
* @param args
*/
public static void setArgs(Job job , String[] args){
try {
if(args.length != 2){
System.out.println("wrong number of arguments!");
System.out.println("Usage: yarn jar *.jar SecondarySort_kv /inputdata /outputdata");
System.exit(-1);
}
// set the input path
FileInputFormat.addInputPath(job, new Path(args[0]));
// delete the output directory if it already exists, so the job can be rerun
FileSystem fs = FileSystem.get(job.getConfiguration());
Path op = new Path(args[1]);
if(fs.exists(op)){
fs.delete(op, true);
}
// set the output path
FileOutputFormat.setOutputPath(job, new Path(args[1]));
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* Main entry point.
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
int isok = ToolRunner.run(new Configuration(), new SecondarySort_kv(), args);
System.exit(isok);
}
}
Implementing a custom data type
A custom data type must implement either the Writable interface or the WritableComparable interface.
The former requires the write() and readFields() methods.
The latter additionally requires a compareTo() method, which supplies the sort order and is therefore needed for any type used as a key. A minimal Writable, for contrast, is sketched below.
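For contrast, a minimal sketch of a plain Writable (a hypothetical PairWritable, not used anywhere in these notes): it can be serialized as a map output value, but without compareTo() it cannot serve as a key.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class PairWritable implements Writable {
    private int first;
    private int second;

    // the framework needs a no-arg constructor to create instances before calling readFields()
    public PairWritable() {
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(first);   // field order here ...
        out.writeInt(second);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        first = in.readInt();  // ... must match the order used in write()
        second = in.readInt();
    }
}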
SecondarySortWritable.java
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;
/**
* @ClassName SecondarySortWritable
* @Author lyd
* @Version 1.0
* @Description Composite key holding (first, second); compareTo() defines the sort order.
**/
public class SecondarySortWritable implements WritableComparable<SecondarySortWritable> {
private int first;
private int second;
public SecondarySortWritable(){
}
public SecondarySortWritable(int first, int second) {
this.first = first;
this.second = second;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(this.first);
dataOutput.writeInt(this.second);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.first = dataInput.readInt();
this.second = dataInput.readInt();
}
@Override
public int compareTo(SecondarySortWritable o) {
if(o == this){
return 0;
}
// ascending on first, then ascending on second, matching the expected output above
// (swap the arguments of Integer.compare for a descending order)
int tmp = Integer.compare(this.first, o.first);
if(tmp != 0){
return tmp;
}
return Integer.compare(this.second, o.second);
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
SecondarySortWritable that = (SecondarySortWritable) o;
return first == that.first &&
second == that.second;
}
@Override
public int hashCode() {
return Objects.hash(first, second);
}
public int getFirst() {
return first;
}
public void setFirst(int first) {
this.first = first;
}
public int getSecond() {
return second;
}
public void setSecond(int second) {
this.second = second;
}
@Override
public String toString() {
return "SecondarySortWritable{" +
"first=" + first +
", second=" + second +
'}';
}
}
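A caveat about the KV version above, with a hedged sketch of a common extension: as written, partitioning and grouping use the whole composite key, so every (first, second) pair forms its own reduce group, and with more than one reduce task rows sharing the same first value can land in different output files. The usual secondary-sort pattern adds a partitioner and a grouping comparator that look only at the first field (the class names below are illustrative, not part of the original notes):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

// Send every record with the same first field to the same reducer.
class FirstFieldPartitioner extends Partitioner<SecondarySortWritable, IntWritable> {
    @Override
    public int getPartition(SecondarySortWritable key, IntWritable value, int numPartitions) {
        return (key.getFirst() & Integer.MAX_VALUE) % numPartitions;
    }
}

// Group keys by the first field only, so one reduce() call sees all of its second values.
class FirstFieldGroupingComparator extends WritableComparator {
    protected FirstFieldGroupingComparator() {
        super(SecondarySortWritable.class, true);
    }
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        SecondarySortWritable x = (SecondarySortWritable) a;
        SecondarySortWritable y = (SecondarySortWritable) b;
        return Integer.compare(x.getFirst(), y.getFirst());
    }
}

The driver would then register them with job.setPartitionerClass(FirstFieldPartitioner.class) and job.setGroupingComparatorClass(FirstFieldGroupingComparator.class); with grouping enabled, the reducer would write the key inside the value loop, since the framework updates the composite key as the values are iterated.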
4. Top N of the Aggregated Results
TopN.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
import java.util.TreeSet;
/**
Find the top 3 words by count.
Input data:
hadoop hadoop hadoop hadoop hadoop is ok is nice is better
spark hbase hive flume nice
Expected output:
hadoop 5
is 3
nice 2
**/
public class TopN implements Tool {
private Configuration conf = new Configuration();
/**
* The map phase: split each line on spaces and emit (word, "1").
* @author lyd
*
*/
public static class MyMapper extends Mapper<Object, Text, Text, Text>{
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
}
@Override
protected void map(Object key, Text value,Context context)
throws IOException, InterruptedException {
String line = value.toString();
String [] fields = line.split(" ");
for (String string : fields) {
context.write(new Text(string), new Text(1+""));
}
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
}
}
/**
* The reduce phase: sum the counts and keep only the top N in a TreeSet.
* @author lyd
*
*/
public static class MyReducer extends Reducer<Text, Text, TopNWritable, NullWritable>{
TreeSet<TopNWritable> ts = new TreeSet<TopNWritable>();
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
}
int SIZE = 3; // the N in top-N
@Override
protected void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {
int counter = 0;
for (Text t : values) {
counter += Integer.parseInt(t.toString());
}
TopNWritable tn = new TopNWritable(key.toString(), counter);
// add every (word, count) pair to the TreeSet, ordered by compareTo (descending count)
ts.add(tn);
if(ts.size() > SIZE){
// drop the smallest element so at most N entries remain
ts.remove(ts.last());
}
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
for (TopNWritable topNWritable : ts) {
context.write(topNWritable, NullWritable.get());
}
}
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return this.conf;
}
/**
* Driver method, invoked by ToolRunner.
*/
public int run(String[] args) throws Exception {
Configuration conf = getConf();
conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
Job job = Job.getInstance(conf, "TopN");
job.setJarByClass(TopN.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(TopNWritable.class);
job.setOutputValueClass(NullWritable.class);
setArgs(job,args);
// submit the job and wait for completion
int issuccessed = job.waitForCompletion(true) ? 0 : 1;
return issuccessed;
}
/**
* Handle the input/output path arguments.
* @param job
* @param args
*/
public static void setArgs(Job job , String[] args){
try {
if(args.length != 2){
System.out.println("wrong number of arguments!");
System.out.println("Usage: yarn jar *.jar TopN /inputdata /outputdata");
System.exit(-1);
}
// set the input path
FileInputFormat.addInputPath(job, new Path(args[0]));
// delete the output directory if it already exists, so the job can be rerun
FileSystem fs = FileSystem.get(job.getConfiguration());
Path op = new Path(args[1]);
if(fs.exists(op)){
fs.delete(op, true);
}
// set the output path
FileOutputFormat.setOutputPath(job, new Path(args[1]));
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* Main entry point.
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
int isok = ToolRunner.run(new Configuration(), new TopN(), args);
System.exit(isok);
}
}
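One thing worth noting (an observation about this demo, not a change to it): the TreeSet lives inside a single reducer instance, so the result is only a global top 3 when the job runs with exactly one reduce task, which happens to be Hadoop's default. With several reducers, each one would emit its own local top N. Making that assumption explicit in run() would be a single illustrative line:

// illustrative addition to run(): force a single reduce task so the
// TreeSet in MyReducer sees every (word, count) pair
job.setNumReduceTasks(1);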
Implementing the custom data type
TopNWritable.java
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;
/**
* @ClassName TopNWritable
* @Author lyd
* @Version 1.0
* @Description
* Custom data type:
* 1. Implement WritableComparable (sortable) or Writable (serializable only, cannot be sorted).
* 2. The fields written in write() and read in readFields() must match in order, type and count.
* 3. compareTo() defines the sort order.
**/
public class TopNWritable implements WritableComparable<TopNWritable> {
private String words;
private int count;
public TopNWritable(){
}
public TopNWritable(String words, int count) {
this.words = words;
this.count = count;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(this.words);
dataOutput.writeInt(this.count);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.words = dataInput.readUTF();
this.count = dataInput.readInt();
}
@Override
public int compareTo(TopNWritable o) {
if(o == this){
return 0;
}
// descending by count so the largest counts come first in the TreeSet
int tmp = o.count - this.count;
if(tmp != 0){
return tmp;
}
// tie-break on the word so different words with equal counts are both kept
return this.words.compareTo(o.words);
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
TopNWritable that = (TopNWritable) o;
return count == that.count &&
Objects.equals(words, that.words);
}
@Override
public int hashCode() {
return Objects.hash(words, count);
}
@Override
public String toString() {
return "words='" + words + ", count=" + count;
}
}
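As a quick way to check point 2 in the javadoc above (write() and readFields() must agree on field order, type and count), here is a standalone round-trip sketch using Hadoop's DataOutputBuffer and DataInputBuffer; this is an illustration, not part of the original notes:

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class TopNWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        TopNWritable original = new TopNWritable("hadoop", 5);

        // serialize: writeUTF(words), then writeInt(count)
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        // deserialize into a fresh instance, reading in the same order
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        TopNWritable copy = new TopNWritable();
        copy.readFields(in);

        // prints true when write() and readFields() agree on the field layout
        System.out.println(original.equals(copy));
    }
}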