mapreduce-多个文件的输入,实现每个单词的文件次数倒序排序

1、第一次mapreduce

package cn.itcast.mr.combineSort2;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class combineSortMRD {

	/**
	 * First-pass mapper: tags every word with the name of the file it came
	 * from. For each whitespace-delimited token emits ("word-->fileName", 1).
	 */
	static class combineSortMapper extends Mapper<LongWritable, Text, Text, IntWritable>{

		// Reuse output Writables instead of allocating new ones per record;
		// per-record allocation is needless GC pressure on large inputs.
		private final Text outKey = new Text();
		private static final IntWritable ONE = new IntWritable(1);

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {

			String line = value.toString();
			String[] words = line.split(" ");

			// The input split tells us which source file this line belongs to.
			FileSplit inputSplit = (FileSplit) context.getInputSplit();
			String name = inputSplit.getPath().getName();

			for (String word : words) {
				outKey.set(word + "-->" + name);
				context.write(outKey, ONE);
			}
		}
	}

	/**
	 * First-pass reducer: sums the counts for each "word-->file" key, yielding
	 * the number of occurrences of that word in that file.
	 */
	static class combineSortReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

		private final IntWritable result = new IntWritable();

		@Override
		protected void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {

			int count = 0;
			for (IntWritable value : values) {
				count += value.get();
			}
			result.set(count);
			context.write(key, result);
		}
	}

	/**
	 * Job driver. args[0] = input directory, args[1] = output directory;
	 * falls back to hard-coded HDFS paths when run without arguments.
	 */
	public static void main(String[] args) throws Exception {
		if (args.length == 0) {
			args = new String[2];
			args[0] = "hdfs://192.168.40.10:9000/combineSort/input/";
			args[1] = "hdfs://192.168.40.10:9000/combineSort/output1";
		}

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(combineSortMRD.class);

		job.setMapperClass(combineSortMapper.class);
		// Combiner is valid here: map output types (Text, IntWritable) match
		// MyCombiner's signature and summation is associative/commutative.
		job.setCombinerClass(MyCombiner.class);
		job.setReducerClass(combineSortReducer.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));

		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// waitForCompletion(true) prints job progress to the console.
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}

}

2、第二次mapreduce

combineSortMRD1.java

package cn.itcast.mr.combineSort2;

import java.io.IOException;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class combineSortMRD1 {

	/**
	 * Second-pass mapper: splits the first job's "word-->file \t count" lines
	 * apart and re-keys by the bare word, emitting "file \t count" as value.
	 */
	static class combineSortMRD1Mapper extends Mapper<LongWritable, Text, Text, Text>{

		// Reused output Writables to avoid per-record allocation.
		private final Text outKey = new Text();
		private final Text outValue = new Text();

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {

			// First-job output line format: "word-->fileName \t count"
			String line = value.toString();
			String[] split = line.split("\t");

			String[] wordAndFile = split[0].split("-->");
			outKey.set(wordAndFile[0]);
			outValue.set(wordAndFile[1] + "\t" + split[1]);
			context.write(outKey, outValue);
		}
	}

	/**
	 * Second-pass reducer: collects all (file, count) pairs for a word into a
	 * TreeSet of FileNumber, whose compareTo sorts by count in descending
	 * order, then writes the sorted set as the word's output value.
	 */
	static class combineSortMRD1Reducer extends Reducer<Text, Text, Text, Text>{

		@Override
		public void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// Local, not a field: each reduce call needs a fresh sorted set.
			TreeSet<FileNumber> buffer = new TreeSet<FileNumber>();
			for (Text val : values) {
				// Value layout: "fileName \t count", e.g. "a.txt\t3"
				String[] split = val.toString().split("\t");

				String fileName = split[0];

				int number = Integer.parseInt(split[1]);

				buffer.add(new FileNumber(fileName, number));
			}
			context.write(key, new Text(buffer.toString()));
		}
	}

	/**
	 * Job driver. args[0] = first job's output directory, args[1] = final
	 * output directory; falls back to hard-coded HDFS paths.
	 */
	public static void main(String[] args) throws Exception {
		if (args.length == 0) {
			args = new String[2];
			args[0] = "hdfs://192.168.40.10:9000/combineSort/output1/";
			args[1] = "hdfs://192.168.40.10:9000/combineSort/output2";
		}

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(combineSortMRD1.class);

		job.setMapperClass(combineSortMRD1Mapper.class);
		// BUG FIX: the original registered MyCombiner (a
		// Reducer<Text, IntWritable, Text, IntWritable>) as combiner, but this
		// job's map output value type is Text — the type mismatch fails at
		// runtime. This job needs no combiner at all.
		job.setReducerClass(combineSortMRD1Reducer.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// waitForCompletion(true) prints job progress to the console.
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}
}

MyCombiner.java（提高效率）

package cn.itcast.mr.combineSort2;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Map-side combiner: pre-aggregates per-key counts before the shuffle so less
 * data crosses the network. Performs the same summation as the job's reducer,
 * which is safe because addition is associative and commutative.
 */
public class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable>{

	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		// Sum every partial count that arrived for this key.
		int total = 0;
		for (IntWritable partial : values) {
			total = total + partial.get();
		}
		context.write(key, new IntWritable(total));
	}
}

FileNumber.java(实现次数倒序排序)

package cn.itcast.mr.combineSort2;

import org.apache.hadoop.io.IntWritable;
/**
 * Value object pairing a file name with a word's occurrence count in that
 * file. Natural order is by count DESCENDING, ties broken by file name
 * ascending, giving a total order that is consistent with equals (required
 * for correct behavior inside TreeSet).
 */
public class FileNumber implements Comparable<FileNumber>{

	private String fileName;
	private int number;

	public FileNumber(String fileName, int number) {
		super();
		this.fileName = fileName;
		this.number = number;
	}

	public FileNumber() {}


	public int getNumber() {
		return number;
	}
	public String getFileName(){
		return fileName;
	}

	@Override
	public String toString() {
		return fileName + "-->" + number;
	}

	/**
	 * Orders by occurrence count descending, then file name ascending.
	 *
	 * Fixes over the original:
	 * - Integer.compare avoids the overflow risk of subtracting ints.
	 * - The tie-break now uses String.compareTo; the original compared file
	 *   names with == (reference identity) and returned 1 or -1 but never 0,
	 *   violating the Comparable contract (x.compareTo(x) must be 0 and the
	 *   relation must be antisymmetric), which corrupts TreeSet ordering.
	 */
	@Override
	public int compareTo(FileNumber o) {
		// Arguments swapped on purpose: larger counts sort first (descending).
		int byCount = Integer.compare(o.number, this.number);
		if (byCount != 0) {
			return byCount;
		}
		return this.fileName.compareTo(o.fileName);
	}

	// equals/hashCode kept consistent with compareTo so sorted and hashed
	// collections agree on which elements are duplicates.
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof FileNumber)) {
			return false;
		}
		FileNumber other = (FileNumber) obj;
		return number == other.number
				&& (fileName == null ? other.fileName == null : fileName.equals(other.fileName));
	}

	@Override
	public int hashCode() {
		return 31 * (fileName == null ? 0 : fileName.hashCode()) + number;
	}
}

猜你喜欢

转载自blog.csdn.net/qq_38709565/article/details/83108534