20190319 - Deduplication and Sorting with MapReduce

1 Data Deduplication (Distinct)

Input file contents
2015-3-1a
2015-3-2b
2015-3-1a
2015-5-4d

Expected output
2015-3-1a
2015-3-2b
2015-5-4d
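
The idea rides on the shuffle phase: the mapper emits each whole line as the key (with an empty value), MapReduce groups identical keys together, and the reducer writes each key out exactly once, so duplicates disappear without any explicit comparison logic.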

MyMapper.java

package DataDistinct;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<Object, Text, Text, Text> {
	@Override
	protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
			throws IOException, InterruptedException {
		// Emit the whole line as the key with an empty value; the shuffle
		// phase groups identical keys, so duplicate lines collapse into one group.
		context.write(value, new Text(""));
	}
}
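
Note that the empty Text value still makes TextOutputFormat print the key/value tab separator, which is why the result lines below end with a trailing tab. A variant mapper could emit NullWritable to drop it; a minimal sketch (the class name is made up, and the Reducer and driver value types would have to change to NullWritable as well):

package DataDistinct;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical variant of MyMapper: NullWritable suppresses the value
// entirely, so output lines carry no trailing tab after the key.
public class MyMapperNoTab extends Mapper<Object, Text, Text, NullWritable> {
	@Override
	protected void map(Object key, Text value, Context context)
			throws IOException, InterruptedException {
		context.write(value, NullWritable.get());
	}
}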

MyReducer.java

package DataDistinct;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReducer extends Reducer<Text, Text, Text, Text> {
	@Override
	protected void reduce(Text k2, Iterable<Text> values,
			Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
		// Each distinct line arrives here exactly once as a key; write it
		// out once and ignore the grouped (empty) values.
		context.write(k2, new Text(""));
	}
}

MyMain.java

package DataDistinct;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyMain {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		// Point the client at the HDFS NameNode and run as the "yao" user.
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", "hdfs://192.168.16.199:9000");
		System.setProperty("HADOOP_USER_NAME", "yao");

		Job job = Job.getInstance(conf);
		job.setJarByClass(MyMain.class);

		job.setMapperClass(MyMapper.class);
		job.setReducerClass(MyReducer.class);

		// Key/value types emitted by both the mapper and the reducer.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// Input file and output directory on HDFS; the output directory
		// must not exist before the job runs.
		FileInputFormat.addInputPath(job, new Path("/hw.txt"));
		FileOutputFormat.setOutputPath(job, new Path("/hw3"));

		job.waitForCompletion(true);
	}

}

result

[yao@master ~]$ hadoop fs -ls /hw3
Found 2 items
-rw-r--r--   3 yao supergroup          0 2019-03-20 09:52 /hw3/_SUCCESS
-rw-r--r--   3 yao supergroup         33 2019-03-20 09:52 /hw3/part-r-00000
[yao@master ~]$ hadoop fs -cat /hw3/part-r-00000
2015-3-1a	
2015-3-2b	
2015-5-4d
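
Because the reducer simply re-emits its key, it can also serve as a combiner, collapsing duplicate lines map-side before the shuffle. That is one optional extra line in the driver (standard Job API):

		// Run MyReducer map-side as a combiner so duplicates within each
		// split collapse before the shuffle; valid because its input and
		// output types both match the map output types (Text, Text).
		job.setCombinerClass(MyReducer.class);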

2 Sorting Numbers and Adding Sequence Numbers (NumSort)

Input file contents
234
23
45
2
767
34
56
100

Expected output
1 2
2 23
3 34
4 45
5 56
6 100
7 234
8 767
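
Again, no explicit sort is written: the mapper emits each number as an IntWritable key, the shuffle phase hands keys to the reducer in ascending order, and the reducer prepends a running counter as the rank (see the single-reducer note after the driver code below).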

NumSortMapper.java

package org.yao.numsort;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class NumSortMapper extends Mapper<Object, Text, IntWritable, IntWritable> {
	@Override
	protected void map(Object key, Text value, Mapper<Object, Text, IntWritable, IntWritable>.Context context)
			throws IOException, InterruptedException {
		// Emit each number as the key; the shuffle phase sorts IntWritable
		// keys in ascending numeric order. The value 1 marks one occurrence.
		int num = Integer.parseInt(value.toString());
		context.write(new IntWritable(num), new IntWritable(1));
	}
}
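
Integer.parseInt will throw NumberFormatException on a blank line and fail the task; a defensive variant of the map body might skip blanks first (a sketch, not in the original code):

		// Skip blank lines instead of letting the task die on the
		// NumberFormatException that parsing an empty string raises.
		String line = value.toString().trim();
		if (line.isEmpty()) {
			return;
		}
		context.write(new IntWritable(Integer.parseInt(line)), new IntWritable(1));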

NumSortReducer.java

package org.yao.numsort;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class NumSortReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
	// Running rank; keys arrive in sorted order, so incrementing this
	// field across reduce() calls numbers the values from smallest to largest.
	IntWritable num = new IntWritable(1);

	@Override
	protected void reduce(IntWritable k2, Iterable<IntWritable> values,
			Reducer<IntWritable, IntWritable, IntWritable, IntWritable>.Context context)
			throws IOException, InterruptedException {
		// One output line per occurrence, so duplicate inputs each get
		// their own sequence number.
		for (IntWritable count : values) {
			context.write(num, k2);
			num = new IntWritable(num.get() + 1);
		}
	}
}

MyMain.java

package org.yao.numsort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyMain {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf=new Configuration();
		conf.set("fs.defaultFS", "hdfs://192.168.16.199:9000");
		System.setProperty("HADOOP_USER_NAME", "yao");
		
		Job job=Job.getInstance(conf);
		job.setJarByClass(MyMain.class);
		
		job.setMapperClass(NumSortMapper.class);
		job.setReducerClass(NumSortReducer.class);
		
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(IntWritable.class);
		
		FileInputFormat.addInputPath(job, new Path("/sort.txt"));
		FileOutputFormat.setOutputPath(job, new Path("/sort"));
		
		job.waitForCompletion(true);
	}

}
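
The global numbering only holds if every key reaches the same reducer. MapReduce defaults to a single reduce task, but the driver can pin it explicitly (standard Job API):

		// One reduce task means one fully sorted key stream, so the counter
		// in NumSortReducer yields globally consecutive sequence numbers.
		// With several reducers, each would number its own key range.
		job.setNumReduceTasks(1);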

result

[yao@master ~]$ hadoop fs -cat /sort/part-r-00000
1	2
2	23
3	34
4	45
5	56
6	100
7	234
8	767

Reprinted from blog.csdn.net/shayuwei/article/details/88747637