1 分析数据去重 Distinct
原文件内容
2015-3-1a
2015-3-2b
2015-3-1a
2015-5-4d
目标结果
2015-3-1a
2015-3-2b
2015-5-4d
MyMapper.java
package DataDistinct;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Deduplication mapper: emits each input line as the output KEY with an
 * empty value, so the shuffle phase groups identical lines together and
 * the reducer can collapse duplicates.
 */
public class MyMapper extends Mapper<Object, Text, Text, Text> {
    /** Reused empty value — avoids allocating a new Text for every record. */
    private final Text empty = new Text("");

    @Override
    protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        // The whole line becomes the key; duplicate lines produce the same key.
        context.write(value, empty);
    }
}
MyReducer.java
package DataDistinct;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Deduplication reducer: each distinct line arrives exactly once as a key,
 * so writing the key once (ignoring the grouped values) yields the
 * de-duplicated output.
 */
public class MyReducer extends Reducer<Text, Text, Text, Text> {
    /** Reused empty value — avoids allocating a new Text per output record. */
    private final Text empty = new Text("");

    @Override
    protected void reduce(Text k2, Iterable<Text> values,
            Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        // The values are irrelevant; emitting the key once removes duplicates.
        context.write(k2, empty);
    }
}
MyMain.java
package DataDistinct;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the line-deduplication job.
 *
 * Usage: MyMain [inputPath] [outputPath]
 * Defaults to /hw.txt -&gt; /hw3 (the original hard-coded paths) when no
 * arguments are given, so existing invocations keep working.
 */
public class MyMain {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.16.199:9000");
        System.setProperty("HADOOP_USER_NAME", "yao");

        Job job = Job.getInstance(conf, "data-distinct");
        job.setJarByClass(MyMain.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Allow overriding the paths from the command line; fall back to the
        // original defaults for backward compatibility.
        String input = args.length > 0 ? args[0] : "/hw.txt";
        String output = args.length > 1 ? args[1] : "/hw3";
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Propagate job success/failure as the process exit code instead of
        // silently discarding the result of waitForCompletion.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
result
[yao@master ~]$ hadoop fs -ls /hw3
Found 2 items
-rw-r--r-- 3 yao supergroup 0 2019-03-20 09:52 /hw3/_SUCCESS
-rw-r--r-- 3 yao supergroup 33 2019-03-20 09:52 /hw3/part-r-00000
[yao@master ~]$ hadoop fs -cat /hw3/part-r-00000
2015-3-1a
2015-3-2b
2015-5-4d
2 数据排序并加序号 NumSort
原文件内容
234
23
45
2
767
34
56
100
目标结果
1 2
2 23
3 34
4 45
5 56
6 100
7 234
8 767
NumSortMapper.java
package org.yao.numsort;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Sort mapper: parses each line as an integer and emits it as the key with
 * a count of 1. MapReduce's shuffle sorts the IntWritable keys, so the
 * reducer receives the numbers in ascending order.
 */
public class NumSortMapper extends Mapper<Object, Text, IntWritable, IntWritable> {
    /** Reused output key — avoids allocating a writable per record. */
    private final IntWritable outKey = new IntWritable();
    /** Constant count value shared by all records. */
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(Object key, Text value, Mapper<Object, Text, IntWritable, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Trim so stray whitespace doesn't break parsing; skip blank lines
        // instead of crashing the task with a NumberFormatException.
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return;
        }
        outKey.set(Integer.parseInt(line));
        context.write(outKey, ONE);
    }
}
NumSortReducer.java
package org.yao.numsort;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Sort reducer: keys arrive in ascending order; each output line is
 * "rank number". Duplicate inputs each get their own rank because one line
 * is written per occurrence (per value in the group).
 *
 * NOTE: the rank is a plain field, so a globally consecutive sequence
 * requires the job to run with exactly ONE reducer (the default here).
 */
public class NumSortReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    /** Running rank; persists across reduce() calls within this task. */
    private int rank = 1;
    /** Reused output key — avoids allocating a new IntWritable per line. */
    private final IntWritable rankWritable = new IntWritable();

    @Override
    protected void reduce(IntWritable k2, Iterable<IntWritable> values,
            Reducer<IntWritable, IntWritable, IntWritable, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // One output per occurrence of the number; context.write serializes
        // immediately, so mutating rankWritable between writes is safe.
        for (IntWritable ignored : values) {
            rankWritable.set(rank++);
            context.write(rankWritable, k2);
        }
    }
}
MyMain.java
package org.yao.numsort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the number-sorting job.
 *
 * Usage: MyMain [inputPath] [outputPath]
 * Defaults to /sort.txt -&gt; /sort (the original hard-coded paths) when no
 * arguments are given, so existing invocations keep working.
 */
public class MyMain {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.16.199:9000");
        System.setProperty("HADOOP_USER_NAME", "yao");

        Job job = Job.getInstance(conf, "num-sort");
        job.setJarByClass(MyMain.class);
        job.setMapperClass(NumSortMapper.class);
        job.setReducerClass(NumSortReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Allow overriding the paths from the command line; fall back to the
        // original defaults for backward compatibility.
        String input = args.length > 0 ? args[0] : "/sort.txt";
        String output = args.length > 1 ? args[1] : "/sort";
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Propagate job success/failure as the process exit code instead of
        // silently discarding the result of waitForCompletion.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
result
[yao@master ~]$ hadoop fs -cat /sort/part-r-00000
1 2
2 23
3 34
4 45
5 56
6 100
7 234
8 767