如果还是原来的文件,如果我们想求被访问次数最多的三个网站,这里和之前不一样。之前我们求的是每个网站访问前三的页面。这里我们如果还是和之前一样调用reduce方法我们只能知道一组数据的总次数,我们根本不知道其他组的总次数,所以不能进行比较。所以我们就需要将每一次的结果使用hashmap记录下来。那么什么时候才能使用比较方法呢?我们希望能在所有reduce方法执行完之后能够再次执行比较方法。这个时候就需要我们知道当reduce方法的worker将数据一组一组的调完之后,它最后还会调用另一个方法cleanup。这个方法用来收尾。
这里我们换一种实现方法,我们不使用HashMap,我们使用TreeMap来实现。TreeMap也是Map的一种实现,它可以帮我们排序。但是需要我们告诉它怎么比大小,我们要传一个Comparator给它,TreeMap会比较key的大小,所以我们就要先想好key放什么,value放什么。排序的依据:key本身是Comparable的(String本身实现了Comparable,它有compareTo方法,Integer等包装类也实现了),或者就需要传入一个专门的Comparator对象告诉它怎么比较两个key数据的大小。如果我们需要比较的类实现了Comparable接口而且我们同时又传入Comparator,它会用哪个进行比较?答案是会用我们传入的Comparator:有专门的比较器则用专门的比较器,没有专门的比较器时才会用key本身实现的Comparable。如果比较出来两个key相同怎么办?TreeMap会保留已有的key对象,但新put进来的value会覆盖旧的value(put方法会返回被替换掉的旧value),所以同一个key不会出现两条记录。下面是使用TreeMap的demo:
package com.test.treemap;
/**
 * Simple value object pairing a site name with a visit count.
 * Used as a TreeMap key in {@code TreeMapDemo} to show custom ordering.
 */
public class Person {
    // Site identifier, e.g. "qq.com".
    private String field;
    // Number of visits recorded for the site.
    private int count;

    /**
     * Creates a Person holding the given site name and visit count.
     */
    public Person(String siteField, int visitCount) {
        this.field = siteField;
        this.count = visitCount;
    }

    public String getField() {
        return field;
    }

    public int getCount() {
        return count;
    }

    public void setField(String field) {
        this.field = field;
    }

    public void setCount(int count) {
        this.count = count;
    }
}
package com.test.treemap;
import java.util.Map.Entry;
import java.util.Comparator;
import java.util.Set;
import java.util.TreeMap;
/**
* treemap会按照放入的数据的key排序
* 排序的依据:key本身是comparable的
* @author 刘峰瑞
*
* 2018年8月15日上午8:54:34
*/
/**
 * Demonstrates how TreeMap orders its entries by key:
 * <ul>
 *   <li>String keys sort lexicographically (natural Comparable order);</li>
 *   <li>Integer keys sort numerically; putting an equal key keeps the key
 *       but replaces the value;</li>
 *   <li>a supplied Comparator takes precedence over the key's own
 *       Comparable implementation.</li>
 * </ul>
 */
public class TreeMapDemo {
    public static void main(String[] args) {
        TreeMap<String, Integer> map = new TreeMap<>();
        map.put("a", 1);
        map.put("c", 2);
        map.put("b", 3);
        // String keys compare in dictionary (lexicographic) order.
        Set<Entry<String, Integer>> entrySet = map.entrySet();
        for (Entry<String, Integer> entry : entrySet) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }

        TreeMap<Integer, String> map2 = new TreeMap<>();
        map2.put(1, "a");
        // Equal key: the existing key 1 is kept, but its value is replaced,
        // so this map ends up containing (1, "k").
        map2.put(1, "k");
        map2.put(2, "c");
        map2.put(3, "b");
        map2.put(6, "f");
        map2.put(5, "h");
        map2.put(4, "j");
        Set<Entry<Integer, String>> intEntrySet = map2.entrySet();
        for (Entry<Integer, String> entry : intEntrySet) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }

        System.out.println("-------------------------------");
        Person p1 = new Person("qq.com", 3);
        Person p2 = new Person("163.com", 34);
        Person p3 = new Person("sina.com", 3);
        // A Comparator passed to the constructor overrides any Comparable
        // the key class may implement.
        TreeMap<Person, Object> map3 = new TreeMap<Person, Object>(new Comparator<Person>() {
            @Override
            public int compare(Person o1, Person o2) {
                // Descending by count; ties broken by field name (also
                // descending). Integer.compare avoids the overflow that
                // plain int subtraction can produce for extreme counts.
                int byCount = Integer.compare(o2.getCount(), o1.getCount());
                return byCount != 0 ? byCount : o2.getField().compareTo(o1.getField());
            }
        });
        map3.put(p1, null);
        map3.put(p2, null);
        map3.put(p3, null);
        Set<Entry<Person, Object>> entrySet2 = map3.entrySet();
        for (Entry<Person, Object> entry : entrySet2) {
            System.out.println(entry.getKey().getField() + " " + entry.getKey().getCount());
        }
    }
}
运行截图:
接下来正式贴出本项目的代码:
package com.test.wordcount2;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper for the Top-N sites job: emits (site, 1) for every access-log line.
 * The actual counting happens in {@code TopReducer}.
 */
public class TopMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
    // Reused across map() calls so we do not allocate per record.
    Text k = new Text();
    IntWritable v = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Assumes space-separated lines whose second token is "site/page"
        // (e.g. "... qq.com/news ...") — TODO confirm against the input file.
        String[] tokens = value.toString().split(" ");
        // The site name is everything before the first '/'.
        String site = tokens[1].split("/")[0];
        k.set(site);
        v.set(1);
        context.write(k, v);
    }
}
package com.test.wordcount2;
import java.io.IOException;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer for the Top-N sites job. Each reduce() call totals one site's
 * accesses and stashes the result in a TreeMap (ordered by Count.compareTo);
 * cleanup() then runs once, after all groups, and emits only the top N.
 */
public class TopReducer extends Reducer<Text, IntWritable, Count, NullWritable>{
    // Accumulates one Count per site across all reduce() invocations.
    TreeMap<Count,Object> map = new TreeMap<>();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
        // Sum this site's access count.
        int total = 0;
        for (IntWritable value : values) {
            total += value.get();
        }
        map.put(new Count(key.toString(), total), null);
    }

    @Override
    protected void cleanup(Reducer<Text, IntWritable, Count, NullWritable>.Context context)
            throws IOException, InterruptedException {
        // "top.n" is set by the driver; fall back to 3, and never ask for
        // more entries than the map actually holds.
        int topn = Math.min(context.getConfiguration().getInt("top.n", 3), map.size());
        int emitted = 0;
        for (Entry<Count, Object> entry : map.entrySet()) {
            // The key is written via Count.toString(), hence that override.
            context.write(entry.getKey(), NullWritable.get());
            emitted++;
            if (emitted == topn) {
                break;
            }
        }
    }
}
package com.test.wordcount2;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Writable key pairing a site name with its visit count.
 * Ordering: descending by count, with ties broken by site name (ascending),
 * so distinct sites with equal counts are not collapsed by TreeMap.
 */
public class Count implements WritableComparable<Count>{
    private String field;
    private int count;

    /** No-arg constructor required by Hadoop for deserialization. */
    public Count() {
    }

    public Count(String field, int count) {
        this.field = field;
        this.count = count;
    }

    public String getField() {
        return field;
    }

    public void setField(String field) {
        this.field = field;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    /** Rendered directly into the output file as "field count". */
    @Override
    public String toString() {
        return field+" "+count;
    }

    @Override
    public int compareTo(Count o) {
        // Integer.compare avoids the overflow risk of subtracting ints.
        int byCount = Integer.compare(o.count, this.count); // descending count
        return byCount != 0 ? byCount : this.field.compareTo(o.getField());
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Must read fields in exactly the order write() emits them.
        field = in.readUTF();
        count = in.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Was previously empty: an empty write() serializes nothing, which
        // breaks any use of this key in a shuffle or sequence file.
        out.writeUTF(field);
        out.writeInt(count);
    }
}
package com.test.wordcount2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Job driver for the Top-N sites MapReduce program.
 * Usage: Driver &lt;input path&gt; &lt;output path&gt; [topN]
 * The third argument is optional; TopReducer defaults "top.n" to 3.
 */
public class Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Only set "top.n" when the caller supplied it; previously args[2]
        // was read unconditionally and crashed with two arguments even
        // though the reducer already has a default.
        if (args.length > 2) {
            conf.set("top.n", args[2]);
        }
        Job job = Job.getInstance(conf);
        // Locate the jar to ship via the driver class.
        job.setJarByClass(Driver.class);
        // Mapper and Reducer classes for the map and reduce tasks.
        job.setMapperClass(TopMapper.class);
        job.setReducerClass(TopReducer.class);
        // Key/value types produced by the map tasks.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Key/value types produced by the reduce tasks; the Count key is
        // rendered through its toString() by the text output format.
        job.setOutputKeyClass(Count.class);
        job.setOutputValueClass(NullWritable.class);
        // Input and output paths.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Exactly ONE reducer: with two reduce tasks each would compute its
        // own (inconsistent) top N over only part of the data.
        job.setNumReduceTasks(1);
        boolean res = job.waitForCompletion(true);
        System.out.println(res?"mr程序成功执行":"mr程序好像被外星人抓走了");
    }
}
如果嫌每次执行都要手动删除output目录太麻烦,我们可以在设置输出路径之前加上这样一段代码,让程序自动删除已存在的输出目录:
FileInputFormat.setInputPaths(job, new Path(args[0]));
// Configure the output path; delete it first if it already exists so the
// job does not fail with "output directory already exists".
FileSystem fs = FileSystem.get(conf);
Path p = new Path(args[1]);
if(fs.exists(p)){
// true = recursive delete, since the output dir contains part files
fs.delete(p, true);
}
FileOutputFormat.setOutputPath(job, p);