Data to analyze (one page view per line: date and URL). The goal is to group the records by domain and output, for each domain, the three most frequently visited URLs:
2017/07/28 qq.com/a
2017/07/28 qq.com/bx
2017/07/28 qq.com/by
2017/07/28 qq.com/by3
2017/07/28 qq.com/news
2017/07/28 sina.com/news/socail
2017/07/28 163.com/ac
2017/07/28 sina.com/news/socail
2017/07/28 163.com/sport
2017/07/28 163.com/ac
2017/07/28 sina.com/play
2017/07/28 163.com/sport
2017/07/28 163.com/ac
2017/07/28 sina.com/movie
2017/07/28 sina.com/play
2017/07/28 sina.com/movie
2017/07/28 163.com/sport
2017/07/28 sina.com/movie
2017/07/28 163.com/ac
2017/07/28 163.com/ac
2017/07/28 163.com/acc
2017/07/28 qq.com/by
2017/07/28 qq.com/by3
2017/07/28 qq.com/news
2017/07/28 163.com/sport
2017/07/28 sina.com/news/socail
2017/07/28 163.com/sport
2017/07/28 sina.com/movie
2017/07/28 sina.com/news/socail
2017/07/28 sina.com/movie
2017/07/28 qq.com/news
2017/07/28 163.com/bb
2017/07/28 163.com/cc
2017/07/28 sina.com/lady/
2017/07/28 163.com/cc
2017/07/28 qq.com/news
2017/07/28 qq.com/by
2017/07/28 qq.com/by3
2017/07/28 sina.com/lady/
2017/07/28 qq.com/by3
2017/07/28 sina.com/lady/
2017/07/28 qq.com/by3
2017/07/28 qq.com/news
2017/07/28 qq.com/by3
2017/07/28 163.com/sport
2017/07/28 163.com/sport
2017/07/28 sina.com/news/socail
2017/07/28 sina.com/lady/
2017/07/28 sina.com/play
2017/07/28 sina.com/movie
2017/07/28 sina.com/music
2017/07/28 sina.com/sport
2017/07/28 sina.com/sport
package com.test.wordcount;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, Text> {

    Text k = new Text();
    Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each input line looks like "2017/07/28 qq.com/a"
        String line = value.toString();
        String[] split = line.split(" ");
        // The domain (everything before the first "/") becomes the key,
        // the full URL becomes the value, so all URLs of one domain
        // end up in the same reduce call
        String field = split[1].split("/")[0];
        String url = split[1];
        k.set(field);
        v.set(url);
        context.write(k, v);
    }
}
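For example, for the input line 2017/07/28 qq.com/a the mapper emits the key qq.com and the value qq.com/a, so every URL of the same domain is routed to the same reduce call.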
package com.test.wordcount;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, Text, Text, IntWritable> {

    Text k = new Text();
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        HashMap<String, Integer> map = new HashMap<>();
        // Count how many times each URL of this domain was visited
        for (Text url : values) {
            if (map.containsKey(url.toString())) {
                map.put(url.toString(), map.get(url.toString()) + 1);
            } else {
                map.put(url.toString(), 1);
            }
        }
        // Sort the URLs by count and emit the (at most) three most visited ones
        Set<Entry<String, Integer>> entrySet = map.entrySet();
        List<Entry<String, Integer>> list = new ArrayList<Entry<String, Integer>>(entrySet);
        List<Entry<String, Integer>> sort = SortUtil.sort(list);
        int top = Math.min(3, sort.size());
        for (int i = 0; i < top; i++) {
            k.set(sort.get(i).getKey());
            v.set(sort.get(i).getValue());
            context.write(k, v);
        }
    }
}
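Because the map output key is the domain, each reduce() call receives the complete set of URLs for exactly one domain, so the in-memory HashMap only has to hold the distinct URLs of that domain rather than the whole data set.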
package com.test.wordcount;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map.Entry;
public class SortUtil {

    // Sorts the (url, count) entries in descending order of count
    public static List<Entry<String, Integer>> sort(List<Entry<String, Integer>> list) {
        Collections.sort(list, new Comparator<Entry<String, Integer>>() {
            @Override
            public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                return Integer.compare(o2.getValue(), o1.getValue());
            }
        });
        return list;
    }
}
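On Java 8 and later the same descending sort can also be written more compactly with the built-in entry comparator (an equivalent sketch; it additionally needs import java.util.Map):
list.sort(Map.Entry.<String, Integer>comparingByValue(Comparator.reverseOrder()));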
package com.test.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.io.Text;
public class Driver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Set the jar to ship with the job, located dynamically via the class loader
        job.setJarByClass(Driver.class);

        // Mapper and Reducer classes invoked by the map tasks and reduce tasks
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // Key/value types produced by the map tasks
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Key/value types produced by the reduce tasks
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input path (args[1]) and output path (args[2])
        FileInputFormat.setInputPaths(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        // Number of parallel reduce tasks (args[0])
        job.setNumReduceTasks(Integer.parseInt(args[0]));

        boolean res = job.waitForCompletion(true);
        System.out.println(res ? "MR job finished successfully" : "looks like the MR job was abducted by aliens");
    }
}
We can also run and debug the job locally: MapReduce ships with a local job runner that simulates the map and reduce tasks with threads inside a single JVM.
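A minimal sketch of forcing local mode in the Driver (assuming no cluster configuration files are on the classpath; the property names are the standard Hadoop 2.x keys):
Configuration conf = new Configuration();
// run map/reduce tasks in-process instead of submitting to YARN
conf.set("mapreduce.framework.name", "local");
// read input and write output on the local file system instead of HDFS
conf.set("fs.defaultFS", "file:///");
Job job = Job.getInstance(conf);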
Running the local simulation on Windows may fail with an error, which is mainly caused by a compatibility problem between Hadoop and Windows.
We can work around it by modifying the Hadoop source:
copy the offending class into the project so it shadows the one in the Hadoop jar, comment out the check that raises the error, and make the method around line 606 simply return true.
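A sketch of the commonly used workaround (assuming Hadoop 2.x, where the failing check is the Windows access() check in NativeIO; the exact line number, 606 here, depends on the Hadoop version): copy org.apache.hadoop.io.nativeio.NativeIO from the Hadoop sources into the project under the same package, then change the access() method so it always succeeds:
// in the project's copy of org.apache.hadoop.io.nativeio.NativeIO (inner class Windows)
public static boolean access(String path, AccessRight desiredAccess)
        throws IOException {
    // originally: return access0(path, desiredAccess.accessRight());
    return true;
}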
The run arguments are as follows:
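For example, the program arguments could be (placeholder paths; first the number of reduce tasks, then the input and output paths, in the order the Driver reads them):
1 d:/wordcount/input d:/wordcount/output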
Screenshot of the successful run: