Data to analyze (one page view per line: date and URL). The goal is to group the records by domain and output, for each domain, the three most frequently visited URLs:
2017/07/28 qq.com/a
2017/07/28 qq.com/bx
2017/07/28 qq.com/by
2017/07/28 qq.com/by3
2017/07/28 qq.com/news
2017/07/28 sina.com/news/socail
2017/07/28 163.com/ac
2017/07/28 sina.com/news/socail
2017/07/28 163.com/sport
2017/07/28 163.com/ac
2017/07/28 sina.com/play
2017/07/28 163.com/sport
2017/07/28 163.com/ac
2017/07/28 sina.com/movie
2017/07/28 sina.com/play
2017/07/28 sina.com/movie
2017/07/28 163.com/sport
2017/07/28 sina.com/movie
2017/07/28 163.com/ac
2017/07/28 163.com/ac
2017/07/28 163.com/acc
2017/07/28 qq.com/by
2017/07/28 qq.com/by3
2017/07/28 qq.com/news
2017/07/28 163.com/sport
2017/07/28 sina.com/news/socail
2017/07/28 163.com/sport
2017/07/28 sina.com/movie
2017/07/28 sina.com/news/socail
2017/07/28 sina.com/movie
2017/07/28 qq.com/news
2017/07/28 163.com/bb
2017/07/28 163.com/cc
2017/07/28 sina.com/lady/
2017/07/28 163.com/cc
2017/07/28 qq.com/news
2017/07/28 qq.com/by
2017/07/28 qq.com/by3
2017/07/28 sina.com/lady/
2017/07/28 qq.com/by3
2017/07/28 sina.com/lady/
2017/07/28 qq.com/by3
2017/07/28 qq.com/news
2017/07/28 qq.com/by3
2017/07/28 163.com/sport
2017/07/28 163.com/sport
2017/07/28 sina.com/news/socail
2017/07/28 sina.com/lady/
2017/07/28 sina.com/play
2017/07/28 sina.com/movie
2017/07/28 sina.com/music
2017/07/28 sina.com/sport
2017/07/28 sina.com/sport
package com.test.wordcount;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, Text> {

    Text k = new Text();
    Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each input line looks like "2017/07/28 qq.com/a"
        String line = value.toString();
        String[] split = line.split(" ");
        // The domain (everything before the first "/") becomes the key,
        // the full URL becomes the value, so all URLs of one domain
        // end up in the same reduce call
        String field = split[1].split("/")[0];
        String url = split[1];
        k.set(field);
        v.set(url);
        context.write(k, v);
    }
}
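For example, for the input line 2017/07/28 qq.com/a the mapper emits the key qq.com and the value qq.com/a, so every URL of the same domain is routed to the same reduce call.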
package com.test.wordcount;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, Text, Text, IntWritable> {

    Text k = new Text();
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        HashMap<String, Integer> map = new HashMap<>();
        // Count how many times each URL of this domain was visited
        for (Text url : values) {
            if (map.containsKey(url.toString())) {
                map.put(url.toString(), map.get(url.toString()) + 1);
            } else {
                map.put(url.toString(), 1);
            }
        }
        // Sort the URLs by count and emit the (at most) three most visited ones
        Set<Entry<String, Integer>> entrySet = map.entrySet();
        List<Entry<String, Integer>> list = new ArrayList<Entry<String, Integer>>(entrySet);
        List<Entry<String, Integer>> sort = SortUtil.sort(list);
        int top = Math.min(3, sort.size());
        for (int i = 0; i < top; i++) {
            k.set(sort.get(i).getKey());
            v.set(sort.get(i).getValue());
            context.write(k, v);
        }
    }
}
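Because the map output key is the domain, each reduce() call receives the complete set of URLs for exactly one domain, so the in-memory HashMap only has to hold the distinct URLs of that domain rather than the whole data set.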
package com.test.wordcount;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map.Entry;
public class SortUtil {

    // Sorts the (url, count) entries in descending order of count
    public static List<Entry<String, Integer>> sort(List<Entry<String, Integer>> list) {
        Collections.sort(list, new Comparator<Entry<String, Integer>>() {
            @Override
            public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                return Integer.compare(o2.getValue(), o1.getValue());
            }
        });
        return list;
    }
}
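On Java 8 and later the same descending sort can also be written more compactly with the built-in entry comparator (an equivalent sketch; it additionally needs import java.util.Map):
list.sort(Map.Entry.<String, Integer>comparingByValue(Comparator.reverseOrder()));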
package com.test.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.io.Text;
public class Driver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Set the jar to ship with the job, located dynamically via the class loader
        job.setJarByClass(Driver.class);

        // Mapper and Reducer classes invoked by the map tasks and reduce tasks
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // Key/value types produced by the map tasks
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Key/value types produced by the reduce tasks
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input path (args[1]) and output path (args[2])
        FileInputFormat.setInputPaths(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        // Number of parallel reduce tasks (args[0])
        job.setNumReduceTasks(Integer.parseInt(args[0]));

        boolean res = job.waitForCompletion(true);
        System.out.println(res ? "MR job finished successfully" : "looks like the MR job was abducted by aliens");
    }
}
We can also run and debug the job locally: MapReduce ships with a local job runner that simulates the map and reduce tasks with threads inside a single JVM.
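A minimal sketch of forcing local mode in the Driver (assuming no cluster configuration files are on the classpath; the property names are the standard Hadoop 2.x keys):
Configuration conf = new Configuration();
// run map/reduce tasks in-process instead of submitting to YARN
conf.set("mapreduce.framework.name", "local");
// read input and write output on the local file system instead of HDFS
conf.set("fs.defaultFS", "file:///");
Job job = Job.getInstance(conf);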
Running the local simulation on Windows may fail with an error, which is mainly caused by a compatibility problem between Hadoop and Windows.
We can work around it by modifying the Hadoop source:
copy the offending class into the project so it shadows the one in the Hadoop jar, comment out the check that raises the error, and make the method around line 606 simply return true.
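A sketch of the commonly used workaround (assuming Hadoop 2.x, where the failing check is the Windows access() check in NativeIO; the exact line number, 606 here, depends on the Hadoop version): copy org.apache.hadoop.io.nativeio.NativeIO from the Hadoop sources into the project under the same package, then change the access() method so it always succeeds:
// in the project's copy of org.apache.hadoop.io.nativeio.NativeIO (inner class Windows)
public static boolean access(String path, AccessRight desiredAccess)
        throws IOException {
    // originally: return access0(path, desiredAccess.accessRight());
    return true;
}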
The run arguments are as follows:
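For example, the program arguments could be (placeholder paths; first the number of reduce tasks, then the input and output paths, in the order the Driver reads them):
1 d:/wordcount/input d:/wordcount/output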
Screenshot of the successful run: