Key MapReduce Concepts Explained Through a Case Study -------- Filtering Sensitive Words

Copyright notice: original content; please credit the source when reposting! https://blog.csdn.net/Z_Date/article/details/83863607

An article, article.txt, has the following content:

We ask that you please do not send us emails privately asking for support. We are non-paid
 volunteers who help out with the project and we do not necessarily have the time or energy 
to help people on an individual basis. Instead, we have setup mailing lists for each module 
which often contain hundreds of individuals who will help answer detailed requests for 
help. The benefit of using mailing lists over private communication is that it is a shared 
resource where others can also learn from common mistakes and as a community we all grow 
together.

A sensitive-word list, sensitive.txt, has the following content:

ask from all

Requirement: filter out every word listed in sensitive.txt from article.txt, then count the remaining words.
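
Before looking at the MapReduce job, the core idea can be shown with a minimal, single-process Java sketch (purely illustrative; it assumes the two files above sit in the local working directory): load the sensitive words into a set, split the article into whitespace-separated tokens, and drop any token that appears in the set.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;

// Illustrative local version of the filtering logic (no Hadoop involved)
public class LocalGrepSketch {
	public static void main(String[] args) throws IOException {
		// Load the sensitive words (space-separated, as in sensitive.txt)
		Set<String> sensitive = new HashSet<String>();
		for (String line : Files.readAllLines(Paths.get("sensitive.txt"))) {
			sensitive.addAll(Arrays.asList(line.split(" ")));
		}
		// Tokenize the article and keep only the non-sensitive words
		for (String line : Files.readAllLines(Paths.get("article.txt"))) {
			StringTokenizer tokens = new StringTokenizer(line);
			while (tokens.hasMoreTokens()) {
				String word = tokens.nextToken();
				if (!sensitive.contains(word)) {
					System.out.println(word);
				}
			}
		}
	}
}

The full MapReduce implementation below distributes the same logic: the sensitive-word file is shipped to every map task through the distributed cache, the mapper filters and emits (word, 1) pairs, and the reducer sums the counts.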

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Filter sensitive words and count the remaining words.
 * @author lyd
 *
 */
public class GrepDemo implements Tool {

	/**
	 * Custom MyMapper
	 * @author lyd
	 *
	 */
	static class MyMapper extends Mapper<LongWritable, Text, Text, Text>{

		/**
		 * Sensitive words read from the cached small file (distributed cache)
		 */
		static List<String> li = new ArrayList<String>();
		@Override
		protected void setup(Context context) throws IOException, InterruptedException {
			// Get the local paths of the cached files
			// (DistributedCache is deprecated in newer Hadoop releases; context.getCacheFiles() is the replacement)
			Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
			// Read each cached file in turn
			for (Path p : paths) {
				// Get the file name
				String fileName = p.getName();
				if (fileName.equals("dir")) {
					BufferedReader sb = new BufferedReader(new FileReader(new File(p.toString())));
					// Read the file line by line and collect the sensitive words
					String tmp = null;
					while ((tmp = sb.readLine()) != null) {
						String[] ss = tmp.split(" ");
						for (String s : ss) {
							li.add(s);
						}
					}
					// Close the reader
					sb.close();
				}
			}
		}

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			StringTokenizer lines = new StringTokenizer(line);
			while (lines.hasMoreTokens()) {
				// Emit the word only if it is not a sensitive word
				String word = lines.nextToken();
				if (!li.contains(word)) {
					context.write(new Text(word), new Text("1"));
				}
			}
		}

		@Override
		protected void cleanup(Context context)throws IOException, InterruptedException {
		}
		
	}
	
	/**
	 * Custom MyReducer
	 * @author lyd
	 *
	 */
	static class MyReducer extends Reducer<Text, Text, Text, Text>{

		@Override
		protected void setup(Context context)throws IOException, InterruptedException {
		}
		
		@Override
		protected void reduce(Text key, Iterable<Text> value, Context context)
				throws IOException, InterruptedException {
			// Sum the counts emitted by the mappers for this word
			int counter = 0;
			for (Text t : value) {
				counter += Integer.parseInt(t.toString());
			}
			context.write(key, new Text(counter + ""));
		}
		
		@Override
		protected void cleanup(Context context)throws IOException, InterruptedException {
		}
	}
	
	
	// Configuration injected by ToolRunner before run() is called
	private Configuration conf;

	@Override
	public void setConf(Configuration conf) {
		this.conf = conf;
		// Point the job at the HDFS cluster
		this.conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
	}

	@Override
	public Configuration getConf() {
		return conf;
	}
	
	/**
	 * Driver method
	 */
	@Override
	public int run(String[] args) throws Exception {
		// 1. Get the Configuration object (injected by ToolRunner via setConf)
		Configuration conf = getConf();
		// 2. Create the job
		Job job = Job.getInstance(conf, "GrepDemo");
		// 3. Set the class that carries the job
		job.setJarByClass(GrepDemo.class);
		// 4. Set the map-side properties
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		
		// Add the sensitive-word file to the distributed cache
		job.addCacheFile(new URI("hdfs://hadoop01:9000/1603data/dir"));
		
		// 5. Set the reduce-side properties
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// Delete the output directory if it already exists
		FileSystem fs = FileSystem.get(conf);
		if (fs.exists(new Path(args[1]))) {
			fs.delete(new Path(args[1]), true);
		}
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		// 6. Submit the job and wait for it to finish
		int isok = job.waitForCompletion(true) ? 0 : 1;
		return isok;
	}
	
	/**
	 * Main entry point of the job
	 * @param args
	 */
	public static void main(String[] args) {
		try {
			// Parse generic Hadoop options out of the command-line arguments
			String[] argss = new GenericOptionsParser(new Configuration(), args).getRemainingArgs();
			System.exit(ToolRunner.run(new GrepDemo(), argss));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}
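
The DistributedCache class used in setup() still works but is deprecated in newer Hadoop releases. Below is a sketch of an equivalent setup() built on the non-deprecated context.getCacheFiles() API, assuming Hadoop 2.x or later, where cache files are symlinked into the task's working directory under their file name; it reuses the same li list and imports (plus java.net.URI, which is already imported).

		@Override
		protected void setup(Context context) throws IOException, InterruptedException {
			// URIs of the files registered with job.addCacheFile(...)
			URI[] uris = context.getCacheFiles();
			if (uris == null) {
				return;
			}
			for (URI uri : uris) {
				// The localized copy is available in the working directory under the file name
				String linkName = new Path(uri.getPath()).getName();
				if (linkName.equals("dir")) {
					BufferedReader reader = new BufferedReader(new FileReader(linkName));
					String tmp = null;
					while ((tmp = reader.readLine()) != null) {
						for (String s : tmp.split(" ")) {
							li.add(s);
						}
					}
					reader.close();
				}
			}
		}

To run the job, package the class into a jar, make sure the sensitive-word file exists at hdfs://hadoop01:9000/1603data/dir, and pass the article's HDFS input path and an output path as the two program arguments. Note that StringTokenizer splits on whitespace only, so a sensitive word immediately followed by punctuation (for example "ask," with a trailing comma) would not match the plain entry in the word list.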
