MapReduce实现倒排索引

倒排索引这个名字让人很容易误解成A-Z,倒排成Z-A;但实际上缺不是这样的。
一般我们是根据问文件来确定文件内容,而倒排索引是指通过文件内容来得到文档的信息,也就是根据一些单词判断他在哪个文件中。
知道了这一点下面就好做了:

准备一些元数据

在这里插入图片描述
下面我们要进行两次MapReduce处理

第一次数据处理

package com.invalid;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public static void main(String[] args) throws Exception {
                 //创建连接
		Configuration conf = new Configuration();
		Job job =Job.getInstance(conf);
		//设置要运行的jar
		job.setJarByClass(InvalidDriver.class);
		//设置map和reduce
		job.setMapperClass(InvalidMapper.class);
		job.setReducerClass(InvalidReducer.class);
		//设置map输出
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		//设置reduce输出
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);	
		//设置输入输出路径
		FileInputFormat.setInputPaths(job,new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));	
	    //程序运行结果
		boolean result = job.waitForCompletion(true);
		System.out.println(result);		
	}
}
class InvalidMapper extends Mapper<LongWritable,Text,Text,IntWritable>{
	String name;
	Text k = new Text();
	IntWritable v = new IntWritable();
	//setup里读取文件名,且只执行一次
	@Override
	protected void setup(Context context) 
			throws IOException ,InterruptedException {
		FileSplit split = (FileSplit) context.getInputSplit();
		name = split.getPath().getName();
	};
	@Override
	protected void map(LongWritable key, Text value,Context context)
			throws IOException ,InterruptedException {
		String[] split = value.toString().split(" ");
		for (String word : split) {
			context.write(new Text(word+"--"+ name), new IntWritable(1));
		}
	}; 	
}
class InvalidReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values,Context context)
			throws IOException, InterruptedException {
		int count = 0;
		for (IntWritable value : values) {
			count+=value.get();
		}
		context.write(new Text(key), new IntWritable(count));
	}
}

第一次处理结果如下

在这里插入图片描述

第二次处理

package com.invalid;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class InvalidDriver2 {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job =Job.getInstance(conf);
		job.setJarByClass(InvalidDriver2.class);
		job.setMapperClass(InvalidMapper2.class);
		job.setReducerClass(InvalidReducer2.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.setInputPaths(job,new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		boolean result = job.waitForCompletion(true);
		System.out.println(result);
	}
}
class InvalidMapper2 extends Mapper<LongWritable,Text,Text,Text>{
	@Override
	protected void map(LongWritable key, Text value,Context context)
			throws IOException ,InterruptedException {
		String[] split = value.toString().split("--");
		String keys=split[0];	
		String values=split[1];
		context.write(new Text(keys),new Text(values));	
}
}
class InvalidReducer2 extends Reducer<Text, Text, Text, Text>{;
	@Override
	protected void reduce(Text key, Iterable<Text> values,Context context)
			throws IOException, InterruptedException {
		//String[] split = values.toString().split("\t");
		//String value=null;
		StringBuilder sb = new StringBuilder(); 
		for (Text string: values) {
			
		sb.append(string.toString().replaceAll("\t", "-->")+" ");	
		}
		context.write(new Text(key), new Text(sb.toString()));	
}
}

第二次处理结果

在这里插入图片描述

至此,倒排索引完成

猜你喜欢

转载自blog.csdn.net/weixin_43345864/article/details/84778783
今日推荐