利用MapReduce来实现文档全局搜索引擎

                          利用MapReduce来实现全局搜索引擎

通过统计每个单词在各个文档中出现的次数(即建立倒排索引),可以根据内容查找文档,从而实现全文检索的功能。

预备文件:

hadoop中分三步走:

1.mapper对文档做初步处理,获得每个单词及其所在文档的路径,并把每个单词的出现次数初始设置为1;

    输出格式 : 单词||文档uri   1;

2.combiner对每个文档中相同的单词做初步的次数合计,并输出到reducer

     合并每个文件单词出现的次数,也就是词频 

     输出格式: 单词  uri-------词频

 3.reducer对经过shuffle处理后的数据进行汇总,形成最终的文件

    输出格式:     单词    uri-------词频;uri-------词频;

代码展示:

package demo01.hadoop;

import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * MapReduce job that builds an inverted index: for every word it lists the
 * documents containing the word together with the word's frequency in each.
 *
 * <p>Data flow (every stage uses {@code Text} keys and values):
 * <ol>
 *   <li>{@link MyMapper} emits {@code word -> uri-------1} for each word occurrence.</li>
 *   <li>{@link MyCombiner} sums the counts per URI and emits
 *       {@code word -> uri-------count}, i.e. the same shape it consumes.</li>
 *   <li>{@link MyReducer} sums once more and emits a single line per word:
 *       {@code word -> uri-------count;uri-------count;}.</li>
 * </ol>
 *
 * <p>The combiner deliberately keeps both the key and the value format
 * unchanged. Hadoop treats a combiner as an optional optimisation that may be
 * applied zero, one or several times, so its output must be valid combiner
 * (and reducer) input.
 *
 * <p>Usage: {@code InvertedIndex <input path> <output path>}. An existing
 * output path is deleted before the job starts.
 */
public class InvertedIndex extends Configured implements Tool {

	/** Separator between a document URI and its term frequency inside one posting. */
	private static final String POSTING_SEPARATOR = "-------";

	/**
	 * Configures and runs the inverted-index job.
	 *
	 * @param args exactly two elements: the input path and the output path
	 * @return 0 when the job succeeds, 1 when it fails, -1 when the arguments are invalid
	 * @throws Exception if the file system cannot be accessed or job submission fails
	 */
	@Override
	public int run(String[] args) throws Exception {
		if (args.length != 2) {
			System.out.println("args error!");
			return -1;
		}
		Path src = new Path(args[0]);
		Path desc = new Path(args[1]);

		Configuration conf = getConf();
		// Resolve the file system from the output path itself so that a fully
		// qualified path on a non-default file system is handled correctly.
		FileSystem fs = desc.getFileSystem(conf);
		if (fs.exists(desc)) {
			// The job refuses to start if the output directory already exists.
			fs.delete(desc, true);
		}

		Job job = Job.getInstance(conf, "倒排索引");
		job.setJarByClass(getClass());

		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		job.setCombinerClass(MyCombiner.class);

		// Without an explicit reducer the identity reducer is used and the
		// postings of a word are never merged into one line.
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		FileInputFormat.addInputPath(job, src);
		FileOutputFormat.setOutputPath(job, desc);

		return job.waitForCompletion(true) ? 0 : 1;
	}

	/**
	 * Command-line entry point; exits with the code returned by {@link #run(String[])}.
	 *
	 * @param args generic Hadoop options followed by the input and output paths
	 * @throws Exception if the job cannot be run
	 */
	public static void main(String[] args) throws Exception {
		int code = ToolRunner.run(new InvertedIndex(), args);
		System.exit(code);
	}

	/**
	 * Sums the counts of a word's postings, grouped by document URI.
	 *
	 * @param postings values of the form {@code uri-------count}
	 * @return total count per URI, ordered by URI so the output is deterministic
	 * @throws IOException if a value does not have the expected form
	 */
	private static Map<String, Long> sumPostings(Iterable<Text> postings) throws IOException {
		Map<String, Long> totals = new TreeMap<String, Long>();
		for (Text posting : postings) {
			String raw = posting.toString();
			// The count never contains '-', so the last separator is the real
			// one even when the URI itself contains a run of dashes.
			int cut = raw.lastIndexOf(POSTING_SEPARATOR);
			if (cut < 0) {
				throw new IOException(
						"Malformed posting (missing '" + POSTING_SEPARATOR + "'): " + raw);
			}
			String uri = raw.substring(0, cut);
			long count;
			try {
				count = Long.parseLong(raw.substring(cut + POSTING_SEPARATOR.length()));
			} catch (NumberFormatException e) {
				throw new IOException("Malformed posting (bad count): " + raw, e);
			}
			Long previous = totals.get(uri);
			totals.put(uri, previous == null ? count : previous + count);
		}
		return totals;
	}

	/**
	 * Emits one posting per word occurrence.
	 *
	 * <p>Output: key = word, value = {@code uri-------1}, where {@code uri} is
	 * the path of the file the current input split belongs to.
	 */
	public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {

		private String uri;
		private final Text key2 = new Text();
		private final Text value2 = new Text();

		/**
		 * Records the path of the file backing this split and prepares the
		 * posting value, which is identical for every word of the split.
		 */
		@Override
		public void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			FileSplit split = (FileSplit) context.getInputSplit();
			this.uri = split.getPath().toString();
			value2.set(uri + POSTING_SEPARATOR + "1");
		}

		/**
		 * Splits the line on whitespace and writes one posting per non-empty word.
		 *
		 * @param key     offset key supplied by the input format (unused)
		 * @param value   one line of the document
		 * @param context sink for the {@code word -> uri-------1} pairs
		 */
		@Override
		public void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			for (String str : value.toString().split("\\s+")) {
				// split() yields an empty token for blank lines and for lines
				// that start with whitespace; an empty word must not be indexed.
				if (str.isEmpty()) {
					continue;
				}
				key2.set(str);
				context.write(key2, value2);
			}
		}
	}

	/**
	 * Pre-aggregates postings on the map side.
	 *
	 * <p>Input and output have the same shape ({@code word -> uri-------count})
	 * and the key is passed through unchanged, so the result is correct no
	 * matter how many times the combiner is applied.
	 */
	public static class MyCombiner extends Reducer<Text, Text, Text, Text> {

		private final Text value4 = new Text();

		/**
		 * Writes one summed posting per document URI for the given word.
		 *
		 * @param key3    the word
		 * @param value3  postings of the form {@code uri-------count}
		 * @param context sink for the combined postings
		 */
		@Override
		public void reduce(Text key3, Iterable<Text> value3,
				Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
			for (Map.Entry<String, Long> entry : sumPostings(value3).entrySet()) {
				value4.set(entry.getKey() + POSTING_SEPARATOR + entry.getValue());
				context.write(key3, value4);
			}
		}

	}

	/**
	 * Produces the final index line of a word.
	 *
	 * <p>Output: key = word, value = {@code uri-------count;uri-------count;}
	 * (every posting, including the last, is terminated by {@code ;}).
	 */
	public static class MyReducer extends Reducer<Text, Text, Text, Text> {

		private final Text value6 = new Text();

		/**
		 * Sums the postings per URI and joins them into a single value.
		 *
		 * @param key5    the word
		 * @param value5  postings of the form {@code uri-------count}; they are
		 *                summed here as well, so the result does not depend on
		 *                whether the combiner ran
		 * @param context sink for the final index line
		 */
		@Override
		public void reduce(Text key5, Iterable<Text> value5, Reducer<Text, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			StringBuilder sb = new StringBuilder();
			for (Map.Entry<String, Long> entry : sumPostings(value5).entrySet()) {
				sb.append(entry.getKey())
						.append(POSTING_SEPARATOR)
						.append(entry.getValue())
						.append(';');
			}
			value6.set(sb.toString());
			context.write(key5, value6);
		}

	}

}

欢迎提出见解跟指导

猜你喜欢

转载自blog.csdn.net/qq_42482484/article/details/81677047