基于pagerank算法的运用Hbase的搜索引擎(4)——构建倒置索引表篇

思想步骤:

1. 将原先以 URL 为 rowkey 的 HBase 表(左图)转换为一张以关键字为 rowkey 的倒排索引表(右图)。

在这里插入图片描述

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/*
 * 构建倒置索引表
 * 新建表 invertindex
 * create 'invertindex','page'
 * 行健是 关键字  page列族下 放 url:rank##cnt
 * 为了方便对接页面
 * */
public class InvertIndexMR extends Configured implements Tool {
	
	 public static void main(String[] args) {
	        try {
	            ToolRunner.run(new InvertIndexMR(),args);
	        } catch (Exception e) {
	            e.printStackTrace();
	        }
    }
	
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf,"BuildInvertIndex");
        job.setJarByClass(InvertIndexMR.class);
        TableMapReduceUtil.initTableMapperJob(conf.get("intable"),
                new Scan(),IIMapper.class,ImmutableBytesWritable.class,MapWritable.class,job);
        TableMapReduceUtil.initTableReducerJob(conf.get("outtable"),IIReducer.class,job);
        job.setNumReduceTasks(1);
        job.waitForCompletion(true);
        return 0;
    }

   
	public static class IIMapper extends TableMapper<ImmutableBytesWritable,MapWritable>{
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
            //通过map方法第一个参数获得uri
        	String uri = new String(key.get()).trim();
            //如果该行包含非空  page:key   page:rank,则为有效行
            if(value.containsNonEmptyColumn(Bytes.toBytes("page"), Bytes.toBytes("key"))
        		&& value.containsNonEmptyColumn(Bytes.toBytes("page"), Bytes.toBytes("rank"))) {
            	//有效则取 page:key
            	byte[] keyword = value.getValue(Bytes.toBytes("page"), Bytes.toBytes("key"));
            	//有效则取 page:rank
            	byte[] rank = value.getValue(Bytes.toBytes("page"), Bytes.toBytes("rank"));
            	//有效则取 page:c
            	byte[] cnt = value.getValue(Bytes.toBytes("page"), Bytes.toBytes("c"));
            	//page:c为空 则赋值空字符串
            	if(cnt==null) {
            		cnt = Bytes.toBytes("");
            	}
            	//构建MapWritable进行存放三个值  rank   key   cnt
            	MapWritable map = new MapWritable();
            	map.put(new Text("url"), new Text(key.get()));
            	map.put(new Text("rank"), new DoubleWritable(Bytes.toDouble(rank)));
            	map.put(new Text("cnt"), new Text(cnt));
            	//页面关键字去空格
            	String kw = Bytes.toString(keyword).trim();
            	//输出  keyword,map
            	context.write(new ImmutableBytesWritable(kw.getBytes()),map);
            }
        }
    }
    public static class IIReducer extends TableReducer<ImmutableBytesWritable,MapWritable,NullWritable>{
        @Override
        protected void reduce(ImmutableBytesWritable key, Iterable<MapWritable> values, Context context) throws IOException, InterruptedException {
            //页面关键字再次去重
        	String kw = new String(key.get()).trim();
        	//如果kw长度不为0
            if(kw.length() != 0){
            	//构建put
                Put put = new Put(Bytes.toBytes(kw));
                //从values中取值设置到put中
                for (MapWritable map : values) {
                	Text url = (Text) map.get(new Text("url"));
                	DoubleWritable rank = (DoubleWritable) map.get(new Text("rank"));
                	Text cnt = (Text) map.get(new Text("cnt"));
                    put.addColumn(Bytes.toBytes("page"),Bytes.toBytes(url.toString()),Bytes.toBytes(rank.get()+"##"+cnt.toString()));
                }
                //输出
                context.write(NullWritable.get(),put);
            }
        }
    }
}

发布了20 篇原创文章 · 获赞 0 · 访问量 252

猜你喜欢

转载自blog.csdn.net/weixin_43570155/article/details/103733344