案例4-使用hadoop-mapreduce进行PageRank计算

版权声明：原创文章，未经允许不得转载.Tips:传统电商火热的时代已经成为过去，下一个阶段属于大数据人工智能，服务、便捷、安全、效率、创新成为下一个阶段互联网时代的新词汇，而IT技术也随着行业的变化发展而不断更迭。对于码农的出路总结一句话：追技术不如追领域。[基础][设计][能力] https://blog.csdn.net/shengqianfeng/article/details/82875056

什么是pagerank

PageRank是Google专有的算法，用于衡量特定网页相对于搜索引擎索引中的其他网页而言的重要程度。
是Google创始人拉里·佩奇和谢尔盖·布林于1997年创造的
PageRank实现了将链接价值概念作为排名因素。

计算环境

Hadoop-2.5.2
四台主机
两台NN的HA
两台RM的HA
离线计算框架MapReduce

算法原理（1）

入链 ====投票

PageRank让链接来“投票”，到一个页面的超链接相当于对该页投一票

入链数量

如果一个页面节点接收到的其他网页指向的入链数量越多，那么这个页面越重要

入链质量

指向页面A的入链质量不同，质量高的页面会通过链接向其他页面传递更多的权重。所以越是质量高的页面指向页面A，则页面A越重要

网络上各个页面的链接图

假设网络上有ABCD四个网页，A指向B和D，B指向C，C指向A和B，D指向B和C。

如上图所示。

算法原理（2）

初始值
- 每个页面设置相同的PR值
- Google的pagerank算法给每个页面的PR初始值为1。

迭代递归计算（收敛）
- Google不断的重复计算每个页面的PageRank。那么经过不断的重复计算，这些页面的PR值会趋向于稳定，也就是收敛的状态。
- 在具体企业应用中怎么样确定收敛标准？
  - 1、每个页面的PR值和上一次计算的PR相等
  - 2、设定一个差值指标（0.0001）。当所有页面和上一次计算的PR差值平均小于该标准时，则收敛。
  - 3、设定一个百分比（99%），当99%的页面和上一次计算的PR相等

在本例子中我们使用第二种方法来作为收敛标准（下面代码中实际采用的差值阈值为0.001）。

算法原理（3）

修正PageRank计算公式
- 由于存在一些出链为0的网页，也就是那些不链接任何其他网页的网页（也称为孤立网页），使得很多网页不能被访问到。因此需要对PageRank公式进行修正，即在简单公式的基础上增加阻尼系数（damping factor）q，q一般取值q=0.85。
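完整PageRank计算公式：

$$PageRank(p_i) = \frac{1-q}{N} + q \sum_{p_j \in M(p_i)} \frac{PageRank(p_j)}{L(p_j)}$$

其中 $M(p_i)$ 表示所有链接到网页 $p_i$ 的网页集合。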

N表示网页的个数

Pj表示链接到当前网页的某个网页，PageRank(Pj)表示网页Pj的PR值，L(Pj)表示网页Pj的出链（超链接）个数。

执行结果：

执行了29次完成：

ABCD四个网页的pageRank值如图所示。

代码：

package com.jeff.mr.pagerank;


import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RunJob {
	
	public static enum Mycounter{
		my
	}

	public static void main(String[] args) {
		Configuration config =new Configuration();
		config.set("fs.defaultFS", "hdfs://node4:8020");
		config.set("yarn.resourcemanager.hostname", "node4");
		double d =0.001;
		int i=0;
		while(true){
			i++;
			try {
				config.setInt("runCount", i);
				FileSystem fs =FileSystem.get(config);
				Job job =Job.getInstance(config);
				job.setJarByClass(RunJob.class);
				job.setJobName("pr"+i);
				job.setMapperClass(PageRankMapper.class);
				job.setReducerClass(PageRankReducer.class);
				job.setMapOutputKeyClass(Text.class);
				job.setMapOutputValueClass(Text.class);
				job.setInputFormatClass(KeyValueTextInputFormat.class);
				Path inputPath =new Path("/usr/input/pagerank.txt");
				if(i>1){
					inputPath =new Path("/usr/output/pr"+(i-1));
				}
				FileInputFormat.addInputPath(job, inputPath);
				
				Path outpath =new Path("/usr/output/pr"+i);
				if(fs.exists(outpath)){
					fs.delete(outpath, true);
				}
				FileOutputFormat.setOutputPath(job, outpath);
				
				boolean f= job.waitForCompletion(true);
				if(f){
					System.out.println("success.");
					long sum= job.getCounters().findCounter(Mycounter.my).getValue();
					System.out.println(sum);
					//计算所有网页Node的PR差值之和除以网页个数，再缩小1000倍
					double avgd= sum/4000.0;
					//比较结果和收敛标准
					if(avgd < d){
						break;
					}
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
		
	}
	
	/**
	 * key为输入文件的每一行的第一个字符串即网页Node
	 * value为输入文件的每一行除第一个字符串外的其他字符串，即key对应网页所有的超链接
	 * @author jeffSheng
	 * 2018年9月27日
	 */
	static class PageRankMapper extends Mapper<Text, Text, Text, Text>{
		protected void map(Text key, Text value,Context context) throws IOException, InterruptedException {
			//设置runCount即运行次数，没有设置则默认为1
			int runCount= context.getConfiguration().getInt("runCount", 1);
			//当前网页
			String page = key.toString();
			//当前网页对应的Node对象
			Node node = null;
			//第一次运行则当前网页Node的pageRank为1.0,符合谷歌的规定
			if(runCount==1){
				node = Node.fromMR("1.0" + "\t" + value.toString());
			}else{
				node = Node.fromMR(value.toString());
			}
			//输出数据key为当前网页Page，比如A网页。value为当前网页对应的Node比如：1.0	B	D
			context.write(new Text(page), new Text(node.toString()));//A:1.0	B	D
			//如果node包含超链接，则说明是不是孤立网页
			if(node.containsAdjacentNodes()){
				//根据公式计算当前网页超链接的PR平均值outValue
				double outValue = node.getPageRank() / node.getAdjacentNodeNames().length;
				//迭代每一个超链接对应指向的网页Node，并输出其PR值
				for (int i = 0; i < node.getAdjacentNodeNames().length; i++) {
					String  outPage = node.getAdjacentNodeNames()[i];
					context.write(new Text(outPage), new Text(outValue+""));//B:0.5  D:0.5
				}
			}
		}
	}
	
	/**
	 * 累加所有网页的PR差值
	 * reduceTask：
	 * 		输入数据Key为网页page比如A即sourceNode，Value为对应的PR值，或者是包含PR值和超链接的字符串
	 * @author jeffSheng
	 * 2018年9月27日
	 */
	static class PageRankReducer extends Reducer<Text, Text, Text, Text>{
		protected void reduce(Text key, Iterable<Text> arg1, Context context) throws IOException, InterruptedException {
			double sum =0.0;
			Node sourceNode = null;
			//迭代每一组的page对应的PR值，并转化为网页Node，这个Node可能只是PR值，或者是包含PR和Node的超链接
			for(Text i : arg1){
				Node node = Node.fromMR(i.toString());
				//原来的那个Node（最初包含超链接的网页Page所拥有的PR+超链接）的Node，比如A：1.0	 B	D
				if(node.containsAdjacentNodes()){
					sourceNode = node;//A：1.0	 B	D
				}else{
					//当前sourceNode作为某网页的子超链接节点Node的PageRank累加，比如A:0.5
					sum = sum + node.getPageRank();//A:0.5
				}
			}
			//当前sourceNode根据公式计算出来的PageRank
			double newPR=(0.15/4)+(0.85*sum);
			System.out.println("*********** new pageRank value is "+newPR);
			
			//把新的pr值和计算之前的pr比较
			double d= newPR -sourceNode.getPageRank();
			//把pr之差放大1000倍，取绝对值后累加所有网页的PR差值
			int j=(int)( d*1000.0);
			j=Math.abs(j);
			System.out.println(j+"___________");
			context.getCounter(Mycounter.my).increment(j);;
			//重新给sourceNode赋值pageRank
			sourceNode.setPageRank(newPR);
			context.write(key, new Text(sourceNode.toString()));
		}
	}
}

package com.jeff.mr.pagerank;

import java.io.IOException;
import java.util.Arrays;

import org.apache.commons.lang.StringUtils;

/**
 * A page in the PageRank graph: its current rank plus the names of the pages it links to.
 *
 * <p>The text form, produced by {@link #toString()} and parsed by {@link #fromMR(String)},
 * is the rank followed by the out-link names, all separated by {@link #fieldSeparator},
 * e.g. {@code 1.0<TAB>B<TAB>D}. A page without out-links is just the rank, e.g. {@code 0.5}.
 */
public class Node {

	/** Current rank; a freshly created node starts at 1.0. */
	private double pageRank = 1.0;

	/** Names of the pages this page links to; {@code null} when there are none. */
	private String[] adjacentNodeNames;

	/** Separator between the rank and the out-link names in the text form. */
	public static final char fieldSeparator = '\t';

	/** @return the current rank of this page */
	public double getPageRank() {
		return pageRank;
	}

	/**
	 * @param pageRank the new rank
	 * @return this node, for call chaining
	 */
	public Node setPageRank(double pageRank) {
		this.pageRank = pageRank;
		return this;
	}

	/** @return the out-link names (the internal array, not a copy), or {@code null} if never set */
	public String[] getAdjacentNodeNames() {
		return adjacentNodeNames;
	}

	/**
	 * @param adjacentNodeNames the out-link names; stored as given, without copying
	 * @return this node, for call chaining
	 */
	public Node setAdjacentNodeNames(String[] adjacentNodeNames) {
		this.adjacentNodeNames = adjacentNodeNames;
		return this;
	}

	/** @return {@code true} if this page has at least one out-link */
	public boolean containsAdjacentNodes() {
		return adjacentNodeNames != null && adjacentNodeNames.length > 0;
	}

	/**
	 * Renders the node as rank followed by the out-link names, separated by
	 * {@link #fieldSeparator}. No separator is written when there are no out-links, so the
	 * result never ends in a dangling separator that would parse back as an empty link name.
	 */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append(pageRank);
		if (containsAdjacentNodes()) {
			for (String name : adjacentNodeNames) {
				// A null entry is rendered as an empty field.
				sb.append(fieldSeparator).append(name == null ? "" : name);
			}
		}
		return sb.toString();
	}

	/**
	 * Parses the text form, e.g. {@code 1.0<TAB>B<TAB>D}.
	 *
	 * <p>The first field is the rank; the remaining non-blank fields become the out-link
	 * names. Blank fields (for example from a trailing separator) are ignored so that they do
	 * not turn into links to a page with an empty name.
	 *
	 * @param value the record to parse
	 * @return the parsed node
	 * @throws IOException if {@code value} is {@code null} or empty, or its first field is not
	 *                     a valid number
	 */
	public static Node fromMR(String value) throws IOException {
		if (value == null || value.isEmpty()) {
			throw new IOException("Expected 1 or more parts but received 0");
		}
		// Limit -1 keeps trailing empty fields, so the split never drops a position silently.
		String[] parts = value.split(String.valueOf(fieldSeparator), -1);

		double rank;
		try {
			rank = Double.parseDouble(parts[0]);
		} catch (NumberFormatException e) {
			throw new IOException(
					"Invalid PageRank value '" + parts[0] + "' in record: " + value, e);
		}

		Node node = new Node().setPageRank(rank);
		String[] links = nonBlankFrom(parts, 1);
		if (links.length > 0) {
			node.setAdjacentNodeNames(links);
		}
		return node;
	}

	/** Returns the non-blank elements of {@code parts} starting at index {@code start}. */
	private static String[] nonBlankFrom(String[] parts, int start) {
		String[] kept = new String[Math.max(0, parts.length - start)];
		int count = 0;
		for (int i = start; i < parts.length; i++) {
			if (!parts[i].trim().isEmpty()) {
				kept[count++] = parts[i];
			}
		}
		return Arrays.copyOf(kept, count);
	}
}

案例4-使用hadoop-mapreduce进行PageRank计算

猜你喜欢