Hadoop MapReduce Self-Join (Single-Table Join) - Example

Case: from a single child-parent table, produce the list of grandchild-grandparent pairs.
     【单表关联.txt】 (the sample input file)
        child      parent
        Tom        Lucy
        Tom        Jack
        Jone       Lucy
        Jone       Jack
        Lucy       Mary
        Lucy       Ben
        Jack       Alice
        Jack       Jesse
        Terry      Alice
        Terry      Jesse
        Philip     Terry
        Philip     Alma
        Mark       Terry
        Mark       Alma

Analysis:
    Tom           parents: Lucy / Jack
                  Lucy  -> grandparents Mary / Ben
                  Jack  -> grandparents Alice / Jesse

    Jone          parents: Lucy / Jack
                  Lucy  -> grandparents Mary / Ben
                  Jack  -> grandparents Alice / Jesse

    Treat the single table as two logical tables, told apart by a flag:

        flag "1": left table (child-parent)      flag "2": right table (parent-grandparent)
        k ---------- v                           k ---------- v
        parent       child                       child        parent
                     = grandchild                             = grandparent

   mapper:
          tag each record with a left/right-table flag and emit it twice:
          <k2,v2> --> <parent, "1"+child> and <child, "2"+parent>
   reducer:
          for each key, split the values by flag, then take their Cartesian product:
          <k3,v3> --> <grandchild, grandparent>
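
For example, with the sample data above the reducer for key Lucy receives the values {1+Tom, 1+Jone, 2+Mary, 2+Ben}: the flag-1 names Tom and Jone go into the grandchild list, the flag-2 names Mary and Ben into the grandparent list, and their Cartesian product yields the four output pairs Tom-Mary, Tom-Ben, Jone-Mary and Jone-Ben.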

The code is as follows:

Mapper side:

package com.hyxy.hadoop.join.single;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SingleJoinMapper extends Mapper<LongWritable, Text, Text, Text>{
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		/*
		 * Skip the header row.
		 */
		String line = value.toString();
		if (line.contains("child") || line.contains("parent")) {
			return;
		}
		/*
		 * Split the record and emit it twice, tagged for the left and right "tables".
		 */
		// split the line on whitespace
		StringTokenizer st = new StringTokenizer(line);
		String child = st.nextToken();  // the child field
		String parent = st.nextToken(); // the parent field
		// flag 1: child-parent relation, keyed by parent (the value is a potential grandchild)
		context.write(new Text(parent), new Text("1" + "+" + child));
		// flag 2: parent-grandparent relation, keyed by child (the value is a potential grandparent)
		context.write(new Text(child), new Text("2" + "+" + parent));
	}
}
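
As an optional refinement (a sketch, not part of the original post), the mapper can reuse one pair of Text objects instead of allocating four new objects per input line; this is safe because Hadoop serializes the key and value during write(). A variant (the class name ReusingSingleJoinMapper is hypothetical) could look like:

package com.hyxy.hadoop.join.single;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ReusingSingleJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
	// allocated once per task and reused for every record;
	// the framework copies the bytes at write() time, so reuse is safe
	private final Text outKey = new Text();
	private final Text outValue = new Text();

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		String line = value.toString();
		if (line.contains("child") || line.contains("parent")) {
			return; // skip the header row
		}
		StringTokenizer st = new StringTokenizer(line);
		String child = st.nextToken();
		String parent = st.nextToken();
		// flag 1: keyed by parent, value is a potential grandchild
		outKey.set(parent);
		outValue.set("1+" + child);
		context.write(outKey, outValue);
		// flag 2: keyed by child, value is a potential grandparent
		outKey.set(child);
		outValue.set("2+" + parent);
		context.write(outKey, outValue);
	}
}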

Reducer side:

package com.hyxy.hadoop.join.single;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/*
 * For each key, split the tagged values into a grandchild list (flag "1")
 * and a grandparent list (flag "2"), then emit their Cartesian product.
 */
public class SingleJoinReducer extends Reducer<Text, Text, Text, Text>{
	List<String> grandchild = new ArrayList<String>();
	List<String> grandparent = new ArrayList<String>();
	private Text _key = new Text();
	private Text _value = new Text();
	@Override
	// called only once per task, before any reduce() call; writes the header row
	protected void setup(Context context) throws IOException, InterruptedException {
		context.write(new Text("grandchild"), new Text("grandparent"));
	}
	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
		// iterate over the values, adding each name to the matching list
		for (Text value : values) {
			// the tagged value, e.g. "1+Tom" or "2+Mary"
			String line = value.toString();
			// a "1" prefix marks a left-table record
			if (line.startsWith("1")) {
				// strip the "1+" prefix and keep the grandchild's name
				grandchild.add(line.substring(2));
			} else {
				// strip the "2+" prefix and keep the grandparent's name
				grandparent.add(line.substring(2));
			}
		}
		// Cartesian product: pair every grandchild with every grandparent
		for (String child : grandchild) {
			for (String parent : grandparent) {
				// copy the names into the reusable output Text objects
				_key.set(child);
				_value.set(parent);
				context.write(_key, _value);
			}
		}
		// clear both lists so one key's relatives do not leak into the next key
		grandchild.clear();
		grandparent.clear();
	}
}
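
The instance-level lists work because Hadoop calls reduce() once per key on a single reducer instance; forgetting the two clear() calls would mix one key's relatives into the next key's output. An equivalent variant of the reduce() method (a sketch, not from the original post; it drops into the class above unchanged) avoids that hazard by declaring the lists locally:

	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
		// fresh lists for every key: no cross-key state, so no clear() needed
		List<String> grandchild = new ArrayList<String>();
		List<String> grandparent = new ArrayList<String>();
		for (Text value : values) {
			String line = value.toString();
			if (line.startsWith("1")) {
				grandchild.add(line.substring(2));
			} else {
				grandparent.add(line.substring(2));
			}
		}
		for (String child : grandchild) {
			for (String parent : grandparent) {
				context.write(new Text(child), new Text(parent));
			}
		}
	}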

Driver side:

package com.hyxy.hadoop.join.single;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SingleJoinDriver {
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		// delete the output directory if it already exists, otherwise the job would fail
		Path outfile = new Path("file:///G:/SingleOut");
		FileSystem fs = outfile.getFileSystem(conf);
		if (fs.exists(outfile)) {
			fs.delete(outfile, true);
		}
		Job job = Job.getInstance(conf);
		job.setJarByClass(SingleJoinDriver.class);
		job.setJobName("Single Demo");
		
		job.setMapperClass(SingleJoinMapper.class);
		job.setReducerClass(SingleJoinReducer.class);
		
		// both the map output and the final output are <Text, Text>
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		
		FileInputFormat.addInputPath(job, new Path("file:///G:/测试数据/单表关联/"));
		FileOutputFormat.setOutputPath(job, outfile);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}
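
The driver reads and writes local file:/// paths, so it runs straight from the IDE in Hadoop's local mode. To run it on a cluster instead, both paths would need to point at HDFS, and the classes would be packaged into a jar and submitted with hadoop jar (the jar name below is hypothetical):

    hadoop jar single-join.jar com.hyxy.hadoop.join.single.SingleJoinDriver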

Sample output:

grandchild	grandparent
Tom	Alice
Tom	Jesse
Jone	Alice
Jone	Jesse
Tom	Ben
Tom	Mary
Jone	Ben
Jone	Mary
Philip	Alice
Philip	Jesse
Mark	Alice
Mark	Jesse
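
Each block of four rows comes from a single reducer key: the Jack group, then the Lucy group, then the Terry group, matching the sorted key order produced by the shuffle. The header line is written by the reducer's setup() method, which runs once per reducer (this job uses the default single reducer).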

Reposted from blog.csdn.net/zy_remarkable/article/details/81193884