Hadoop MapReduce code: multi-table join

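This example implements a reduce-side join of two tables: a left table with columns (factoryname, addressID) and a right table with columns (addressID, addressname). The mapper tags each record with the table it came from and keys it by addressID; the reducer gathers both sides under the same key and emits their Cartesian product. The sample rows below are hypothetical (they are not part of the original post) but show the format the job expects:

factory.txt (left table):
factoryname addressID
Beijing Red Star 1
Shenzhen Thunder 3

address.txt (right table):
addressID addressname
1 Beijing
3 Shenzhen

Joined output:
factoryname addressname
Beijing Red Star Beijing
Shenzhen Thunder Shenzhen
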
package com.hadoop.sample;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MTJoin {
	//counts reduce() calls so the header row is written only once per reducer
	private static int time = 0;
	public static class Map extends Mapper<Object,Text,Text,Text>{
		//The map first determines whether an input line belongs to the left
		//or right table, then splits it into its two columns: the join column
		//(addressID) becomes the key, and the remaining column plus a
		//left/right-table tag go into the value
		public void map(Object key,Text value,Context context) throws IOException,InterruptedException{
			String line = value.toString();
			int i = 0;
			//Skip the header line of either input file
			if(line.contains("factoryname")||line.contains("addressID")){
				return;
			}
			//Find the first digit, i.e. the start of the addressID
			//(this assumes factory names contain no digits)
			while(i<line.length()&&(line.charAt(i)<'0'||line.charAt(i)>'9')){
				i++;
			}
			if(i==line.length()){
				return;//no addressID on this line
			}
			if(i>0){
				//Left table: "factoryname addressID" -- the ID follows the name
				int j = i-1;
				while(line.charAt(j)!=' ') j--;
				String factoryname = line.substring(0, j);
				String addressID = line.substring(i);
				context.write(new Text(addressID), new Text("1+"+factoryname));
			}else{
				//Right table: "addressID addressname" -- the line starts with the ID
				int j = i;
				while(line.charAt(j)!=' ') j++;
				String addressID = line.substring(0, j);
				String addressname = line.substring(j+1);
				context.write(new Text(addressID), new Text("2+"+addressname));
			}
		}
	}
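	//For example (using the hypothetical sample rows above), the left-table
	//line "Beijing Red Star 1" is emitted as (1, "1+Beijing Red Star") and
	//the right-table line "1 Beijing" as (1, "2+Beijing"), so both sides of
	//the join meet at the same reducer key.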
	public static class Reduce extends Reducer<Text,Text,Text,Text>{
		//The reduce parses the map output, separating the values into
		//left-table (factory) and right-table (address) lists, then emits
		//their Cartesian product
		public void reduce(Text key,Iterable<Text> values,Context context) throws IOException,InterruptedException{
			if(time == 0){//write the header row once (this assumes a single reducer)
				context.write(new Text("factoryname"),new Text("addressname"));
				time++;
			}
			List<String> factory = new ArrayList<String>();
			List<String> address = new ArrayList<String>();
			for(Text value : values){
				String record = value.toString();
				//records are tagged by the mapper: "1+" = left table, "2+" = right table
				if(record.charAt(0) == '1'){//left table
					factory.add(record.substring(2));
				}else{//right table
					address.add(record.substring(2));
				}
			}
			//Cartesian product of the two sides of the join for this addressID
			for(String factoryname : factory){
				for(String addressname : address){
					context.write(new Text(factoryname), new Text(addressname));
				}
			}
		}
	}
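	//Continuing the hypothetical example: for key 1 the reducer receives
	//["1+Beijing Red Star", "2+Beijing"] and emits the joined row
	//("Beijing Red Star", "Beijing").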
	/**
	 * @param args
	 */
	public static void main(String[] args) throws Exception{
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
		if(otherArgs.length != 2){
			System.err.println("Usage: MTJoin <in> <out>");
			System.exit(2);
		}
		Job job = new Job(conf,"multi table join");
		job.setJarByClass(MTJoin.class);
		job.setMapperClass(Map.class);
		//Note: no combiner. The reducer's output (joined rows) is not in the
		//tagged "1+"/"2+" format its input requires, so reusing Reduce as a
		//combiner would break the join.
		job.setReducerClass(Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}
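To run the job, package the class into a jar and submit it with two arguments: the input directory (holding both table files) and a non-existent output directory. A hypothetical invocation (the jar name and HDFS paths are placeholders):

hadoop jar mtjoin.jar com.hadoop.sample.MTJoin /user/hadoop/join_in /user/hadoop/join_out

The joined table, preceded by the header row written by the reducer, appears in part-r-00000 of the output directory (assuming a single reducer).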


Reposted from serisboy.iteye.com/blog/1685839