HBase Notes - 4. HBase MapReduce

1. Driver class: PutInDcustomerJob.java

package com.chinalife.distributable.mergedcid.putIndcustomer;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.mapreduce.Job;

import com.chinalife.distributable.util.CustomKeyOut;
import com.chinalife.distributable.util.CustomPC;
import com.chinalife.distributable.util.Util;

public class PutInDcustomerJob {
	 public static final String TABLE_TEMP = "distributable:dcust_temporary";
	 public static final String TABLE_DCUST = "distributable:dcustomer";
	 public static final String TABLE_INDEX = "distributable:dcustomer_index";
	 public static final String TABLE_STD = "distributable:dcust_std";
	 
	public static void main(String[] args) throws Exception {
		Configuration conf = HBaseConfiguration.create();
		// set the ZooKeeper quorum nodes
		conf.set("hbase.zookeeper.quorum", "weekend05:2181,weekend06:2181,weekend07:2181");
		
		// properties added to the Configuration here can be read back in the map/reduce phases
		conf.set("dcust_temporary", TABLE_TEMP);   // staging table for records with incomplete fields
		conf.set("dcustomer", TABLE_DCUST);        // final customer table
		conf.set("dcustomer_index", TABLE_INDEX);  // index table
		conf.set("dcust_std", TABLE_STD);          // source table
        
		String jobName = "PutInDcustomerJob";
		// create the MapReduce job
		Job job = new Job(conf, jobName);
		job.setJarByClass(PutInDcustomerJob.class);
		
		// get the start and end timestamps
		String start_stamp = Util.getLogicTimeFromHbase("distributable:dcustomer", "PutInDcusomerJobStamp");
		String end_stamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());
		
		ArrayList<Scan> scanList = new ArrayList<Scan>();
		// add the source table (distributable:dcust_std) as an input
		Scan scan1 = new Scan();
		scan1.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, TABLE_STD.getBytes());
		scan1.setFilter(Util.getIncrFilterByStamp(start_stamp,end_stamp,"di") );
		scan1.setCaching(500);
		scan1.setCacheBlocks(false);
		scanList.add(scan1);
		// add the index table (distributable:dcustomer_index) as an input
		Scan scan2 = new Scan();
		scan2.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, TABLE_INDEX.getBytes());
		scan2.setCaching(500);
		scan2.setCacheBlocks(false);
		scanList.add(scan2);
		// for a joint scan over multiple tables, call the overload that takes the list of Scans
		TableMapReduceUtil.initTableMapperJob(scanList,PutInDcustomerMapper.class,CustomKeyOut.class,MapWritable.class,job);
		// the table name is left empty here; each Put picks its target table via MultiTableOutputFormat below
		TableMapReduceUtil.initTableReducerJob("", PutInDcustomerReducer.class, job);
		// results are written to several tables, so switch the output format
		job.setOutputFormatClass(MultiTableOutputFormat.class);
		job.setNumReduceTasks(200);   
		// secondary sort for the MapReduce job
		// custom partitioner
		job.setPartitionerClass(CustomPC.CustomPartitioner.class);
		// custom grouping comparator
		job.setGroupingComparatorClass(CustomPC.CustomCombiner.class);
		boolean b = job.waitForCompletion(true);
		if (!b) {
		  throw new IOException("error with job!");
		}
	}

}
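
Because MultiTableOutputFormat is used, the reduce side picks the target table per record: it reads the table name back from the Configuration set above and writes (table name, Put) pairs. Below is a minimal, hypothetical sketch of just that routing pattern (the class name and the column written are made up; the real logic is PutInDcustomerReducer in section 3):

package com.chinalife.distributable.mergedcid.putIndcustomer;

import java.io.IOException;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.MapWritable;

import com.chinalife.distributable.util.CustomKeyOut;

// Hypothetical illustration only, not part of the original job.
public class MultiTableRoutingSketch extends TableReducer<CustomKeyOut, MapWritable, ImmutableBytesWritable> {
	@Override
	protected void reduce(CustomKeyOut key, Iterable<MapWritable> values, Context context)
			throws IOException, InterruptedException {
		// table name set by the driver: conf.set("dcustomer", TABLE_DCUST)
		String dcustomer = context.getConfiguration().get("dcustomer");
		ImmutableBytesWritable target = new ImmutableBytesWritable(Bytes.toBytes(dcustomer));
		Put put = new Put(Bytes.toBytes(key.getKeyOut()));
		put.add(Bytes.toBytes("ci"), Bytes.toBytes("example"), Bytes.toBytes("value"));
		// MultiTableOutputFormat routes this Put to distributable:dcustomer because of the key
		context.write(target, put);
	}
}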
2.PutInDcustomerMapper.java

package com.chinalife.distributable.mergedcid.putIndcustomer;

import java.io.IOException;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;

import com.chinalife.distributable.mergedcid.TempUtil;
import com.chinalife.distributable.util.CustomKeyOut;
import com.chinalife.distributable.util.idCheck;

public class PutInDcustomerMapper extends TableMapper<CustomKeyOut, MapWritable> {
	
	/**
	 * map output: (keyOut, valueOut) = (index, matched dcid / source-table record)
	 */
	@Override
	protected void map(
			ImmutableBytesWritable key,
			Result value,
			Context context)
			throws IOException, InterruptedException {
		CustomKeyOut keyOut;
		MapWritable mapWritable = new MapWritable();
		// tell the two input tables apart
		boolean containsColumn = value.containsColumn("ci".getBytes(), "dcid".getBytes());
		if(containsColumn){
			// the record comes from distributable:dcustomer_index
			keyOut = new CustomKeyOut(new String(key.get()).trim(),0);
			String dcidStr = new String(value.getValue("ci".getBytes(), "dcid".getBytes())).trim();
			mapWritable.put(new Text("old_dcid"), new Text(dcidStr));
		}else{
			// the record comes from distributable:dcust_std
			mapWritable = getCustHashMap(value);
			// attach the source-table row key to the record
			mapWritable.put(new Text("src_rowKey"), new Text(key.get()));
			String indexStr = getIndex(mapWritable);
			if(indexStr != null){
				indexStr = indexStr.trim();
				keyOut = new CustomKeyOut(indexStr,1);
				if(indexStr.equals("surplusField")){
					keyOut = new CustomKeyOut(indexStr,2);
				}
			}else{
				// fields are incomplete: flag the record for the staging table
				keyOut = new CustomKeyOut("imperfectField",3);
			}
		}
		context.write(keyOut, mapWritable);
	}
	/**
	 * Wrap one distributable:dcust_std row into a MapWritable
	 * @param value
	 * @return
	 */
	public MapWritable getCustHashMap(Result value){
		MapWritable custHashMap = new MapWritable();
		for(Cell cell:value.rawCells())
    	{
    		Text cellKey = new Text(new String(CellUtil.cloneQualifier(cell)).trim());
    		Text cellValue = new Text(new String(CellUtil.cloneValue(cell)).trim());
    		// NOTE: despite its name, stringIsNullOrEmpty appears to be used throughout this code as a "has a value" check
    		if(TempUtil.stringIsNullOrEmpty(cellKey)&& TempUtil.stringIsNullOrEmpty(cellValue))
    		{
    			custHashMap.put(cellKey, cellValue);    			
    		}
    	}
		return custHashMap;
	}
	
	/**
	 * Pick one index according to the available fields and the index priority order
	 * @param custHashMap
	 * @return
	 */
	public static String getIndex(MapWritable custHashMap) {
		String name = custHashMap.get(new Text("name"))==null?"":custHashMap.get(new Text("name")).toString().trim();
		String gender = custHashMap.get(new Text("gender"))==null?"":custHashMap.get(new Text("gender")).toString().trim();
		String birthday = custHashMap.get(new Text("birthday"))==null?"":custHashMap.get(new Text("birthday")).toString().trim();
		 
		String idNo = custHashMap.get(new Text("idNo"))==null?"":custHashMap.get(new Text("idNo")).toString().trim();
		String passportNo = custHashMap.get(new Text("passportNo"))==null?"":custHashMap.get(new Text("passportNo")).toString().trim();
		String officerNo = custHashMap.get(new Text("officerNo"))==null?"":custHashMap.get(new Text("officerNo")).toString().trim();
		String inCNNo = custHashMap.get(new Text("inCNNo"))==null?"":custHashMap.get(new Text("inCNNo")).toString().trim();
		String HKNo = custHashMap.get(new Text("HKNo"))==null?"":custHashMap.get(new Text("HKNo")).toString().trim();	
		String TaiWanNo = custHashMap.get(new Text("TaiWanNo"))==null?"":custHashMap.get(new Text("TaiWanNo")).toString().trim();
		String otherNo = custHashMap.get(new Text("otherNo"))==null?"":custHashMap.get(new Text("otherNo")).toString().trim();
		
		String mobileNo = custHashMap.get(new Text("mobileNo"))==null?"":custHashMap.get(new Text("mobileNo")).toString().trim();
		String wechatNo = custHashMap.get(new Text("wechatNo"))==null?"":custHashMap.get(new Text("wechatNo")).toString().trim();
		String email = custHashMap.get(new Text("email"))==null?"":custHashMap.get(new Text("email")).toString().trim();
		String qqNo = custHashMap.get(new Text("QQ"))==null?"":custHashMap.get(new Text("QQ")).toString().trim();
		String otherContact = custHashMap.get(new Text("otherContact"))==null?"":custHashMap.get(new Text("otherContact")).toString().trim();
		
		// name + national ID number
		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(idNo)) {
			return reverse(idNo) +"~I~"+ name;
		}
		// name + passport number
		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(passportNo)) {
			return reverse(passportNo) +"~P~"+  name;
		}
		// name + gender + birthday + (one certificate number)
//		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(gender)
//				&&TempUtil.stringIsNullOrEmpty(birthday)&&TempUtil.stringIsNullOrEmpty(idNo)) {
//			return reverse(idNo) +"~I~"+ name +"~"+ gender +"~"+ birthday;
//		}
//		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(gender)
//				&&TempUtil.stringIsNullOrEmpty(birthday)&&TempUtil.stringIsNullOrEmpty(passportNo)) {
//			return reverse(passportNo) +"~P~"+ name +"~"+ gender +"~"+ birthday;
//		}
		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(gender)
				&&TempUtil.stringIsNullOrEmpty(birthday)&&TempUtil.stringIsNullOrEmpty(officerNo)) {
			return reverse(officerNo) +"~S~"+ name +"~"+ gender +"~"+ birthday;
		}
		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(gender)
				&&TempUtil.stringIsNullOrEmpty(birthday)&&TempUtil.stringIsNullOrEmpty(inCNNo)) {
			return reverse(inCNNo) +"~R~"+  name +"~"+ gender +"~"+ birthday;
		}
		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(gender)
				&&TempUtil.stringIsNullOrEmpty(birthday)&&TempUtil.stringIsNullOrEmpty(HKNo)) {
			return reverse(HKNo) +"~G~"+  name +"~"+ gender +"~"+ birthday;
		}
		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(gender)
				&&TempUtil.stringIsNullOrEmpty(birthday)&&TempUtil.stringIsNullOrEmpty( TaiWanNo)) {
			return reverse(TaiWanNo) +"~W~"+  name +"~"+ gender +"~"+ birthday;
		}
//		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(gender)
//				&&TempUtil.stringIsNullOrEmpty(birthday)&&TempUtil.stringIsNullOrEmpty(otherNo)) {
//			return reverse(otherNo) +"~O~"+  name +"~"+ gender +"~"+ birthday;
//		}
		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(otherNo)) {
			if (idCheck.isValidatedAllIdcard(otherNo)) {
				return reverse(otherNo) +"~I~"+ name;
			} else if (TempUtil.stringIsNullOrEmpty(gender) && TempUtil.stringIsNullOrEmpty(birthday)){ 
				return reverse(otherNo) +"~O~"+  name +"~"+ gender +"~"+ birthday;
			}
		}
		// name + mobile number
		if (TempUtil.stringIsNullOrEmpty(name)&&TempUtil.stringIsNullOrEmpty(mobileNo)) {
			return reverse(mobileNo) +"~"+name;
		}
		// remaining fields
		if(    surplusField(name, wechatNo) ||
				surplusField(name, email) || 
				surplusField(name, qqNo) || 
				surplusField(name, otherContact)){
			return "surplusField";
		}
		
		return null;
	}
	/**
	 * Remaining contact fields are present, but they do not match any index generation rule
	 * @return
	 */
	public static boolean surplusField(String name ,String contactNo){
		if (TempUtil.stringIsNullOrEmpty(name)
			&&TempUtil.stringIsNullOrEmpty(contactNo)) {
			return true;
		} else {
			return false;
		}
	}
	// reverse a string
	public static String reverse(String str) {
		char[] org = str.toCharArray();
		char[] newChar = new char[org.length];
		int num = 1;
		for (char c : org) {
			newChar[org.length - num] = c;
			num++;
		}
		String newString = new String(newChar);
		return newString;
	}
}
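
A quick, hypothetical usage of getIndex(): assuming TempUtil.stringIsNullOrEmpty is effectively a "has a value" check (which is how this class uses it), a record with only name and idNo set hits the first rule and yields reverse(idNo) + "~I~" + name:

package com.chinalife.distributable.mergedcid.putIndcustomer;

import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;

// Hypothetical demo class, not part of the original code.
public class GetIndexExample {
	public static void main(String[] args) {
		MapWritable cust = new MapWritable();
		cust.put(new Text("name"), new Text("ZhangSan"));
		cust.put(new Text("idNo"), new Text("110101199001011234"));    // made-up ID number
		// name + national ID rule: reverse(idNo) + "~I~" + name
		System.out.println(PutInDcustomerMapper.getIndex(cust));
		// expected (under the assumption above): 432110100991101011~I~ZhangSan
	}
}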
3.PutInDcustomerReducer.java

package com.chinalife.distributable.mergedcid.putIndcustomer;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

import com.chinalife.distributable.util.CustomKeyOut;

public class PutInDcustomerReducer extends TableReducer<CustomKeyOut, MapWritable, ImmutableBytesWritable> {
	private static int seq=0;
	private static String taskID = null;

	@Override
	protected void setup( Context context) throws IOException, InterruptedException {	
		taskID = context.getTaskAttemptID().toString().split("_")[4];
		taskID = taskID.substring(taskID.length()-3, taskID.length());
	}
	/**
	 * 1. Records with incomplete fields go to the staging table.
	 * 2. Check whether a dcid was matched:
	 * 	2.1 Matched: append the dcid field and write the record to the dcustomer table.
	 * 	2.2 Not matched: generate a new dcid and then write the record to the dcustomer table.
	 * 		2.2.1 Add the newly generated dcid and its index to distributable:dcustomer_index.
	 * 3. Write the dcid back to distributable:dcust_std.
	 */
	@Override
	protected void reduce(CustomKeyOut index,Iterable<MapWritable> mapWritables, Context context) throws IOException, InterruptedException {
		
		String dcust_temporary = context.getConfiguration().get("dcust_temporary");
		String dcustomer = context.getConfiguration().get("dcustomer");
		String dcustomer_index = context.getConfiguration().get("dcustomer_index");
		String dcust_std = context.getConfiguration().get("dcust_std");
		// the index string
		String indexStr = new String(index.getKeyOut().trim());
		
		if(indexStr.contains("surplusField")){
			// records that do not match any index rule need no merging: store each with its own unique dcid
			//---------------
			for(MapWritable mapWritable:mapWritables){
				// get a new dcid
				String newDcid = getNewDcid();
				// write into the dcustomer table
				ImmutableBytesWritable dcustomer_Immu = new ImmutableBytesWritable();
				dcustomer_Immu.set(Bytes.toBytes(dcustomer));
				Put dcustomerPut = getDcustomerPut(mapWritable, newDcid);
				context.write(dcustomer_Immu, dcustomerPut);
			}
			//---------------
		}else if(indexStr.contains("imperfectField")){
			// 1. put the records with incomplete fields into the staging table
			ImmutableBytesWritable std_temp = new ImmutableBytesWritable();
			std_temp.set(Bytes.toBytes(dcust_temporary));
			Put src_rowData = null;
			
			for(MapWritable mapWritable : mapWritables){
				String src_rowKey = mapWritable.get(new Text("src_rowKey")).toString();
				// source-table row key
				src_rowData = new Put(src_rowKey.getBytes());
				// add the remaining fields
				Set<Writable> keySet = mapWritable.keySet();
				for(Writable key : keySet){
					if(!key.equals(new Text("src_rowKey"))){
						String value = mapWritable.get(key).toString();
						src_rowData.add("ci".getBytes(), key.toString().getBytes(), value.getBytes());
					}
				}
				src_rowData.add("ci".getBytes(), "hstamp".getBytes(), getStringDate().getBytes());
				// write to the staging table
				//System.out.println("distributable:std_imperfectField_temp:"+src_rowKey);
				context.write(std_temp, src_rowData);
			}
		}else{
			// records with complete fields
			String old_dcid = null;
			boolean generateFlag = false;  // flag: whether a new dcid was generated
			HashMap<String, String> srcMap = null;
			Iterator<MapWritable> iterator = mapWritables.iterator();
			// take the first element
			MapWritable mapWritable = iterator.next();
			// check whether a dcid was matched
			Writable dcidWritable = mapWritable.get(new Text("old_dcid"));
			if(dcidWritable == null){
				// no dcid matched: generate a new one
				old_dcid = getNewDcid();
				generateFlag = true;
				// add the newly generated dcid and its index to distributable:dcustomer_index
				ImmutableBytesWritable index_dcid = new ImmutableBytesWritable();
				index_dcid.set(Bytes.toBytes(dcustomer_index));
				Put index_dcid_put = new Put(indexStr.getBytes());
				index_dcid_put.add("ci".getBytes(), "dcid".getBytes(), old_dcid.getBytes());
				index_dcid_put.add("ci".getBytes(), "hstamp".getBytes(),getStringDate().getBytes() );
				context.write(index_dcid, index_dcid_put);
				
				// write into the dcustomer table ------
				ImmutableBytesWritable dcustomer_Immu = new ImmutableBytesWritable();
				dcustomer_Immu.set(Bytes.toBytes(dcustomer));
				Put dcustomerPut = getDcustomerPut(mapWritable, old_dcid);
				context.write(dcustomer_Immu, dcustomerPut);
			} else {
				// remember the matched dcid
				old_dcid = dcidWritable.toString();
			}
			// iterate over the remaining elements
			while(iterator.hasNext()){
				mapWritable = iterator.next();
				// case: a new dcid was generated
				if (generateFlag) {
					if (mapWritable.get(new Text("src_rowKey")) != null) {
						// write to the dcustomer table
						ImmutableBytesWritable dcustomer_Immu = new ImmutableBytesWritable();
						dcustomer_Immu.set(Bytes.toBytes(dcustomer));
						Put dcustomerPut = getDcustomerPut(mapWritable, old_dcid);
						context.write(dcustomer_Immu, dcustomerPut);
					} else {
						System.out.println("-----------------Wrong Data-------------------");
						for (Entry<Writable, Writable> en : mapWritable.entrySet()) {
							System.out.println(en.getKey().toString() +"  " +en.getValue().toString());
						}
						System.out.println("The src_rowkey is :"  + mapWritable.get(new Text("src_rowKey")) + "--------" + indexStr);
						System.out.println("----------------------------------------------------");
					}
				} else {    // case: the dcid was obtained by matching an existing index
					if (mapWritable.get(new Text("src_rowKey")) != null) {
						// write to the dcustomer table
						ImmutableBytesWritable dcustomer_Immu = new ImmutableBytesWritable();
						dcustomer_Immu.set(Bytes.toBytes(dcustomer));
						String srcRowKey = mapWritable.get(new Text("src_rowKey")).toString();
						String partyId = null;
						if (mapWritable.get(new Text("party_id")) != null) {
							partyId = mapWritable.get(new Text("party_id")).toString();
						}
						String dcustomerRowKey = old_dcid+"~"+srcRowKey;
						Put dcustomerPut = new Put(dcustomerRowKey.getBytes());
						Set<Writable> keySet = mapWritable.keySet();
						for(Writable key : keySet){
							if(!key.equals(new Text("src_rowKey"))){
								String value = mapWritable.get(key).toString();
								dcustomerPut.add("ci".getBytes(), key.toString().getBytes(), value.getBytes());
							}
						}
						// if party_id exists, set the existing-customer flag; if not, set the new-prospect and prospect flags
//						if (partyId != null && !partyId.equals("")) {
//							dcustomerPut.add("ci".getBytes(), "partyflag".getBytes(), "1".getBytes());
//						} else {
//							dcustomerPut.add("ci".getBytes(), "dcidflag".getBytes(), "1".getBytes());
//						}
						dcustomerPut.add("ci".getBytes(), "hstamp".getBytes(),getStringDate().getBytes() );
						// reset the address-resolution flag
						dcustomerPut.add("ci".getBytes(), "analyzeflag".getBytes(),"N".getBytes());
						context.write(dcustomer_Immu, dcustomerPut);
					} else {
						System.out.println("-----------------Wrong Data-------------------");
						for (Entry<Writable, Writable> en : mapWritable.entrySet()) {
							System.out.println(en.getKey().toString() +"  " +en.getValue().toString());
						}
						System.out.println("The src_rowkey is :"  + mapWritable.get(new Text("src_rowKey")) + "--------" + indexStr);
						System.out.println("----------------------------------------------------");
					}
				}
			}
		}
		
	}
	
	/**
	 * For records that get a newly generated dcid: build the Put for the dcustomer table from the dcid and the field map
	 * @return Put
	 */
	private Put getDcustomerPut(MapWritable mapWritable, String newDcid) {
		String srcRowKey = mapWritable.get(new Text("src_rowKey")).toString();
		String partyId = null;
		if (mapWritable.get(new Text("party_id")) != null) {
			partyId = mapWritable.get(new Text("party_id")).toString();
		}
		String dcustomerRowKey = newDcid+"~"+srcRowKey;
		Put dcustomerPut = new Put(dcustomerRowKey.getBytes());
		Set<Writable> keySet = mapWritable.keySet();
		for(Writable key : keySet){
			if(!key.equals(new Text("src_rowKey"))){
				String value = mapWritable.get(key).toString();
				dcustomerPut.add("ci".getBytes(), key.toString().getBytes(), value.getBytes());
			}
		}
		// if party_id exists, set the existing-customer flag; if not, set the new-prospect and prospect flags
//		if (partyId != null && !partyId.equals("")) {
//			dcustomerPut.add("ci".getBytes(), "partyflag".getBytes(), "1".getBytes());
//		} else {
//			dcustomerPut.add("ci".getBytes(), "newflag".getBytes(),"1".getBytes());
//			dcustomerPut.add("ci".getBytes(), "dcidflag".getBytes(), "1".getBytes());
//		}
		dcustomerPut.add("ci".getBytes(), "hstamp".getBytes(),getStringDate().getBytes() );
		
		// reset the address-resolution flag
		dcustomerPut.add("ci".getBytes(), "analyzeflag".getBytes(),"N".getBytes());
		return dcustomerPut;
	}
	
	public String getNewDcid(){
		// generate a new dcid: yyMMddHHmmss timestamp + 3-digit task id + 6-digit zero-padded sequence number
		String newDcid="";
		Date date=new Date();
		SimpleDateFormat format = new SimpleDateFormat("yyMMddHHmmss");// date format
		String dateString = format.format(date);// format as a string
		seq++;
		String innerId="000000"+seq;// zero-pad the sequence number
		innerId=innerId.substring(innerId.length()-6, innerId.length());
		newDcid=dateString+taskID+innerId;
		// reverse the new dcid
		return reverse(newDcid);
	}
	// reverse a string
	public static String reverse(String str) {
		char[] org = str.toCharArray();
		char[] newChar = new char[org.length];
		int num = 1;
		for (char c : org) {
			newChar[org.length - num] = c;
			num++;
		}
		String newString = new String(newChar);
		return newString;
	}
	/**
	 * Format the timestamp that gets stored
	 * @return
	 */
	public static String getStringDate() {
	   Date currentTime = new Date();
	   SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
	   String dateString = formatter.format(currentTime);
	   return dateString;
	}
}
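
For reference, a small standalone sketch (with an assumed task id) of the dcid layout that getNewDcid() produces: a yyMMddHHmmss timestamp, the 3-digit task id and a 6-digit zero-padded sequence number, concatenated and then reversed, presumably so that new row keys are not monotonically increasing:

import java.text.SimpleDateFormat;
import java.util.Date;

// Hypothetical demo class, not part of the original code.
public class DcidFormatExample {
	public static void main(String[] args) {
		String dateString = new SimpleDateFormat("yyMMddHHmmss").format(new Date());
		String taskID = "007";                                  // assumed last 3 digits of the task attempt id
		int seq = 1;
		String innerId = "000000" + seq;                        // zero-pad to 6 digits
		innerId = innerId.substring(innerId.length() - 6);
		String dcid = new StringBuilder(dateString + taskID + innerId).reverse().toString();
		System.out.println(dcid);                               // 21 characters: "100000" + "700" + reversed timestamp digits
	}
}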
4.CustomKeyOut.java

package com.chinalife.distributable.util;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Custom key class passed between map and reduce; a generic class for secondary sort.
 * It has two fields, keyOut and order:
 * 1. keyOut : the original keyOut of the map phase
 * 2. order  : sequence number used for the secondary sort
 * @author wangqingchun
 *
 */
public class CustomKeyOut implements WritableComparable<CustomKeyOut>{
	
	public String keyOut;	// the keyOut of map(keyOut, valueOut)
	public int	order;		// secondary-sort sequence number
	
	
	/**
	 * Deserialization via reflection needs a no-arg constructor, so one is defined explicitly
	 */
	public CustomKeyOut() {}

	/**
	 * Convenience constructor for initializing the object
	 * @param keyOut	the keyOut of map(keyOut, valueOut)
	 * @param order		secondary-sort sequence number
	 */
	public CustomKeyOut(String keyOut, int order) {
		this.keyOut = keyOut;
		this.order = order;
	}

	/**
	 * Note: the read order must match the serialization (write) order
	 */
	@Override
	public void readFields(DataInput input) throws IOException {
		this.keyOut = input.readUTF();
		this.order = input.readInt();
	}

	/**
	 * Serialize the object's fields to the stream
	 */
	@Override
	public void write(DataOutput output) throws IOException {
		output.writeUTF(keyOut);
		output.writeInt(order);
	}

	/**
	 * Implementation of the sort rule.
	 * Current rule: compare the keyOut field first, then the order field.
	 * To use a different sort order, just override this method.
	 */
	@Override
	public int compareTo(CustomKeyOut otherObj) {
		int res = this.keyOut.compareTo(otherObj.keyOut);
		if(res == 0){
			return this.order - otherObj.order;
		}
		return res;
	}
	

	/**
	 * toString is overridden to return the keyOut field
	 */
	@Override
	public String toString() {
		return getKeyOut();
	}

	public String getKeyOut() {
		return keyOut;
	}

	public void setKeyOut(String keyOut) {
		this.keyOut = keyOut;
	}

	public int getOrder() {
		return order;
	}

	public void setOrder(int order) {
		this.order = order;
	}

	

}
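
A small, hypothetical demo of the compareTo rule above: within the same keyOut, the index-table record (order 0) sorts ahead of the source-table records, which is exactly what lets the reducer look at the first value to find an existing old_dcid:

package com.chinalife.distributable.util;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

// Hypothetical demo class, not part of the original code.
public class SecondarySortExample {
	public static void main(String[] args) {
		List<CustomKeyOut> keys = Arrays.asList(
				new CustomKeyOut("432110100991101011~I~ZhangSan", 1),   // source-table record
				new CustomKeyOut("432110100991101011~I~ZhangSan", 0),   // index-table record
				new CustomKeyOut("surplusField", 2));
		Collections.sort(keys);
		for (CustomKeyOut k : keys) {
			System.out.println(k.getKeyOut() + " / " + k.getOrder());
		}
		// The two "...~I~ZhangSan" keys stay together and the order-0 record comes first.
	}
}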
5.CustomPC.java

package com.chinalife.distributable.util;

import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Custom partitioner and grouping classes
 * @author wangqingchun
 *
 */
public class CustomPC {
	
	/**
	 * Custom partitioner: the partition is computed only from keyOut, as hashCode % numReduceTasks
	 * @author wangqingchun
	 *
	 */
	public static class CustomPartitioner extends Partitioner<CustomKeyOut,Object>{

		@Override
		public int getPartition(CustomKeyOut keyOut, Object valOut,
				int numReduceTasks) {
			return Math.abs(keyOut.getKeyOut().hashCode()%numReduceTasks);
		}
		
	}
	/**
	 * Custom grouping class: records are grouped by comparing only the keyOut field
	 * (despite its name, CustomCombiner is registered as the grouping comparator, not as a combiner)
	 * @author wangqingchun
	 *
	 */
	public static class CustomCombiner extends WritableComparator{
		
		public CustomCombiner(){
			super(CustomKeyOut.class,true);
		}
		@Override
		public int compare(WritableComparable a, WritableComparable b) {
			CustomKeyOut customKeyOut_a = (CustomKeyOut)a;
			CustomKeyOut customKeyOut_b = (CustomKeyOut)b;
			return customKeyOut_a.getKeyOut().compareTo(customKeyOut_b.getKeyOut());
		}
	}
}
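
A small, hypothetical demo of how the two classes cooperate for the secondary sort: the partitioner and the grouping comparator both look only at keyOut, so all records sharing an index string go to the same reduce task and the same reduce() call, while the order field only affects their ordering inside that call:

package com.chinalife.distributable.util;

// Hypothetical demo class, not part of the original code.
public class PartitionGroupExample {
	public static void main(String[] args) {
		CustomKeyOut indexRecord  = new CustomKeyOut("432110100991101011~I~ZhangSan", 0);
		CustomKeyOut sourceRecord = new CustomKeyOut("432110100991101011~I~ZhangSan", 1);

		// Partitioning ignores order: both records land on the same reduce task.
		CustomPC.CustomPartitioner partitioner = new CustomPC.CustomPartitioner();
		System.out.println(partitioner.getPartition(indexRecord, null, 200));
		System.out.println(partitioner.getPartition(sourceRecord, null, 200));    // same value as above

		// Grouping also ignores order: compare() returns 0, so both go into one reduce() call.
		CustomPC.CustomCombiner grouping = new CustomPC.CustomCombiner();
		System.out.println(grouping.compare(indexRecord, sourceRecord));          // 0
	}
}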






Reposted from blog.csdn.net/wang11yangyang/article/details/73649305