Learning Hadoop from Scratch Series: A Look at Bulk Load

Bulk load is a good fit for the following scenario:

1. Importing data in large batches: bulk load writes the data as HFiles and hands them to HBase directly, bypassing the normal write path (WAL and MemStore) and greatly reducing the load on the RegionServers.


Step 1: convert the files on HDFS into HFiles.

Step 2: move the HFiles into HBase.

The code is as follows:

package com.cloudera.examples.hbase.bulkimport;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

 
public class GeneratePutHFileAndBulkLoadToHBase {

     
    public static class ConvertWordCountOutToHFileMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put>
    {
 
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String wordCountStr=value.toString();
            String[] wordCountArray=wordCountStr.split("\001");
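            // Expected input record layout after splitting on Ctrl-A (\001), one value per field:
            //   word | kgn | kra | cpcz | teamcol | click | label | pp
            // (a line with fewer than 8 fields would cause an ArrayIndexOutOfBoundsException below)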
            String word=wordCountArray[0];
            String kgn=wordCountArray[1];
            String kra=wordCountArray[2];
            String cpcz=wordCountArray[3];
            String teamcol=wordCountArray[4];
            String click=wordCountArray[5];
            String label=wordCountArray[6];
            String pp=wordCountArray[7];
            
            // Use the first field as the HBase row key
            byte[] rowKey=Bytes.toBytes(word);
            ImmutableBytesWritable rowKeyWritable=new ImmutableBytesWritable(rowKey);
            byte[] family=Bytes.toBytes("cf");
            byte[] qualifier=Bytes.toBytes("kgn");
            byte[] hbaseValue=Bytes.toBytes(kgn);
            
            byte[] qualifier1=Bytes.toBytes("kra");
            byte[] hbaseValue1=Bytes.toBytes(kra);
            byte[] qualifier2=Bytes.toBytes("cpcz");
            byte[] hbaseValue2=Bytes.toBytes(cpcz);
            byte[] qualifier3=Bytes.toBytes("teamcol");
            byte[] hbaseValue3=Bytes.toBytes(teamcol);
            byte[] qualifier4=Bytes.toBytes("click");
            byte[] hbaseValue4=Bytes.toBytes(click);
            byte[] qualifier5=Bytes.toBytes("label");
            byte[] hbaseValue5=Bytes.toBytes(label);
            byte[] qualifier6=Bytes.toBytes("pp");
            byte[] hbaseValue6=Bytes.toBytes(pp);
            
            // Records could be filtered out here based on conditions such as a timestamp

            // Put is used to write several columns under one column family; with only a single
            // column, a KeyValue could be emitted instead:
            // KeyValue keyValue = new KeyValue(rowKey, family, qualifier, hbaseValue);
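            // Note: Put.add(family, qualifier, value) is deprecated in HBase 1.x; on newer client
            // versions use Put.addColumn(family, qualifier, value) instead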
            Put put=new Put(rowKey);
            put.add(family, qualifier, hbaseValue);
            put.add(family, qualifier1, hbaseValue1);
            put.add(family, qualifier2, hbaseValue2);
            put.add(family, qualifier3, hbaseValue3);
            put.add(family, qualifier4, hbaseValue4);
            put.add(family, qualifier5, hbaseValue5);
            put.add(family, qualifier6, hbaseValue6);
            context.write(rowKeyWritable, put);
             
        }
         
    }
     
    public static void main(String[] arg) throws Exception {
        // Hard-coded HDFS input and output paths (these override any command-line arguments)
        String[] args = {"/user/zhoulh/input", "/user/zhoulh/output"};
        Configuration hadoopConfiguration = new Configuration();
        // Submit as the hbase user so the generated HFiles end up owned by hbase (see the note at the end)
        System.setProperty("HADOOP_USER_NAME", "hbase");
        // Cluster-specific settings: ZooKeeper quorum, ZooKeeper client port and the HDFS NameNode
        hadoopConfiguration.set("hbase.zookeeper.quorum", "n1,n2,n3,n4");
        hadoopConfiguration.set("hbase.zookeeper.property.clientPort", "2181");
        hadoopConfiguration.set("fs.defaultFS", "hdfs://n1:8020");
		
        String[] dfsArgs = new GenericOptionsParser(hadoopConfiguration, args).getRemainingArgs();
        FileSystem fs = FileSystem.get(hadoopConfiguration);
        // Delete any existing output directory now (deleteOnExit() would only remove it at JVM shutdown,
        // so a leftover directory would make the HFile job fail)
        if (fs.exists(new Path(dfsArgs[1]))) {
            fs.delete(new Path(dfsArgs[1]), true);
        }
        
        // Only a Mapper is needed: it parses each record of the upstream job's output and converts it into the Put form HBase expects
        Job appUserProfileInterest=new Job(hadoopConfiguration, "app_user_profile_interest_bulkload");
         
        appUserProfileInterest.setJarByClass(GeneratePutHFileAndBulkLoadToHBase.class);
        appUserProfileInterest.setMapperClass(ConvertWordCountOutToHFileMapper.class);
        // No reducer class needs to be set: the framework picks KeyValueSortReducer or PutSortReducer based on the map output value class
        //convertWordCountJobOutputToHFileJob.setReducerClass(KeyValueSortReducer.class);
        appUserProfileInterest.setMapOutputKeyClass(ImmutableBytesWritable.class);
        appUserProfileInterest.setMapOutputValueClass(Put.class);
        
        // Input and output directories on HDFS
        FileInputFormat.addInputPath(appUserProfileInterest, new Path(dfsArgs[0]));
        FileOutputFormat.setOutputPath(appUserProfileInterest, new Path(dfsArgs[1]));

        // Open the target table; the HBase settings are carried by the hadoopConfiguration built above
        HTable app_user_profile_interest_hbase = new HTable(hadoopConfiguration, "dd_b_basic_u1_app_user_profile_interest_hbase");
        HFileOutputFormat2.configureIncrementalLoad(appUserProfileInterest, app_user_profile_interest_hbase);
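        // configureIncrementalLoad() reads the table's current region boundaries, configures a
        // TotalOrderPartitioner so each reducer produces HFiles for a single region, and sets
        // PutSortReducer and HFileOutputFormat2 as the job's reducer and output format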
        
        // Run the job to generate the HFiles
        int convertWordCountJobOutputToHFileJobResult=appUserProfileInterest.waitForCompletion(true)?0:1;
        System.out.println(convertWordCountJobOutputToHFileJobResult);
        // Once the HFiles have been generated, bulk load them into the table
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(hadoopConfiguration);
        // doBulkLoad() takes the directory of generated HFiles and the target table; it moves the
        // files into the table's region directories, splitting any HFile that spans a region boundary
        System.out.println("outpath:" + dfsArgs[1]);
        loader.doBulkLoad(new Path(dfsArgs[1]), app_user_profile_interest_hbase);
         
        System.out.println("done");
    }
 
}
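
For reference, once the HFiles have been generated, the load step can also be driven with the completebulkload tool that ships with HBase instead of calling LoadIncrementalHFiles from Java. A sketch, reusing the output path and table name from the code above:

hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /user/zhoulh/output dd_b_basic_u1_app_user_profile_interest_hbase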
If the program hangs at

loader.doBulkLoad(new Path(dfsArgs[1]), app_user_profile_interest_hbase);

check the RegionServer logs. In this case the hbase user did not have permission on the output directory; granting it access to that directory clears the hang.

In addition, the files generated under the output directory need to be owned by the hbase user (which is why HADOOP_USER_NAME is set to "hbase" in the code above).
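
For example, a minimal fix run from an HDFS superuser session might look like the following, assuming the output path used in this post; adjust the path and group to your cluster:

hdfs dfs -chown -R hbase:hbase /user/zhoulh/output
hdfs dfs -chmod -R 775 /user/zhoulh/output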


Reposted from blog.csdn.net/jdzms23/article/details/45078175