Bulk-loading data into HBase with bulkload


Reference:

http://www.cnblogs.com/alexwu59/p/6635437.html

Bulkload converts HDFS files into HFiles and then loads them into HBase's storage directory.

Advantages: data is imported into HBase in bulk and quickly, and the process is simple and easy to control.

Disadvantages: you need to write dedicated MapReduce code and package it as a JAR, and the input data has to follow a fixed format that the code is written to parse. In addition, a single run of this tool effectively imports with VERSIONS=1: for a given cell only the last value is kept and the rest are lost, so you cannot load multiple versions of the same cell in one run. Separate runs are not affected, and versions written by different runs are preserved.

Detailed steps:

1) Package the code into a JAR named hbase-bulkload-common-1.0.jar.
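Note: the commands in the following steps pass no main-class name to hadoop jar, so they assume the JAR's manifest declares hbase.service.BulkLoadJob as its main class; if it does not, append the fully qualified class name right after the JAR name.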

2) Prepare the data. Each line of the input data should have the format:

rowkey1 columnFamily:columnName1 cell1

rowkey2 columnFamily:columnName2 cell2

rowkey3 columnFamily:columnName3 cell3
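For example, assuming a hypothetical table with a column family named cf, valid input lines could look like the following (fields are whitespace-separated; a cell value that itself contains whitespace would fail the mapper's three-field check and be skipped):

row001 cf:name Alice

row002 cf:age 30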

3) Run the following command to convert the input data into HFiles:

$ HADOOP_CLASSPATH=`/opt/cloudera/parcels/CDH/bin/hbase classpath` hadoop jar hbase-bulkload-common-1.0.jar -D mapreduce.reduce.memory.mb=4096 ${inputPath}/data ${outputPath}/data_out ${HbaseTableName}

If the data volume is very large, or the data is unevenly distributed across the row keys, individual reducers can run very slowly and may need more memory.
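If raising the container memory alone is not enough, the reducer JVM heap usually needs to grow with it. A hedged example with illustrative values only (mapreduce.reduce.java.opts should stay below mapreduce.reduce.memory.mb):

$ HADOOP_CLASSPATH=`/opt/cloudera/parcels/CDH/bin/hbase classpath` hadoop jar hbase-bulkload-common-1.0.jar -D mapreduce.reduce.memory.mb=8192 -D mapreduce.reduce.java.opts=-Xmx6g ${inputPath}/data ${outputPath}/data_out ${HbaseTableName}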

4) Change the permissions on the HFile output directory to avoid errors during the load:

$ sudo -u hdfs hdfs dfs -chmod -R 777 ${outputPath}/data_out

5) Load the HFiles into HBase:

$ HADOOP_CLASSPATH=`/opt/cloudera/parcels/CDH/bin/hbase classpath` hadoop jar hbase-server-*.jar completebulkload ${outputPath}/data_out ${HbaseTableName}
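The hbase-server-*.jar wildcard only resolves if the command is run from the directory that contains the JAR; on a CDH parcel installation that directory is typically /opt/cloudera/parcels/CDH/lib/hbase/ (treat this path as an assumption to verify in your environment). An equivalent way to run the same loader, avoiding the JAR path entirely, is through the hbase launcher:

$ hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles ${outputPath}/data_out ${HbaseTableName}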

Setting MapReduce parameters (passed with -D via the generic options):

hadoop jar app.jar -D mapreduce.job.queuename=root.etl.distcp -D mapreduce.job.priority=HIGH

hadoop jar <jarName> -D mapreduce.map.memory.mb=5120

hadoop jar <jarName> -D mapreduce.reduce.memory.mb=4096

The code depends on the Hadoop and HBase client JARs.
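In practice this usually means hadoop-client together with hbase-client and, depending on the HBase version, hbase-server or hbase-mapreduce (which provide HFileOutputFormat2 and LoadIncrementalHFiles); treat this list as an assumption and match the artifacts and versions to your cluster.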

The source code is as follows:

package hbase.service;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
//import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
//import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
//import java.net.URI;

public class BulkLoadJob extends Configured implements Tool {

    static Logger logger = LoggerFactory.getLogger(BulkLoadJob.class);

    public static class BulkLoadMap extends
            Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] valueStrSplit = value.toString().split("\\s+");
            /**
             * Validate the input line first:
             * it must split into three whitespace-separated fields,
             * and the middle field must contain both the column family and the column name.
             */
            if (valueStrSplit.length == 3 && valueStrSplit[1].split(":").length >= 2) {
                String hkey = valueStrSplit[0];
                String family = valueStrSplit[1].split(":", 2)[0];
                String column = valueStrSplit[1].split(":", 2)[1];
                String hvalue = valueStrSplit[2];
                final byte[] rowKey = Bytes.toBytes(hkey);
                final ImmutableBytesWritable HKey = new ImmutableBytesWritable(rowKey);
                Put put = new Put(rowKey);
                byte[] cell = Bytes.toBytes(hvalue);
                put.addColumn(Bytes.toBytes(family), Bytes.toBytes(column), cell);
                context.write(HKey, put);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        int excord = ToolRunner.run(new Configuration(), new BulkLoadJob(), args);
        System.exit(excord);
    }

    public int run(String[] arg0) throws Exception {
        if (arg0.length != 3) {
            logger.error("parameters error, requested parameter is 3, but input is " + arg0.length + ";");
            logger.error("parameter list: inputPath, outputPath, hbaseTableName");
            return 1;
        }
        String inputPath = arg0[0];
        String outputPath = arg0[1];
        String hbaseTableName = arg0[2];
        logger.info("------------------------------------------");
        logger.info("inputPath:" + inputPath);
        logger.info("outputPath:" + outputPath);
        logger.info("hbaseTableName:" + hbaseTableName);
        logger.info("------------------------------------------");

        Configuration conf;
        Connection connection = null;
        Admin admin;
        Table table = null;
        try {
            // Merge the HBase configuration with the Hadoop configuration,
            // which already carries any -D generic options parsed by ToolRunner.
            conf = HBaseConfiguration.create(getConf());
            Job job = Job.getInstance(conf, "Buck-HBaseLoad");
            connection = ConnectionFactory.createConnection(conf);
            admin = connection.getAdmin();
            table = connection.getTable(TableName.valueOf(hbaseTableName));

            job.setJarByClass(BulkLoadJob.class);
            job.setMapperClass(BulkLoadJob.BulkLoadMap.class);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(Put.class);
            // speculation
            job.setSpeculativeExecution(false);
            job.setReduceSpeculativeExecution(false);
            // in/out format
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(HFileOutputFormat2.class);
            FileInputFormat.setInputPaths(job, inputPath);
            FileOutputFormat.setOutputPath(job, new Path(outputPath));
            HFileOutputFormat2.configureIncrementalLoad(job, table,
                    connection.getRegionLocator(TableName.valueOf(hbaseTableName)));
            /**
             * The commented-out block below changes the permissions of the HDFS
             * output directory to 777 and then loads the HFiles into HBase
             * automatically; uncomment it to skip steps 4) and 5).
             */
            if (job.waitForCompletion(true)) {
                /* FsShell shell = new FsShell(conf);
                try {
                    shell.run(new String[]{"hdfs", "dfs", "-chmod", "-R", "777", outputPath});
                } catch (Exception e) {
                    logger.error("Couldnt change the file permissions ", e);
                    throw new IOException(e);
                }
                // load the HFiles into the HBase table
                LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
                loader.doBulkLoad(new Path(outputPath),
                        admin, table, connection.getRegionLocator(TableName.valueOf(hbaseTableName)));
                */
            } else {
                logger.error("loading failed.");
                System.exit(1);
            }
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } finally {
            if (table != null) {
                table.close();
            }
            if (connection != null) {
                connection.close();
            }
        }
        return 0;
    }
}

Reposted from blog.csdn.net/qq_32555899/article/details/81606311