The data in HBase is ultimately stored on HDFS, and HBase natively supports MapReduce: an MR job can process the data in HBase directly, and can also write its results straight back into HBase.
First, read the data in the myuser table and write it to another HBase table
Read data from one HBase table and then write that data to another HBase table. Note: we can use TableMapper and TableReducer to read data from and write data to HBase.
Here we write the name and age columns of column family f1 in the myuser table into column family f1 of the myuser2 table.
1. Create the myuser2 table
hbase(main):010:0> create 'myuser2','f1'
2. Create a Maven project and import the required jar dependencies
<!-- Cloudera repository: the CDH-versioned Hadoop/HBase artifacts below are resolved from here. -->
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>

<dependencies>
    <!-- Hadoop client APIs (Configuration, Job, Mapper, ToolRunner, ...). -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.6.0-mr1-cdh5.14.0</version>
    </dependency>
    <!-- HBase client APIs (Connection, Table, Put, Scan, ...). -->
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>1.2.0-cdh5.14.0</version>
    </dependency>
    <!-- Provides the org.apache.hadoop.hbase.mapreduce classes used by the MR jobs. -->
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>1.2.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.testng</groupId>
        <artifactId>testng</artifactId>
        <version>6.14.3</version>
        <scope>test</scope>
    </dependency>
</dependencies>

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
                <!-- <verbal>true</verbal>-->
            </configuration>
        </plugin>
        <!-- Builds a shaded jar at package time that bundles the dependencies with the job classes. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.2</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <!-- Strip the signature files of signed dependencies: they no longer
                                     match the merged jar and would make the JVM reject it. -->
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
3. Develop the MR program
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
/**
 * MapReduce job that copies the {@code f1:name} and {@code f1:age} cells of every row of the
 * HBase table {@code myuser} into the HBase table {@code myuser2}.
 *
 * <p>The mapper reads rows through {@link TableMapper}; the reducer writes the resulting
 * {@link Put}s through {@link TableReducer}.
 */
public class HBaseMR extends Configured implements Tool {

    /** Table the job scans. */
    private static final String SOURCE_TABLE = "myuser";

    /** Table the job writes to; it must already exist with column family {@code f1}. */
    private static final String TARGET_TABLE = "myuser2";

    /**
     * Emits one {@link Put} per source row, holding only the {@code f1:name} and
     * {@code f1:age} cells of that row.
     */
    public static class HBaseMapper extends TableMapper<Text, Put> {
        /**
         * @param key     the row key of the current row
         * @param value   the current row; all of its cells are wrapped in this result
         * @param context the task context used to emit the output
         * @throws IOException          if a cell cannot be added to the put or the write fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            String rowKey = Bytes.toString(key.get());
            Put put = new Put(key.get());
            for (Cell cell : value.rawCells()) {
                if (!"f1".equals(Bytes.toString(CellUtil.cloneFamily(cell)))) {
                    continue;
                }
                String qualifier = Bytes.toString(CellUtil.cloneQualifier(cell));
                if ("name".equals(qualifier) || "age".equals(qualifier)) {
                    put.add(cell);
                }
            }
            // Rows that have neither f1:name nor f1:age produce no output.
            if (!put.isEmpty()) {
                context.write(new Text(rowKey), put);
            }
        }
    }

    /** Forwards every {@link Put} received from the mappers to the output table unchanged. */
    public static class HBaseReducer extends TableReducer<Text, Put, ImmutableBytesWritable> {
        @Override
        protected void reduce(Text key, Iterable<Put> values, Context context)
                throws IOException, InterruptedException {
            for (Put value : values) {
                // The output key is passed as null; each Put already carries its own row key.
                context.write(null, value);
            }
        }
    }

    /**
     * Configures and runs the copy job.
     *
     * @param args command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf());
        job.setJarByClass(this.getClass());
        Scan scan = new Scan();
        // Use the TableMapReduceUtil helper to initialise the mapper (input table, scan, output types).
        // This call used to be fused into a comment line, so the mapper was never configured.
        TableMapReduceUtil.initTableMapperJob(
                TableName.valueOf(SOURCE_TABLE), scan, HBaseMapper.class, Text.class, Put.class, job);
        // Use the TableMapReduceUtil helper to initialise the reducer (output table).
        TableMapReduceUtil.initTableReducerJob(TARGET_TABLE, HBaseReducer.class, job);
        job.setNumReduceTasks(1);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        System.exit(ToolRunner.run(conf, new HBaseMR(), args));
    }
}
4. Package and run
Package the project with the shade plugin, so that the jars HBase depends on are bundled into the project jar.
Then run:
yarn jar hbaseStudy-1.0-SNAPSHOT.jar cn.itcast.hbasemr.HBaseMR
Alternatively, set the environment variables below so that the HBase jars are on Hadoop's classpath, and then run the original (unshaded) jar:
export HADOOP_HOME=/export/servers/hadoop-2.6.0-cdh5.14.0/
export HBASE_HOME=/export/servers/hbase-1.2.0-cdh5.14.0/
export HADOOP_CLASSPATH=`${HBASE_HOME}/bin/hbase mapredcp`
yarn jar original-hbaseStudy-1.0-SNAPSHOT.jar cn.itcast.hbasemr.HBaseMR
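Second, read a file from HDFS and write its contents into an HBase table
1. Prepare the data file and upload it to HDFS. The fields (row key, name, age) must be separated by tab characters, because the MR program splits each line on "\t". After creating user.txt, upload it with: hdfs dfs -put user.txt /hbase/input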
hdfs dfs -mkdir -p /hbase/input
cd /export/servers/
vim user.txt
0007 zhangsan 18
0008 lisi 25
0009 wangwu 20
2. Develop the MR program
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
/**
 * MapReduce job that reads tab-separated text lines ({@code rowkey \t name \t age}) from
 * {@code hdfs://node01:8020/hbase/input} and writes them into column family {@code f1} of the
 * HBase table {@code myuser2}.
 */
public class Hdfs2Hbase extends Configured implements Tool {

    // Encoded once with Bytes.toBytes (always UTF-8) instead of String.getBytes(), which
    // depends on the platform default charset.
    private static final byte[] FAMILY = Bytes.toBytes("f1");
    private static final byte[] NAME = Bytes.toBytes("name");
    private static final byte[] AGE = Bytes.toBytes("age");

    /**
     * Configures and runs the import job.
     *
     * @param args command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf(), "hdfs2Hbase");
        job.setJarByClass(Hdfs2Hbase.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/hbase/input"));

        job.setMapperClass(HdfsMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        TableMapReduceUtil.initTableReducerJob("myuser2", HBaseReducer.class, job);
        job.setNumReduceTasks(1);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        System.exit(ToolRunner.run(conf, new Hdfs2Hbase(), args));
    }

    /** Passes every input line through unchanged as the map output key. */
    public static class HdfsMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(value, NullWritable.get());
        }
    }

    /**
     * Parses each distinct input line into a {@link Put} on {@code f1:name} and {@code f1:age}.
     * The age is stored as the 4-byte encoding of an {@code int}.
     */
    public static class HBaseReducer extends TableReducer<Text, NullWritable, ImmutableBytesWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            String[] fields = key.toString().split("\t");
            // Blank or truncated lines used to abort the whole job with an
            // ArrayIndexOutOfBoundsException; skip them and make the skips visible in a counter.
            if (fields.length < 3 || fields[0].isEmpty()) {
                context.getCounter("Hdfs2Hbase", "MALFORMED_LINES").increment(1);
                return;
            }
            int age;
            try {
                age = Integer.parseInt(fields[2]);
            } catch (NumberFormatException e) {
                context.getCounter("Hdfs2Hbase", "MALFORMED_LINES").increment(1);
                return;
            }
            byte[] rowKey = Bytes.toBytes(fields[0]);
            Put put = new Put(rowKey);
            put.addColumn(FAMILY, NAME, Bytes.toBytes(fields[1]));
            put.addColumn(FAMILY, AGE, Bytes.toBytes(age));
            context.write(new ImmutableBytesWritable(rowKey), put);
        }
    }
}
Third, read data from an HBase table and write it to HDFS
Fourth, bulk load data into HBase using bulkload
There are many ways to load data into HBase: we can use the HBase Java API, or use Sqoop to write or import our data into HBase. However, these approaches are either slow or consume Region resources during the import, which makes them inefficient. Alternatively, we can use an MR program to convert our data directly into HFile, the final storage format of HBase, and then load those files straight into HBase.
Under the HBase root directory (/hbase) on HDFS, each table has its own folder named after the table. Inside the table folder, each Region has its own folder; inside each Region folder, each column family has its own folder; and each column family folder holds a number of HFile files. HFile is the format in which HBase data is stored on HDFS, so the final form of HBase's data on HDFS is a set of HFiles. If we can convert our data directly into HFile format, HBase can load those HFile files directly.
Advantages:
1. The import process does not consume Region resources
2. Huge amounts of data can be imported quickly
3. It saves memory
The normal HBase data read/write flow
With bulkload, our data is generated directly in HFile format and then loaded directly into an HBase table
Requirement: convert the data file at the HDFS path /hbase/input/user.txt into HFile format, and then load it into the myuser2 table
1. Develop the MR program
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
/**
 * MapReduce job that converts the tab-separated text files under
 * {@code hdfs://node01:8020/hbase/input/} into HFiles for the HBase table {@code myuser2},
 * written to {@code hdfs://node01:8020/hbase/output_hfile}.
 *
 * <p>The generated HFiles still have to be loaded into the table afterwards.
 */
public class BulkData extends Configured implements Tool {

    // Encoded once with Bytes.toBytes (always UTF-8) instead of String.getBytes(), which
    // depends on the platform default charset.
    private static final byte[] FAMILY = Bytes.toBytes("f1");
    private static final byte[] NAME = Bytes.toBytes("name");
    private static final byte[] AGE = Bytes.toBytes("age");

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        // Previously referenced a non-existent class BulkLoadData; the tool is this class.
        System.exit(ToolRunner.run(conf, new BulkData(), args));
    }

    /**
     * Configures and runs the HFile-generation job.
     *
     * @param args command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf());
        job.setJarByClass(BulkData.class);
        job.setMapperClass(BulkLoadMap.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);

        TableName tableName = TableName.valueOf("myuser2");
        // The connection is only needed while the job is being configured, so close it
        // (and the table / region locator) as soon as configureIncrementalLoad returns.
        try (Connection connection = ConnectionFactory.createConnection(super.getConf());
             Table table = connection.getTable(tableName);
             RegionLocator regionLocator = connection.getRegionLocator(tableName)) {
            HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator);
        }

        FileInputFormat.addInputPath(job, new Path("hdfs://node01:8020/hbase/input/"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/hbase/output_hfile"));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Turns one input line ({@code rowkey \t name \t age}) into a {@link Put} on
     * {@code f1:name} and {@code f1:age}. The age is stored as the 4-byte encoding of an
     * {@code int}.
     */
    public static class BulkLoadMap extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            // Blank or truncated lines used to abort the whole job with an
            // ArrayIndexOutOfBoundsException; skip them and make the skips visible in a counter.
            if (fields.length < 3 || fields[0].isEmpty()) {
                context.getCounter("BulkData", "MALFORMED_LINES").increment(1);
                return;
            }
            int age;
            try {
                age = Integer.parseInt(fields[2]);
            } catch (NumberFormatException e) {
                context.getCounter("BulkData", "MALFORMED_LINES").increment(1);
                return;
            }
            byte[] rowKey = Bytes.toBytes(fields[0]);
            Put put = new Put(rowKey);
            put.addColumn(FAMILY, NAME, Bytes.toBytes(fields[1]));
            put.addColumn(FAMILY, AGE, Bytes.toBytes(age));
            context.write(new ImmutableBytesWritable(rowKey), put);
        }
    }
}
2. Package the code into a jar and then run it
yarn jar original-hbaseStudy-1.0-SNAPSHOT.jar cn.itcast.hbasemr.BulkData
3. Develop the code that loads the data
Load the HFile files under the output path into our HBase table
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
/**
 * Loads the HFiles under {@code hdfs://node01:8020/hbase/output_hfile} into the HBase table
 * {@code myuser2} using {@link LoadIncrementalHFiles}.
 */
public class LoadData {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.zookeeper.quorum", "node01,node02,node03");

        // The HFiles were generated for myuser2, so that is the table they must be loaded into
        // (this previously pointed at myuser).
        TableName tableName = TableName.valueOf("myuser2");

        // try-with-resources closes the connection and everything obtained from it,
        // even when the bulk load fails.
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin();
             Table table = connection.getTable(tableName);
             RegionLocator regionLocator = connection.getRegionLocator(tableName)) {
            LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
            load.doBulkLoad(
                    new Path("hdfs://node01:8020/hbase/output_hfile"), admin, table, regionLocator);
        }
    }
}
Alternatively, we can load the data from the command line
First add the HBase jars to Hadoop's classpath
export HBASE_HOME=/export/servers/hbase-1.2.0-cdh5.14.0/
export HADOOP_HOME=/export/servers/hadoop-2.6.0-cdh5.14.0/
export HADOOP_CLASSPATH=`${HBASE_HOME}/bin/hbase mapredcp`
Then run the following command to load the HFiles directly into the myuser2 table
yarn jar /export/servers/hbase-1.2.0-cdh5.14.0/lib/hbase-server-1.2.0-cdh5.14.0.jar completebulkload /hbase/output_hfile myuser2