HBase and MapReduce Integration

HBase data is ultimately stored on HDFS, and HBase has natural support for MapReduce. We can process the data in HBase directly with MapReduce, and the MapReduce results can in turn be written straight back into HBase.

First, read the data from the myuser table and write it into another HBase table

Read data from one HBase table, then write it into another HBase table. Note: we can use TableMapper and TableReducer to read data from and write data to HBase.

Here we write the name and age fields of the f1 column family in the myuser table into the f1 column family of the myuser2 table.

1. Create the myuser2 table

hbase(main):010:0> create 'myuser2','f1'
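
If the source table myuser does not already contain data, a few rows can be inserted from the HBase shell for testing (the row keys and values below are only illustrative):

put 'myuser','0001','f1:name','zhangsan'
put 'myuser','0001','f1:age','18'
put 'myuser','0002','f1:name','lisi'
put 'myuser','0002','f1:age','25'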

2. Create a Maven project and import the required dependencies

<repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <dependencies>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0-mr1-cdh5.14.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.0-cdh5.14.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.0-cdh5.14.0</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.testng</groupId>
            <artifactId>testng</artifactId>
            <version>6.14.3</version>
            <scope>test</scope>
        </dependency>


    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                    <!--    <verbal>true</verbal>-->
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.2</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

3. Develop the MapReduce program

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class HBaseMR extends Configured implements Tool {

    public static class HBaseMapper extends TableMapper<Text,Put>{
        /**
         * @param key   the row key of the current row
         * @param value all the column values of the row, wrapped in a Result
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {

            String rowKey = Bytes.toString(key.get());
            Put put = new Put(key.get());
            Cell[] cells = value.rawCells();
            for (Cell cell : cells) {
                if ("f1".equals(Bytes.toString(CellUtil.cloneFamily(cell)))){
                    if ("name".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))){
                     put.add(cell);
                    }
                    if ("age".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))){
                     put.add(cell);
                    }
                }
            }
            if (!put.isEmpty()){
                context.write(new Text(rowKey),put);
            }
        }
    }

    public static class HBaseReducer extends TableReducer<Text,Put,ImmutableBytesWritable> {

        @Override
        protected void reduce(Text key, Iterable<Put> values, Context context) throws IOException, InterruptedException {
            for (Put value : values) {
                context.write(null,value);
            }
        }
    }

    @Override
    public int run(String[] args) throws Exception {

        Job job = Job.getInstance(super.getConf());
        job.setJarByClass(this.getClass());

        Scan scan = new Scan();
        // use the TableMapReduceUtil helper class to initialize our mapper, reading from the myuser table
        TableMapReduceUtil.initTableMapperJob(TableName.valueOf("myuser"),scan,HBaseMapper.class,Text.class,Put.class,job);
        // use the TableMapReduceUtil helper class to initialize our reducer, writing into the myuser2 table
        TableMapReduceUtil.initTableReducerJob("myuser2",HBaseReducer.class,job);

        job.setNumReduceTasks(1);
        return job.waitForCompletion(true)?0:1;

    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
        System.exit(ToolRunner.run(conf,new HBaseMR(),args));
    }

}

4. Package and run

Packaging requires the shade plugin so that the HBase jars the project depends on are bundled into the project jar.

Then execute

yarn jar hbaseStudy-1.0-SNAPSHOT.jar  cn.itcast.hbasemr.HBaseMR

Alternatively, we can set the environment variables ourselves and run the unshaded jar:

export HADOOP_HOME=/export/servers/hadoop-2.6.0-cdh5.14.0/

export HBASE_HOME=/export/servers/hbase-1.2.0-cdh5.14.0/

export HADOOP_CLASSPATH=`${HBASE_HOME}/bin/hbase mapredcp`

yarn jar original-hbaseStudy-1.0-SNAPSHOT.jar  cn.itcast.hbasemr.HBaseMR
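
After the job finishes, the copied rows can be checked quickly in the HBase shell (a simple verification step):

scan 'myuser2'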

Second, read files from HDFS and write the data into an HBase table

1. Prepare the data file and upload it to HDFS

hdfs dfs -mkdir -p /hbase/input

cd /export/servers/

vim user.txt

0007    zhangsan    18
0008    lisi    25
0009    wangwu    20
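
Then upload the file into the input directory created above:

hdfs dfs -put user.txt /hbase/input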

2. Develop the MapReduce program

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class Hdfs2Hbase extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf(), "hdfs2Hbase");
        job.setJarByClass(Hdfs2Hbase.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job,new Path("hdfs://node01:8020/hbase/input"));
        job.setMapperClass(HdfsMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        TableMapReduceUtil.initTableReducerJob("myuser2",HBaseReducer.class,job);
        job.setNumReduceTasks(1);
        boolean b = job.waitForCompletion(true);

        return b?0:1;
    }


    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
        int run = ToolRunner.run(conf, new Hdfs2Hbase(), args);
        System.exit(run);
    }


    public static class HdfsMapper extends Mapper<LongWritable,Text,Text,NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            context.write(value,NullWritable.get());
        }
    }

    public static class HBaseReducer extends TableReducer<Text,NullWritable,ImmutableBytesWritable> {

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            String[] split = key.toString().split("\t");
            Put put = new Put(Bytes.toBytes(split[0]));
            put.addColumn("f1".getBytes(),"name".getBytes(),split[1].getBytes());
            put.addColumn("f1".getBytes(),"age".getBytes(),Bytes.toBytes(Integer.parseInt(split[2])));
            context.write(new ImmutableBytesWritable(Bytes.toBytes(split[0])),put);
        }
    }

}

Third, read data from an HBase table and write it to HDFS
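
The following is a minimal sketch of one way to implement this, reusing the myuser table and f1 column family from above and assuming that name and age were stored as strings; the output path /hbase/output is just an example. A TableMapper reads each row, and a map-only job writes tab-separated lines to HDFS.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class Hbase2Hdfs extends Configured implements Tool {

    public static class HBaseSourceMapper extends TableMapper<Text, NullWritable> {
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
            // build a tab-separated line: rowkey, f1:name, f1:age (assumes string-encoded values)
            String name = "";
            String age = "";
            for (Cell cell : value.rawCells()) {
                String qualifier = Bytes.toString(CellUtil.cloneQualifier(cell));
                if ("name".equals(qualifier)) {
                    name = Bytes.toString(CellUtil.cloneValue(cell));
                } else if ("age".equals(qualifier)) {
                    age = Bytes.toString(CellUtil.cloneValue(cell));
                }
            }
            context.write(new Text(Bytes.toString(key.get()) + "\t" + name + "\t" + age), NullWritable.get());
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf(), "hbase2Hdfs");
        job.setJarByClass(Hbase2Hdfs.class);
        // read from the myuser table with a TableMapper
        Scan scan = new Scan();
        TableMapReduceUtil.initTableMapperJob(TableName.valueOf("myuser"), scan, HBaseSourceMapper.class, Text.class, NullWritable.class, job);
        // map-only job: write the lines straight to HDFS (the output directory must not exist yet)
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/hbase/output"));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        System.exit(ToolRunner.run(conf, new Hbase2Hdfs(), args));
    }
}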

Fourth, bulk load data into HBase with BulkLoad

There are many ways to load data into HBase. We can use the HBase Java API, or use Sqoop to write or import our data, but these approaches occupy Region resources during the import and are slow, so the overall efficiency is low. Alternatively, we can use a MapReduce program to convert our data directly into HFile, HBase's final storage format, and then load the HFiles straight into HBase.

Under HBase's root directory on HDFS (/hbase), every table has a directory named after the table (in HBase 1.x these live under /hbase/data/<namespace>/); inside each table directory, every Region is stored in its own directory; under each Region directory, every column family again has its own directory; and the HFile files are stored under each column family directory. HFile is the format in which HBase stores its data on HDFS, so on HDFS an HBase table ultimately takes the form of HFiles. If we can convert our data directly into HFile format, then HBase can load those HFiles directly.
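
For example, the on-disk layout of the myuser2 table can be inspected on HDFS like this (the path assumes the default namespace and the HBase 1.x directory layout):

hdfs dfs -ls -R /hbase/data/default/myuser2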

Advantages:

1. The import process does not occupy Region resources.

2. Huge amounts of data can be imported quickly.

3. It saves memory.

(Figure: the normal HBase data read/write path)

With BulkLoad, our data is generated directly in HFile format and then loaded straight into an HBase table, bypassing the normal write path.

(Figure: the BulkLoad data flow)

Requirement: take the data file at /hbase/input/user.txt on HDFS, convert it into HFile format, and then load it into the myuser2 table.

1. Develop the MapReduce program

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class BulkLoadData extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
        System.exit(ToolRunner.run(conf,new BulkLoadData(),args));
    }

    @Override
    public int run(String[] args) throws Exception {

        Job job = Job.getInstance(super.getConf());
        Connection connection = ConnectionFactory.createConnection(super.getConf());
        Table connectionTable = connection.getTable(TableName.valueOf("myuser2"));
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf("myuser2"));
        job.setJarByClass(BulkLoadData.class);
        job.setMapperClass(BulkLoadMap.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        HFileOutputFormat2.configureIncrementalLoad(job,connectionTable,regionLocator);
        FileInputFormat.addInputPath(job,new Path("hdfs://node01:8020/hbase/input/"));
        FileOutputFormat.setOutputPath(job,new Path("hdfs://node01:8020/hbase/output_hfile"));
        return job.waitForCompletion(true)?0:1;

    }

    public static class BulkLoadMap extends Mapper<LongWritable,Text,ImmutableBytesWritable,Put> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String[] split = value.toString().split("\t");
            Put put = new Put(Bytes.toBytes(split[0]));
            put.addColumn("f1".getBytes(),"name".getBytes(),split[1].getBytes());
            put.addColumn("f1".getBytes(),"age".getBytes(),Bytes.toBytes(Integer.valueOf(split[2])));
            context.write(new ImmutableBytesWritable(Bytes.toBytes(split[0])),put);
        }

    }

}

2. Package the code into a jar and run it

yarn jar original-hbaseStudy-1.0-SNAPSHOT.jar  cn.itcast.hbasemr.BulkLoadData

3. Develop the code to load the data

Load the HFile files under the output path into our HBase table.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class LoadData {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.property.clientPort","2181");
        conf.set("hbase.zookeeper.quorum","node01,node02,node03");
        Connection connection = ConnectionFactory.createConnection(conf);
        Admin admin = connection.getAdmin();
        Table connectionTable = connection.getTable(TableName.valueOf("myuser2"));
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf("myuser2"));
        LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
        load.doBulkLoad(new Path("hdfs://node01:8020/hbase/output_hfile"),admin,connectionTable,regionLocator);
        connectionTable.close();
        connection.close();

    }
}

Alternatively, we can load the data from the command line.

First, add the HBase jars to Hadoop's classpath:

export HBASE_HOME=/export/servers/hbase-1.2.0-cdh5.14.0/

export HADOOP_HOME=/export/servers/hadoop-2.6.0-cdh5.14.0/

export HADOOP_CLASSPATH=`${HBASE_HOME}/bin/hbase mapredcp`

Then execute the following command to load the HFiles directly into the myuser2 table:

yarn jar /export/servers/hbase-1.2.0-cdh5.14.0/lib/hbase-server-1.2.0-cdh5.14.0.jar completebulkload /hbase/output_hfile myuser2
