A generic MapReduce job: importing data from one HBase table into another

1. Create two tables, table1 and table2

create 'table2',{NAME => 'cf1',VERSIONS => 3},{NAME => 'cf2',VERSIONS => 3}
create 'table1',{NAME => 'cf',VERSIONS => 3}
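
Optionally, you can confirm that the VERSIONS setting took effect by describing the tables in the HBase shell (output omitted here):

describe 'table1'
describe 'table2'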

2. Insert data into table1

hbase(main):026:0> put 'table1','1','cf:c1','31'
0 row(s) in 0.0320 seconds

hbase(main):027:0> put 'table1','1','cf:c1','30'
0 row(s) in 0.0100 seconds

hbase(main):028:0> put 'table1','1','cf:c2','male'
0 row(s) in 0.0080 seconds

hbase(main):029:0> put 'table1','2','cf:c1','31'
0 row(s) in 0.0080 seconds

hbase(main):030:0> put 'table1','2','cf:c2','female'
0 row(s) in 0.0090 seconds

hbase(main):031:0> put 'table1','3','cf:c1','28'
0 row(s) in 0.0110 seconds

hbase(main):032:0> put 'table1','3','cf:c2','female'
0 row(s) in 0.0170 seconds

hbase(main):033:0> put 'table1','4','cf:c1','29'
0 row(s) in 0.0090 seconds

hbase(main):034:0> put 'table1','4','cf:c2','male'
0 row(s) in 0.0140 seconds
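
Note that row 1 was given two values for cf:c1 (first 31, then 30). Because cf was created with VERSIONS => 3, both versions are kept in table1; a plain scan shows only the newest one, so to see every stored version you can request multiple versions explicitly (output omitted here):

scan 'table1', {VERSIONS => 3}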

3. Use a MapReduce program to import the data from table1 into table2

The code takes six parameters: the two table names (table1 and table2), the column family (fromFamily) and column qualifier (fromQualifier) to export from table1, and the column family (toFamily) and column qualifier (toQualifier) to import into table2. By passing these parameters you choose which column to export from the source table (fromtable) and which column to import into the target table (totable). In addition, setting the maximum number of versions on the Scan in the driver makes the job export every version of the data in table1, not just the newest one.
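
For reference, here is a minimal sketch of how the six parameters would line up if the job were submitted from the command line; this assumes main() read them from args instead of hard-coding them, and that the job jar is the hbase1.jar configured in the Driver below:

export HADOOP_CLASSPATH=$(hbase mapredcp)    # put the HBase client jars on the submitter's classpath
hadoop jar hbase1.jar demo.Driver table1 table2 cf c1 cf1 c3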

package demo;

import java.io.IOException;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;

public class H2HMapper extends TableMapper<ImmutableBytesWritable, Put> {

    private String fromFamily = null;
    private String fromQualifier = null;
    private String toFamily = null;
    private String toQualifier = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read the source/target column coordinates passed in through the job configuration
        fromFamily = context.getConfiguration().get("fromfamily");
        fromQualifier = context.getConfiguration().get("fromqualifier");
        toFamily = context.getConfiguration().get("tofamily");
        toQualifier = context.getConfiguration().get("toqualifier");
    }

    @Override
    protected void map(ImmutableBytesWritable rowkey, Result columns, Context context)
            throws IOException, InterruptedException {
        Put put = new Put(rowkey.get());
        for (Cell kv : columns.rawCells()) {
            System.out.println(Bytes.toStringBinary(kv.getQualifier()));
            // Only copy cells that belong to the selected source family:qualifier
            if (Bytes.toString(kv.getFamily()).equals(fromFamily)
                    && Bytes.toStringBinary(kv.getQualifier()).equals(fromQualifier)) {
                System.out.println("++++++++++++" + fromQualifier);
                // Write the value into the target family:qualifier of the destination table
                put.addColumn(Bytes.toBytes(toFamily), Bytes.toBytes(toQualifier), kv.getValue());
                context.write(rowkey, put);
            }
        }
    }
}

package demo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Driver extends Configured implements Tool {

    private static final String FROMTABLE = "fromtable";
    private static final String TOTABLE = "totable";

    @Override
    public int run(String[] args) throws Exception {
        byte[] family = Bytes.toBytes(args[2]);
        byte[] qualifier = Bytes.toBytes(args[3]);

        // Scan only the selected source column; setMaxVersions() makes the scan
        // return every stored version of the cell instead of just the newest one
        Scan scan = new Scan();
        scan.setMaxVersions();
        scan.addColumn(family, qualifier);

        Configuration conf = getConf();
        conf.set(FROMTABLE, args[0]);
        conf.set(TOTABLE, args[1]);
        conf.set("fromfamily", args[2]);
        conf.set("fromqualifier", args[3]);
        conf.set("tofamily", args[4]);
        conf.set("toqualifier", args[5]);

        String jobName = "From table " + args[0] + ", import to " + args[1];
        Job job = Job.getInstance(conf, jobName);
        job.setJarByClass(Driver.class);
        job.setNumReduceTasks(0);

        TableMapReduceUtil.initTableMapperJob(
                args[0],                       // source table
                scan,
                H2HMapper.class,
                ImmutableBytesWritable.class,
                Put.class,
                job);
        // Target table; no reducer class, the mapper's Puts go straight to TableOutputFormat
        TableMapReduceUtil.initTableReducerJob(args[1], null, job);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Arguments are hard-coded so the job can be submitted directly from the IDE:
        // source table, target table, source family, source qualifier, target family, target qualifier
        args = new String[]{
                "table1",
                "table2",
                "cf",
                "c1",
                "cf1",
                "c3"
        };
        ToolRunner.run(getConfiguration(), new Driver(), args);
    }

    private static Configuration configuration;

    public static Configuration getConfiguration() {
        if (configuration == null) {
            configuration = new Configuration();
            configuration.setBoolean("mapreduce.app-submission.cross-platform", true); // allow cross-platform job submission
            configuration.set("fs.defaultFS", "hdfs://master:8020");                    // NameNode
            configuration.set("mapreduce.framework.name", "yarn");                      // run on YARN
            configuration.set("yarn.resourcemanager.address", "master:8032");           // ResourceManager
            configuration.set("yarn.resourcemanager.scheduler.address", "master:8030"); // ResourceManager scheduler
            configuration.set("mapreduce.jobhistory.address", "master:10020");          // JobHistory server
            configuration.set("hbase.master", "master:16000");
            configuration.set("hbase.rootdir", "hdfs://master:8020/hbase");
            configuration.set("hbase.zookeeper.quorum", "slave1,slave2,slave3");
            configuration.set("hbase.zookeeper.property.clientPort", "2181");
            configuration.set("mapreduce.job.jar", "C:\\Users\\Administrator\\Desktop\\hbase1.jar"); // path of the job jar
        }
        return configuration;
    }
}

If the maximum number of versions is not set, i.e. the line scan.setMaxVersions() is commented out, then only the newest version of each cell in table1 is exported. From the data inserted into table1 in step 2, column family cf, qualifier c1 holds 5 cells in total (row 1 has two versions, 31 and 30), yet the job's map output record count is 4, which shows that the program did not export every version from table1:

Map-Reduce Framework
        Map input records=4
        Map output records=4
        Input split bytes=64
        Spilled Records=0
        Failed Shuffles=0
        Merged Map outputs=0
        GC time elapsed (ms)=169
        CPU time spent (ms)=2560
        Physical memory (bytes) snapshot=120504320
        Virtual memory (bytes) snapshot=848523264
        Total committed heap usage (bytes)=16130048

Looking at table2 at this point, it likewise contains only 4 cells:

hbase(main):081:0> scan 'table2'
ROW                                              COLUMN+CELL                                                                                                                                   
 1                                               column=cf1:c3, timestamp=1477596015451, value=30                                                                                              
 2                                               column=cf1:c3, timestamp=1477596015451, value=31                                                                                              
 3                                               column=cf1:c3, timestamp=1477596015451, value=28                                                                                              
 4                                               column=cf1:c3, timestamp=1477596015451, value=29                                                                                              
4 row(s) in 0.0670 seconds

If the maximum number of versions is set, i.e. the line scan.setMaxVersions() is kept, then every version in table1 is exported. This time the job's map output record count is 5, which shows that the program did export all versions from table1:

Map-Reduce Framework
        Map input records=4
        Map output records=5
        Input split bytes=64
        Spilled Records=0
        Failed Shuffles=0
        Merged Map outputs=0
        GC time elapsed (ms)=187
        CPU time spent (ms)=2500
        Physical memory (bytes) snapshot=122474496
        Virtual memory (bytes) snapshot=848461824
        Total committed heap usage (bytes)=16130048

Now let's look at the data in table2 again:

hbase(main):086:0> scan 'table2'
ROW                                              COLUMN+CELL                                                                                                                                   
 1                                               column=cf1:c3, timestamp=1477596557735, value=31                                                                                              
 2                                               column=cf1:c3, timestamp=1477596557735, value=31                                                                                              
 3                                               column=cf1:c3, timestamp=1477596557735, value=28                                                                                              
 4                                               column=cf1:c3, timestamp=1477596557735, value=29                                                                                              
4 row(s) in 0.0430 seconds

There are still only 4 cells. Why? My understanding is this: as the scan output shows, every cell in table2 has the same timestamp, and a cell is uniquely identified by row key + column + timestamp, so each such combination can store only one value. Because the Put does not carry the original timestamps, the versions copied from table1 all land on the same cell and overwrite each other, leaving table2 with only the newest version from table1. To make table2 receive all versions from table1, simply change

put.addColumn(Bytes.toBytes(toFamily), Bytes.toBytes(toQualifier), kv.getValue());

to

put.addColumn(Bytes.toBytes(toFamily), Bytes.toBytes(toQualifier), kv.getTimestamp(), kv.getValue());

so that each Put carries the original timestamp of the cell it was copied from, and every version from table1 is stored as its own cell version in table2.
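
After rerunning the job with this change, you can check that both versions of row 1 actually arrived in table2 by asking the shell for multiple versions; a plain scan would still display only the newest cell (output omitted here):

scan 'table2', {VERSIONS => 3}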



Reposted from blog.csdn.net/abc_321a/article/details/53234445