一、背景说明
HBase是一个分布式的、面向列的开源NoSQL数据库,不同于传统关系型数据库,它在大数据量级下的性能表现堪称卓越。最近项目也在探索往HBase方向迁移,故首先整理了一份HBase入库效率方面的数据。
HBase入库手段有三种,但针对项目实际情况,我采用了其中两种(Java API和MapReduce)来进行入库操作,并进行比较。
二、测试环境
三台主机:一台master:192.168.13.74,两台slave(192.168.13.75/192.168.13.76)
Hadoop:Hadoop 2.6.0-cdh5.4.0
Hbase:HBase 1.0.0-cdh5.4.0
三、JavaAPI方式进行入库操作
1、新建java测试工程,新建测试类
2、导入相关jar包
3、新建测试类,通过HBase的API初始化连接
public static Configuration configuration; private static Admin admin = null; private static Random random = null;//生成主键使用 private static Connection connection = null; static { try { configuration = HBaseConfiguration.create(); configuration.set("hbase.zookeeper.quorum", "192.168.13.74"); configuration.set("hbase.zookeeper.property.clientPort", "2181"); connection = ConnectionFactory.createConnection(configuration); admin = connection.getAdmin(); random = new Random(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
4、增删改查操作
/** * 创建表 * * @param tableName */ public static void createTable(String tableName) { System.out.println("start create table ......"); TableName tn = TableName.valueOf(tableName); try { if (admin.tableExists(tn)) { admin.disableTable(tn); admin.deleteTable(tn); System.out.println(tableName + " is exist,detele...."); } HTableDescriptor hTableDescriptor = new HTableDescriptor(tn); hTableDescriptor.addFamily(new HColumnDescriptor("column1")); hTableDescriptor.addFamily(new HColumnDescriptor("column2")); hTableDescriptor.addFamily(new HColumnDescriptor("column3")); admin.createTable(hTableDescriptor); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("end create table ......"); } /** * 插入数据 * * @param tableName */ public static void insertData(String tableName) { // System.out.println("start insert data ......"); Table table = null; TableName tn = TableName.valueOf(tableName); try { table = connection.getTable(tn); // System.out.println("init insert data ......"); Put put = new Put(String.valueOf(random.nextLong()).getBytes());// 一个PUT代表一行数据,再NEW一个PUT表示第二行数据,每行一个唯一的ROWKEY,此处rowkey为put构造方法中传入的值 put.addColumn("column1".getBytes(), null, "ddd".getBytes());// 本行数据的第一列 put.addColumn("column2".getBytes(), null, "bbb".getBytes());// 本行数据的第三列 put.addColumn("column3".getBytes(), null, "ccc".getBytes());// 本行数据的第三列 // System.out.println("insert data ......"); table.put(put); // System.out.println("insert data over......"); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { try { if (table != null) table.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } // System.out.println("end insert data ......"); } /** * 删除一张表 * * @param tableName */ public static void dropTable(String tableName) { try { TableName tn = TableName.valueOf(tableName); admin.disableTable(tn); admin.deleteTable(tn); } catch (IOException e) { // TODO Auto-generated catch block 
e.printStackTrace(); } } /** * 根据 rowkey删除一条记录 * * @param tablename * @param rowkey */ public static void deleteRow(String tablename, String rowkey) { Table table = null; TableName tn = TableName.valueOf(tablename); try { table = connection.getTable(tn); List list = new ArrayList(); Delete d1 = new Delete(rowkey.getBytes()); list.add(d1); table.delete(list); System.out.println("删除行成功!"); } catch (IOException e) { e.printStackTrace(); } finally { try { if (table != null) table.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /** * 组合条件删除 * * @param tablename * @param rowkey */ public static void deleteByCondition(String tablename, String rowkey) { // 目前还没有发现有效的API能够实现 根据非rowkey的条件删除 这个功能能,还有清空表全部数据的API操作 } /** * 查询所有数据 * * @param tableName */ public static void QueryAll(String tableName) { Table table = null; TableName tn = TableName.valueOf(tableName); try { table = connection.getTable(tn); ResultScanner rs = table.getScanner(new Scan()); for (Result r : rs) { System.out.println("获得到rowkey:" + new String(r.getRow())); for (Cell cell : r.rawCells()) { System.out.println("列:" + new String(cell.getFamilyArray()) + "====值:" + new String(cell.getValueArray())); } } } catch (IOException e) { e.printStackTrace(); } finally { try { if (table != null) table.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /** * 单条件查询,根据rowkey查询唯一一条记录 * * @param tableName */ public static void QueryByCondition1(String tableName) { Table table = null; TableName tn = TableName.valueOf(tableName); try { table = connection.getTable(tn); Get scan = new Get("112233bbbcccc".getBytes());// 根据rowkey查询 Result r = table.get(scan); System.out.println("获得到rowkey:" + new String(r.getRow())); for (Cell cell : r.rawCells()) { System.out.println("列:" + new String(cell.getFamilyArray()) + "====值:" + new String(cell.getValueArray())); } } catch (IOException e) { e.printStackTrace(); } finally { try { if (table != 
null) table.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /** * 单条件按查询,查询多条记录 * * @param tableName */ public static void QueryByCondition2(String tableName) { Table table = null; TableName tn = TableName.valueOf(tableName); try { table = connection.getTable(tn); Filter filter = new SingleColumnValueFilter( Bytes.toBytes("column1"), null, CompareOp.EQUAL, Bytes.toBytes("ddd")); // 当列column1的值为ddd时进行查询 Scan s = new Scan(); s.setFilter(filter); ResultScanner rs = table.getScanner(s); for (Result r : rs) { System.out.println("获得到rowkey:" + new String(r.getRow())); for (Cell cell : r.rawCells()) { System.out.println("列:" + new String(cell.getFamilyArray()) + "====值:" + new String(cell.getValueArray())); } } } catch (Exception e) { e.printStackTrace(); } finally { try { if (table != null) table.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /** * 组合条件查询 * * @param tableName */ public static void QueryByCondition3(String tableName) { Table table = null; TableName tn = TableName.valueOf(tableName); try { table = connection.getTable(tn); List<Filter> filters = new ArrayList<Filter>(); Filter filter1 = new SingleColumnValueFilter( Bytes.toBytes("column1"), null, CompareOp.EQUAL, Bytes.toBytes("aaa")); filters.add(filter1); Filter filter2 = new SingleColumnValueFilter( Bytes.toBytes("column2"), null, CompareOp.EQUAL, Bytes.toBytes("bbb")); filters.add(filter2); Filter filter3 = new SingleColumnValueFilter( Bytes.toBytes("column3"), null, CompareOp.EQUAL, Bytes.toBytes("ccc")); filters.add(filter3); FilterList filterList1 = new FilterList(filters); Scan scan = new Scan(); scan.setFilter(filterList1); ResultScanner rs = table.getScanner(scan); for (Result r : rs) { System.out.println("获得到rowkey:" + new String(r.getRow())); for (Cell cell : r.rawCells()) { System.out.println("列:" + new String(cell.getFamilyArray()) + "====值:" + new String(cell.getValueArray())); } } rs.close(); } 
catch (Exception e) { e.printStackTrace(); } finally { try { if (table != null) table.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
其中,组合条件的删除操作,暂时没有相关API支持
5、以上是基于单线程的操作,通过Thread可以实现多线程并发操作
public static class ImportThread extends Thread { public void HandleThread() { // this.TableName = "T_TEST_1"; } // public void run() { try { InsertProcess("test"); } catch (IOException e) { e.printStackTrace(); } finally { System.gc(); } } } /* * 多线程环境下线程插入函数 */ public static void InsertProcess(String tableName) throws IOException { // System.out.println("start insert data ......"); Table table = null; TableName tn = TableName.valueOf(tableName); int count = 15000; long start = System.currentTimeMillis(); try { table = connection.getTable(tn); List<Put> list = new ArrayList<Put>(); Put put = null; for(int i=0;i<count;i++) { // System.out.println("init insert data ......"); put = new Put(String.valueOf(random.nextLong()).getBytes());// 一个PUT代表一行数据,再NEW一个PUT表示第二行数据,每行一个唯一的ROWKEY,此处rowkey为put构造方法中传入的值 put.addColumn("column1".getBytes(), null, "ddd".getBytes());// 本行数据的第一列 put.addColumn("column2".getBytes(), null, "bbb".getBytes());// 本行数据的第三列 put.addColumn("column3".getBytes(), null, "ccc".getBytes());// 本行数据的第三列 // System.out.println("insert data ......"); list.add(put); } table.put(list); long stop = System.currentTimeMillis(); System.out.println("线程:"+Thread.currentThread().getId()+"插入数据:"+count+"共耗时:"+ (stop - start)*1.0/1000+"s"); // System.out.println("insert data over......"); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { try { if (table != null) table.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /* * Mutil thread insert test */ public static void MultThreadInsert() throws InterruptedException { System.out.println("---------开始MultThreadInsert测试----------"); long start = System.currentTimeMillis(); int threadNumber = 5; Thread[] threads = new Thread[threadNumber]; for (int i = 0; i < threads.length; i++) { threads[i] = new ImportThread(); threads[i].start(); } for (int j = 0; j < threads.length; j++) { (threads[j]).join(); } long stop = 
System.currentTimeMillis(); System.out.println("MultThreadInsert:" + threadNumber * 10000 + "共耗时:" + (stop - start) * 1.0 / 1000 + "s"); System.out.println("---------结束MultThreadInsert测试----------"); }
6、基于以上程序,我们可以针对不同数量级和并发任务数的组合,来进行相关测试工作,测试结果如下:
从测试结果可以看出,JavaAPI方式调用的情况下,单线程入库速度为2000条/s~7000条/s之间,而在多线程并发状态下,最高速度能达到10900条/s,稍优于Mysql单节点的入库速度。但小数量级的入库速度,要慢于Mysql。波动幅度比较大。
注:在windows上用eclipse远程访问HDFS时,需要配置hosts文件,把HDFS所有主机的主机名与IP对应关系配置好,否则集群在通信时找不到主机:
192.168.13.74 traceMaster
192.168.13.75 traceSlave1
192.168.13.76 traceSlave2
下一节,我们再来尝试用MapReduce的方式来入库,看看效率是否能进一步提升