1. Environment
- Prepare the table and namespace in advance (for reference, see: https://blog.csdn.net/FBB360JAVA/article/details/103963765); the createTablePerson test in section 3.1 below does this.
- Install the Scala plugin in IDEA
- Add the dependencies
2. Dependencies and the jar file
2.1 The pom file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.feng</groupId>
    <artifactId>hbase-demo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.target>1.8</maven.compiler.target>
        <maven.compiler.source>1.8</maven.compiler.source>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.1.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.4</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.6.7</version>
        </dependency>
    </dependencies>
</project>
Because of network problems I could not get the hbase-mapreduce dependency to resolve, so I added the jar file directly; readers should be able to simply declare the dependency (see the snippet below).
Find the jar file in the local Maven repository and add it:
org\apache\hbase\hbase-mapreduce\2.1.5\hbase-mapreduce-2.1.5.jar
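If the download works for you, the equivalent declaration in the pom should be the following (same version as hbase-client; this snippet is added here for convenience and was not part of the original setup):
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-mapreduce</artifactId>
    <version>2.1.5</version>
</dependency>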
3. SparkWrite.scala
package org.feng.spark
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by Feng on 2020/1/14 10:30
 * Current project: hbase-demo
 * <pre>
 * Table structure in HBase:
 * Namespace:     feng_namespace_test
 * Table name:    person_test_table
 * Column family: person_test
 * Column 1:      name
 * Column 2:      age
 * </pre>
 * <p>
 * com.fasterxml.jackson.databind.JsonMappingException: Incompatible Jackson version: 2.9.2
 * The Jackson version on the classpath was too new, so jackson-databind is pinned to 2.6.7 in the pom.
 * </p>
 */
object SparkWrite extends App {
  val sparkConf = new SparkConf()
    .setAppName("SparkWrite")
    .setMaster("local[2]")
  val sc = new SparkContext(sparkConf)

  val hbaseConf = HBaseConfiguration.create()
  // ZooKeeper client port
  hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
  // ZooKeeper quorum
  hbaseConf.set("hbase.zookeeper.quorum", "chost")

  val tableName = "feng_namespace_test:person_test_table"
  val jobConf = new JobConf(hbaseConf)
  jobConf.setOutputFormat(classOf[TableOutputFormat])
  jobConf.set(TableOutputFormat.OUTPUT_TABLE, tableName)

  // Prepare the data
  val inDataRDD = sc.makeRDD(Array("1001,marry,20", "1002,jack,23", "1003,tom,24"))
  val rdd = inDataRDD.map(_.split(',')).map(data => {
    val put = new Put(Bytes.toBytes(data(0)))
    // column family - column - value
    put.addColumn(Bytes.toBytes("person_test"), Bytes.toBytes("name"), Bytes.toBytes(data(1)))
    put.addColumn(Bytes.toBytes("person_test"), Bytes.toBytes("age"), Bytes.toBytes(data(2)))
    // Return a pair: only an RDD[(ImmutableBytesWritable, Put)] can call saveAsHadoopDataset
    (new ImmutableBytesWritable, put)
  })

  // Save to HBase
  rdd.saveAsHadoopDataset(jobConf)
  sc.stop()
}
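For reference, the hbase-mapreduce artifact also ships a new-API TableOutputFormat (in the org.apache.hadoop.hbase.mapreduce package). A minimal sketch of the same write through that API, assuming the hbaseConf, tableName and rdd values from the code above (this variant is an addition, not part of the original post):
import org.apache.hadoop.hbase.mapreduce.{TableOutputFormat => NewTableOutputFormat}
import org.apache.hadoop.mapreduce.Job

// Carry the output configuration in a Job instead of a JobConf
val job = Job.getInstance(hbaseConf)
job.setOutputFormatClass(classOf[NewTableOutputFormat[ImmutableBytesWritable]])
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Put])
job.getConfiguration.set(NewTableOutputFormat.OUTPUT_TABLE, tableName)

// Same RDD[(ImmutableBytesWritable, Put)] as above, written through the new API
rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)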
3.1 Testing with Java
The JUnit tests below create the target table before the Spark job and verify the written rows afterwards (dao is an HBase DAO helper with createNameSpace, createTable, tableExists and selectRow methods, prepared in advance):
@Test
public void createTablePerson() throws IOException {
    dao.createNameSpace("feng_namespace_test");
    String tableName = "feng_namespace_test:person_test_table";
    dao.createTable(tableName, "person_test");
}

@Test
public void scanTablePerson() throws IOException {
    String tableName = "feng_namespace_test:person_test_table";
    Assert.assertTrue(dao.tableExists(tableName));
    Assert.assertTrue(dao.selectRow(tableName, "1001").contains("marry"));
    Assert.assertTrue(dao.selectRow(tableName, "1002").contains("jack"));
    Assert.assertTrue(dao.selectRow(tableName, "1003").contains("tom"));
}
4. SparkRead.scala
package org.feng.spark
import org.apache.hadoop.hbase.client.{ConnectionFactory, _}
import org.apache.hadoop.hbase.io._
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by Feng on 2020/1/14 11:50
 * Reads the data of an HBase table with Spark.
 */
object SparkRead extends App {
  val sparkConf = new SparkConf()
    .setAppName("SparkRead")
    .setMaster("local[2]")
  val sc = new SparkContext(sparkConf)

  val tableName = "feng_namespace_test:person_test_table"
  val conf = HBaseConfiguration.create()
  conf.set("hbase.zookeeper.quorum", "chost")
  // ZooKeeper client port, default 2181
  conf.set("hbase.zookeeper.property.clientPort", "2181")
  conf.set(TableInputFormat.INPUT_TABLE, tableName)

  val connection = ConnectionFactory.createConnection(conf)
  val admin = connection.getAdmin
  val name = TableName.valueOf(tableName)
  // If the table does not exist, create it
  if (!admin.tableExists(name)) {
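    // The branch body is left empty in the original post; a minimal sketch of
    // creating the table here (an addition, assuming the column family
    // "person_test" used throughout this post):
    val descriptor = TableDescriptorBuilder.newBuilder(name)
      .setColumnFamily(ColumnFamilyDescriptorBuilder.of("person_test"))
      .build()
    admin.createTable(descriptor)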
  }

  // Read the data and turn it into an RDD
  val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
    classOf[ImmutableBytesWritable],
    classOf[Result])

  // Print the raw results (alternative version, kept commented out)
  /* hBaseRDD.foreach{
    case (_, result) =>
      val key = Bytes.toString(result.getRow)
      val name = Bytes.toString(result.getValue("person_test".getBytes(),"name".getBytes()))
      val age = Bytes.toString(result.getValue("person_test".getBytes(),"age".getBytes()))
      println("Row key:"+key+"\tperson_test.Name:"+name+"\tperson_test.Age:"+age)
  }*/

  /* Wrap each row in a Person and print it; expected output:
   * [Person:{key = 1001, name = marry, age = 20}]
   * [Person:{key = 1002, name = jack, age = 23}]
   * [Person:{key = 1003, name = tom, age = 24}]
   */
  hBaseRDD.map{
    case (_, result: Result) =>
      val key = Bytes.toString(result.getRow)
      val name = Bytes.toString(result.getValue("person_test".getBytes, "name".getBytes))
      val age = Bytes.toString(result.getValue("person_test".getBytes, "age".getBytes))
      Person(key, name, age.toInt)
  }.foreach(println)

  admin.close()
  connection.close()
  sc.stop()
}
/**
 * Case class wrapping one row read from HBase; toString is overridden for printing.
 * @param key rowKey
 * @param name the person's name
 * @param age the person's age
 */
case class Person(key: String, name: String, age: Int) {
  override def toString: String = "[Person:{key = " + key + ", name = " + name + ", age = " + age + "}]"
}