Spark Reading and Writing HBase (Windows Environment)

1. Environment

  1. Prepare the table and namespace in advance (see: https://blog.csdn.net/FBB360JAVA/article/details/103963765); a minimal setup sketch is given right after this list;
  2. Install the Scala plugin in IDEA;
  3. Add the dependencies.
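The article linked above creates the namespace and table through a small HBase DAO; as a rough, untested equivalent, the following Scala sketch does the same thing with the HBase 2.x Admin API. The object name CreateTestTable is mine; the host chost, namespace, table name and column family match what the examples below expect, so adjust them to your cluster.

import org.apache.hadoop.hbase.{HBaseConfiguration, NamespaceDescriptor, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

object CreateTestTable extends App {
  val conf = HBaseConfiguration.create()
  conf.set("hbase.zookeeper.quorum", "chost")
  conf.set("hbase.zookeeper.property.clientPort", "2181")

  val connection = ConnectionFactory.createConnection(conf)
  val admin = connection.getAdmin
  try {
    // Create the namespace if it is not there yet
    if (!admin.listNamespaceDescriptors().exists(_.getName == "feng_namespace_test")) {
      admin.createNamespace(NamespaceDescriptor.create("feng_namespace_test").build())
    }
    // Create the table with a single column family "person_test"
    val name = TableName.valueOf("feng_namespace_test:person_test_table")
    if (!admin.tableExists(name)) {
      admin.createTable(
        TableDescriptorBuilder.newBuilder(name)
          .setColumnFamily(ColumnFamilyDescriptorBuilder.of("person_test"))
          .build())
    }
  } finally {
    admin.close()
    connection.close()
  }
}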

2. Dependencies and jar files


2.1 pom file

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.feng</groupId>
    <artifactId>hbase-demo</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.target>1.8</maven.compiler.target>
        <maven.compiler.source>1.8</maven.compiler.source>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.1.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.4</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.6.7</version>
        </dependency>
    </dependencies>
</project>

Because of network problems on my side, the mapreduce dependency could not be added through Maven, so I used the jar file directly; readers can simply declare the dependency instead (see the snippet below).
Find the jar file in your local Maven repository and add it to the project:
org\apache\hbase\hbase-mapreduce\2.1.5\hbase-mapreduce-2.1.5.jar
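If Maven can resolve it in your environment, the corresponding dependency entry for the pom (coordinates taken from the jar path above) would be:

<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-mapreduce</artifactId>
    <version>2.1.5</version>
</dependency>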

3. SparkWrite.scala

package org.feng.spark

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by Feng on 2020/1/14 10:30
  * CurrentProject's name is hbase-demo
  * <pre>
  *   Table structure in HBase:
  *   Namespace: feng_namespace_test
  *   Table name: person_test_table
  *   Column family: person_test
  *   Column 1: name
  *   Column 2: age
  * </pre>
  * <p>
  *   com.fasterxml.jackson.databind.JsonMappingException: Incompatible Jackson version: 2.9.2
  *   That version is too high: fixed by pinning jackson-databind to 2.6.7 in the pom.
  * </p>
  */
object SparkWrite extends App {
  val sparkConf = new SparkConf()
    .setAppName("SparkWrite")
    .setMaster("local[2]")

  val sc = new SparkContext(sparkConf)

  val hbaseConf = HBaseConfiguration.create()
  // ZooKeeper client port
  hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
  // ZooKeeper quorum (host running ZooKeeper/HBase)
  hbaseConf.set("hbase.zookeeper.quorum", "chost")

  val tableName = "feng_namespace_test:person_test_table"
  val jobConf = new JobConf(hbaseConf)
  jobConf.setOutputFormat(classOf[TableOutputFormat])
  jobConf.set(TableOutputFormat.OUTPUT_TABLE, tableName)

  // Prepare the data: rowkey,name,age
  val inDataRDD = sc.makeRDD(Array("1001,marry,20", "1002,jack,23", "1003,tom,24"))
  val rdd = inDataRDD.map(_.split(',')).map(data => {
    val put = new Put(Bytes.toBytes(data(0)))
    // column family - column - value
    put.addColumn(Bytes.toBytes("person_test"), Bytes.toBytes("name"), Bytes.toBytes(data(1)))
    put.addColumn(Bytes.toBytes("person_test"), Bytes.toBytes("age"), Bytes.toBytes(data(2)))
    // Return a pair: the RDD must have type RDD[(ImmutableBytesWritable, Put)]
    // before saveAsHadoopDataset can be called on it
    (new ImmutableBytesWritable, put)
  })

  // Save to HBase
  rdd.saveAsHadoopDataset(jobConf)
  sc.stop()
}


3.1 Testing with Java

    @Test
    public void createTablePerson() throws IOException {
        dao.createNameSpace("feng_namespace_test");
        String tableName = "feng_namespace_test:person_test_table";
        dao.createTable(tableName, "person_test");
    }

    @Test
    public void scanTablePerson() throws IOException {
        String tableName = "feng_namespace_test:person_test_table";
        Assert.assertTrue(dao.tableExists(tableName));

        Assert.assertTrue(dao.selectRow(tableName, "1001").contains("marry"));
        Assert.assertTrue(dao.selectRow(tableName, "1002").contains("jack"));
        Assert.assertTrue(dao.selectRow(tableName, "1003").contains("tom"));
    }

4. SparkRead.scala

package org.feng.spark

import org.apache.hadoop.hbase.client.{ConnectionFactory, _}
import org.apache.hadoop.hbase.io._
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Created by Feng on 2020/1/14 11:50
  * Read data from an HBase table with Spark
  */
object SparkRead extends App {

  val sparkConf = new SparkConf()
    .setAppName("SparkRead")
    .setMaster("local[2]")

  val sc = new SparkContext(sparkConf)
  val tableName = "feng_namespace_test:person_test_table"

  val conf = HBaseConfiguration.create()
  conf.set("hbase.zookeeper.quorum","chost")
  // ZooKeeper client port, default is 2181
  conf.set("hbase.zookeeper.property.clientPort", "2181")
  conf.set(TableInputFormat.INPUT_TABLE, tableName)

  val connection = ConnectionFactory.createConnection(conf)
  val admin = connection.getAdmin
  val name = TableName.valueOf(tableName)
  // If the table does not exist
  if(!admin.tableExists(name)){
    // create the table
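    // Not in the original post: a minimal sketch that creates the missing table with
    // the column family used below (both builders are covered by the
    // org.apache.hadoop.hbase.client._ wildcard import above).
    admin.createTable(
      TableDescriptorBuilder.newBuilder(name)
        .setColumnFamily(ColumnFamilyDescriptorBuilder.of("person_test"))
        .build())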
  }

  // Read the data and convert it into an RDD
  val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
    classOf[ImmutableBytesWritable],
    classOf[Result])

  // Print the raw results (alternative, kept commented out)
/*  hBaseRDD.foreach{
    case (_, result) =>
    val key = Bytes.toString(result.getRow)
    val name = Bytes.toString(result.getValue("person_test".getBytes(),"name".getBytes()))
    val age = Bytes.toString(result.getValue("person_test".getBytes(),"age".getBytes()))
    println("Row key:"+key+"\tperson_test.Name:"+name+"\tperson_test.Age:"+age)
  }*/

  /* Wrap each row into a Person and print it:
  * [Person:{key = 1001, name = marry, age = 20}]
  * [Person:{key = 1002, name = jack, age = 23}]
  * [Person:{key = 1003, name = tom, age = 24}]
  * */
  hBaseRDD.map{
    case (_, result: Result) =>
      val key = Bytes.toString(result.getRow)
      val name = Bytes.toString(result.getValue("person_test".getBytes,"name".getBytes))
      val age = Bytes.toString(result.getValue("person_test".getBytes,"age".getBytes))
      Person(key, name, age.toInt)
  }.foreach(println)

  admin.close()
  sc.stop()
}

/**
  * Case class that wraps a row read from the table; toString is overridden.
  * @param key rowKey
  * @param name the person's name
  * @param age the person's age
  */
case class Person(key: String, name: String, age: Int) {
  override def toString: String = "[Person:{key = " + key + ", name = " + name + ", age = " + age + "}]"
}


