Reading HBase with Spark, registering a temporary table, and querying with Spark SQL


Reference:

A Scala Spark SQL program that reads an HBase table, registers it as a table, and queries it with SQL
https://blog.csdn.net/qq_21383435/article/details/77328805

Approach 1:

Implemented with the newAPIHadoopRDD method.

package com.zhbr.mainClass

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, RowFactory, SparkSession}
import org.apache.spark.sql.types.{DataTypes, StructField}
import scala.collection.mutable.ListBuffer

// dlzzBean case class
case class dlzzBean(P1: Double, P2: Double)

object Test {

  def main(args: Array[String]): Unit = {

    // Build the SparkSession
    val sparkSession = SparkSession.builder().appName(this.getClass.getSimpleName.filter(!_.equals('$'))).getOrCreate()

    // Get the SparkContext
    val sparkContext = sparkSession.sparkContext

    // Set the log level
    sparkContext.setLogLevel("WARN")

    // Specify the HBase table
    val tableName = "DWD_AMR_GS_METER-E-CURVE_201902"

    // Build the HBaseConfiguration
    val hbaseConf = HBaseConfiguration.create()
    // Set the ZooKeeper quorum addresses
    hbaseConf.set("hbase.zookeeper.quorum","10.213.111.XXX,10.213.111.XXX,10.213.111.XXX")
    // Set the ZooKeeper client port
    hbaseConf.set("hbase.zookeeper.property.clientPort","2181")
    // Set the table to read
    hbaseConf.set(TableInputFormat.INPUT_TABLE,tableName)

    // Create the Scan and add the columns to read
    val scan = new Scan()
    scan.addColumn(Bytes.toBytes("DATA"),Bytes.toBytes("meterID-1"))
    scan.addColumn(Bytes.toBytes("DATA"),Bytes.toBytes("meterID-2"))

    // Serialize the Scan into a Base64-encoded string and pass it to the configuration
    val proto = ProtobufUtil.toScan(scan)
    val ScanToString = Base64.encodeBytes(proto.toByteArray())
    hbaseConf.set(TableInputFormat.SCAN, ScanToString)

    // Read the table as an RDD of (ImmutableBytesWritable, Result)
    val hbaseRDD = sparkContext.newAPIHadoopRDD(hbaseConf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

	//-------------------------- Converting to a DataFrame (option 1) ------------------------------
    // Map each Result to a Row (the cell values are stored as Double bytes)
    val valueRDD: RDD[Row] = hbaseRDD.map(result => {
      val P1 = Bytes.toDouble(result._2.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-1")))
      val P2 = Bytes.toDouble(result._2.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-2")))
      RowFactory.create(Double.box(P1), Double.box(P2))
    })

    // Build the schema and convert to a DataFrame
    val structFields = new ListBuffer[StructField]()
    structFields.append(DataTypes.createStructField("meterID-1", DataTypes.DoubleType, true))
    structFields.append(DataTypes.createStructField("meterID-2", DataTypes.DoubleType, true))
    val schema = DataTypes.createStructType(structFields.toArray)
    val dataDF1 = sparkSession.createDataFrame(valueRDD, schema)

	//-------------------------- Converting to a DataFrame (option 2) ------------------------------
    // Map each Result to the dlzzBean case class
    val beanRDD: RDD[dlzzBean] = hbaseRDD.map(result => {
      val P1 = Bytes.toDouble(result._2.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-1")))
      val P2 = Bytes.toDouble(result._2.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-2")))
      dlzzBean(P1, P2)
    })

	// Convert to a DataFrame (column names come from the case class fields)
	val dataDF2 = sparkSession.createDataFrame(beanRDD)

	//-------------------------- Converting to a DataFrame (option 3) ------------------------------
    // Map each Result to a tuple and convert directly to a DataFrame via toDF
	import sparkSession.implicits._
	val dataDF3 = hbaseRDD.map(result => (
            Bytes.toDouble(result._2.getValue(Bytes.toBytes("DATA"),Bytes.toBytes("meterID-1"))),
            Bytes.toDouble(result._2.getValue(Bytes.toBytes("DATA"),Bytes.toBytes("meterID-2")))
        )).toDF("meterID-1","meterID-2")
	
	//-------------------------- Querying with Spark SQL -------------------------------------------
    // Register a temporary view (any one of the three DataFrames above will do; option 1 is used here)
    dataDF1.createTempView("DLZZ")
    sparkSession.sql("select * from DLZZ").show()

    // Stop the SparkSession
    sparkSession.stop()
  }
}
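
With the DLZZ view registered, the same session can run whatever SQL the analysis needs, not only select *. A minimal sketch of an aggregation query, assuming the columns were registered as Doubles as in the code above; the backticks are required because the column names contain a hyphen:

    // Hypothetical aggregation over the registered DLZZ view.
    // Column names with a hyphen must be quoted with backticks in Spark SQL.
    val statsDF = sparkSession.sql(
      """SELECT avg(`meterID-1`) AS avg_meter1,
        |       max(`meterID-2`) AS max_meter2
        |  FROM DLZZ""".stripMargin)
    statsDF.show()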

Approach 2:

Have the SparkSession read JSON: scan HBase with the client API, turn each row into a JSON string, and read those strings into a DataFrame.

Custom utility class:

package com.zhbr.hbase.utils

import org.apache.hadoop.hbase.{Cell, CellUtil, HBaseConfiguration, HConstants, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Scan, Table}
import org.apache.hadoop.hbase.filter.Filter
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.mutable
import scala.collection.mutable.ListBuffer

object HBaseClient {
  /**
    * Scan HBase and return the results
    * @param tableName table name
    * @param filter filter condition
    * @param startRow start row key
    * @param stopRow stop row key
    * @return scan results
    */
  def scan(tableName: String, filter: Filter, startRow: String, stopRow: String): List[Map[String, String]] = {
    val s = buildScan(filter, startRow, stopRow)
    val t = getConnection.getTable(TableName.valueOf(tableName))
    scanTable(t, s)
  }

  /**
    * Get a connection
    */
  private def getConnection: Connection = {
    val conf = HBaseConfiguration.create()
    conf.set(HConstants.ZOOKEEPER_QUORUM, "10.213.111.XXX,10.213.111.XXX,10.213.111.XXX")
    conf.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181")
    ConnectionFactory.createConnection(conf)
  }

  /**
    * Run the scan
    * @param table table
    * @param scan scan
    */
  private def scanTable(table: Table, scan: Scan): List[Map[String, String]] = {
    val scanner = table.getScanner(scan)
    val ite = scanner.iterator()
    val result = new ListBuffer[Map[String, String]]
    import scala.collection.JavaConversions._
    while (ite.hasNext){
      val map = new mutable.ListMap[String, String]
      val cells = ite.next().listCells()
      for (cell <- cells){
        map += readCell(cell)
      }
      result += map.toMap
    }
    result.toList
  }

  /**
    * Read a cell
    * @param cell cell
    */
  private def readCell(cell: Cell) = {
    val qualifier = Bytes.toString(CellUtil.cloneQualifier(cell))
    val value = Bytes.toString(CellUtil.cloneValue(cell))
    (qualifier, value)
  }

  /**
    * Build the Scan instance
    * @param filter filter condition
    * @param startRow start row key
    * @param stopRow stop row key
    */
  private def buildScan(filter: Filter, startRow: String, stopRow: String): Scan ={
    val scan = new Scan()
    scan.setMaxVersions()
    scan.setCaching(2000)
    scan.setCacheBlocks(false)
    scan.addColumn(Bytes.toBytes("DATA"),Bytes.toBytes("meterID-1"))
    scan.addColumn(Bytes.toBytes("DATA"),Bytes.toBytes("meterID-2"))
    if(filter != null)
      scan.setFilter(filter)
    if(startRow != null)
      scan.setStartRow(Bytes.toBytes(startRow))
    if(stopRow != null)
      scan.setStopRow(Bytes.toBytes(stopRow))
    scan
  }
}
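
For reference, a minimal standalone usage sketch of HBaseClient.scan; the row-key range is purely illustrative, and each returned element maps a column qualifier to its string value:

    // Hypothetical call; "row-000" and "row-999" are placeholder row keys, the filter is omitted.
    val rows: List[Map[String, String]] = HBaseClient.scan("DWD_AMR_GS_METER-E-CURVE_201902", null, "row-000", "row-999")
    rows.take(3).foreach(println)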

Main program:

package com.zhbr.mainClass

import com.zhbr.hbase.utils.HBaseClient
import org.apache.spark.sql.SparkSession
import scala.util.parsing.json.JSONObject

object DataProcessMain {

  def main(args: Array[String]): Unit = {

    // Build the SparkSession
    val sparkSession = SparkSession.builder().appName(this.getClass.getSimpleName.filter(!_.equals('$'))).getOrCreate()

    // Get the SparkContext
    val sparkContext = sparkSession.sparkContext

    // Set the log level
    sparkContext.setLogLevel("WARN")

    // Specify the HBase table
    val tableName = "DWD_AMR_GS_METER-E-CURVE_201902"

    // Scan the HBase table and collect the result rows
    val result = HBaseClient.scan(tableName, null, null, null)

    // Convert each result row into a JSON string
    val jsonString = result.map(e => JSONObject(e).toString())

    // Turn the JSON strings into an RDD
    val jsonRDD = sparkContext.parallelize(jsonString)

    // Read the RDD of JSON strings into a DataFrame
    val dlzzDF = sparkSession.read.json(jsonRDD)

    // Register a temporary view
    dlzzDF.createTempView("DLZZ")

    sparkSession.sql("select count(1) from DLZZ").show()

    // Stop the SparkSession
    sparkSession.stop()
  }
}
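
Because this route infers the schema from the JSON strings at read time, and every cell value passes through Bytes.toString in the utility, numeric columns may come back as strings. A minimal sketch for checking and casting, meant to sit right after dlzzDF is created in the main program above:

    // Inspect the inferred schema; cast explicitly if a column was inferred as string.
    dlzzDF.printSchema()
    dlzzDF.selectExpr("cast(`meterID-1` as double) AS meterID_1").show(5)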
