Operating HBase with Spark

Writing to HBase from Spark

  To write data to HBase from Spark, we use the PairRDDFunctions.saveAsHadoopDataset method.

package cn.com.win

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.log4j.Logger
import org.apache.spark.{SparkConf, SparkContext}

object TestHbase {

  def main(args: Array[String]) {

    val log = Logger.getLogger("TestHbase")
    // Initialize Spark
    val conf = new SparkConf().setMaster("local[2]").setAppName("testHbase")
    val sc = new SparkContext(conf)

    // Define the HBase configuration
    val hconf = HBaseConfiguration.create()
    val jobConf = new JobConf(hconf, this.getClass)

    // Specify the output format and the output table
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, "wifiTarget")

    val arr = Array(("tjiloaB#3#20190520", 10, 11), ("tjiloaB#3#20190521", 12, 22), ("tjiloaB#3#20190522", 13, 42))
    val rdd = sc.parallelize(arr)
    val localData = rdd.map(convert)
    localData.saveAsHadoopDataset(jobConf)
    sc.stop()
  }

  // Convert a (rowKey, inNum, outNum) triple into an (ImmutableBytesWritable, Put) pair
  def convert(triple: (String, Int, Int)) = {
    val p = new Put(Bytes.toBytes(triple._1))
    p.addColumn(Bytes.toBytes("wifiTargetCF"), Bytes.toBytes("inNum"), Bytes.toBytes(triple._2))
    p.addColumn(Bytes.toBytes("wifiTargetCF"), Bytes.toBytes("outNum"), Bytes.toBytes(triple._3))
    (new ImmutableBytesWritable, p)
  }
}
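Note that saveAsHadoopDataset (through TableOutputFormat) writes into an existing table; it does not create one. The following is a minimal sketch, not from the original post, of pre-creating the wifiTarget table and its wifiTargetCF column family with the HBase 1.x admin API (HTableDescriptor and HColumnDescriptor are deprecated in HBase 2.x):

import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}

object CreateWifiTargetTable {
  def main(args: Array[String]): Unit = {
    // Connection settings are picked up from hbase-site.xml on the classpath
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val admin = conn.getAdmin
    val table = TableName.valueOf("wifiTarget")
    // Create the table with the wifiTargetCF column family if it does not exist yet
    if (!admin.tableExists(table)) {
      val desc = new HTableDescriptor(table)
      desc.addFamily(new HColumnDescriptor("wifiTargetCF"))
      admin.createTable(desc)
    }
    admin.close()
    conn.close()
  }
}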

  Run result:

Reading HBase from Spark

  To read from HBase, we mainly use the newAPIHadoopRDD API provided by SparkContext, which loads an HBase table into Spark as an RDD.

Reading specified columns:

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.filter.PrefixFilter
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * AUTHOR Guozy
  * DATE   2020/2/7-0:33
  **/
object TestHbase2 {
  def main(args: Array[String]): Unit = {

    // Initialize Spark
    val conf = new SparkConf().setMaster("local[2]").setAppName("testHbase")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)

    val scan = new Scan()
    val filter = new PrefixFilter("tjiloaB#3#20190520".getBytes())
    scan.setFilter(filter)
    val hconf = HBaseConfiguration.create()
    hconf.set(TableInputFormat.INPUT_TABLE, "wifiTarget")
    hconf.set(TableInputFormat.SCAN, convertScanToString(scan))

    val dataRdd = sc.newAPIHadoopRDD(hconf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    val count = dataRdd.count()
    println("dataRdd Count is " + count)
    dataRdd.cache()

    dataRdd.map(_._2).filter(!_.isEmpty).take(20).foreach { result =>
      val key = Bytes.toString(result.getRow)
      val innum = Bytes.toInt(result.getValue(Bytes.toBytes("wifiTargetCF"), Bytes.toBytes("inNum")))
      val outnum = Bytes.toInt(result.getValue(Bytes.toBytes("wifiTargetCF"), Bytes.toBytes("outNum")))
      println(s"key:${key},inNum:${innum},outNum:${outnum}")
    }
    sc.stop()
  }

  /**
    * Scan to convert to a String
    */
  def convertScanToString(scan: Scan): String = {
    val proto = ProtobufUtil.toScan(scan)
    Base64.encodeBytes(proto.toByteArray())
  }
}

Run result:
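Beyond printing, the (ImmutableBytesWritable, Result) pairs in dataRdd can be turned into a plain RDD of tuples for further processing. A small sketch under the same assumptions as the example above (the wifiTargets name and the totals below are illustrative, not part of the original post):

    // Convert each non-empty HBase Result into a (rowKey, inNum, outNum) tuple
    val wifiTargets = dataRdd.filter(!_._2.isEmpty).map { case (_, result) =>
      val key = Bytes.toString(result.getRow)
      val inNum = Bytes.toInt(result.getValue(Bytes.toBytes("wifiTargetCF"), Bytes.toBytes("inNum")))
      val outNum = Bytes.toInt(result.getValue(Bytes.toBytes("wifiTargetCF"), Bytes.toBytes("outNum")))
      (key, inNum, outNum)
    }
    // Sum the in/out counts over all rows matched by the prefix filter
    val (totalIn, totalOut) = wifiTargets.map(t => (t._2, t._3)).reduce((a, b) => (a._1 + b._1, a._2 + b._2))
    println(s"totalIn:${totalIn},totalOut:${totalOut}")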

Looping through all columns:

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.filter.PrefixFilter
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * AUTHOR Guozy
  * DATE   2020/2/7-0:33
  **/
object TestHbase2 {
  def main(args: Array[String]): Unit = {

    // Initialize Spark
    val conf = new SparkConf().setMaster("local[2]").setAppName("testHbase")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)

    val scan = new Scan()
    val filter = new PrefixFilter("tjiloaB#3#20190520".getBytes())
    scan.setFilter(filter)
    val hconf = HBaseConfiguration.create()
    hconf.set(TableInputFormat.INPUT_TABLE, "wifiTarget")
    hconf.set(TableInputFormat.SCAN, convertScanToString(scan))

    val dataRdd = sc.newAPIHadoopRDD(hconf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    val count = dataRdd.count()
    println("dataRdd Count is " + count)
    dataRdd.cache()

    dataRdd.map(_._2).filter(!_.isEmpty).take(20).foreach { result =>
      val key = Bytes.toString(result.getRow)
      val cells = result.listCells().iterator()
      while (cells.hasNext) {
        val cell = cells.next()
        // Read the qualifier name and integer value directly from the current cell
        val qualifier = Bytes.toString(cell.getQualifierArray, cell.getQualifierOffset, cell.getQualifierLength)
        val value = Bytes.toInt(cell.getValueArray, cell.getValueOffset, cell.getValueLength)
        println(s"key:${key},${qualifier}:${value}")
      }
    }
    sc.stop()
  }

  /**
    * Scan to convert to a String
    */
  def convertScanToString(scan: Scan): String = {
    val proto = ProtobufUtil.toScan(scan)
    Base64.encodeBytes(proto.toByteArray())
  }
}

Run result:

Note: when importing, the TableInputFormat used here lives in the org.apache.hadoop.hbase.mapreduce package, not in org.apache.hadoop.hbase.mapred.
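For reference, this is the import that matches sc.newAPIHadoopRDD in the examples above:

// new-API input format; the org.apache.hadoop.hbase.mapred version does not satisfy newAPIHadoopRDD's type bound
import org.apache.hadoop.hbase.mapreduce.TableInputFormat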

 


Origin www.cnblogs.com/Gxiaobai/p/12275218.html