Spark Integration Components

Spark Writing Data to HBase

package spark.hbase


import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat

object _02SparkWriteHBase {

  def main(args: Array[String]): Unit = {

    // TODO: build the Spark configuration and context
    val sc: SparkContext = new SparkContext(new SparkConf().setAppName("SougoRecord").setMaster("local[2]"))
    // TODO: read the input file from the local filesystem (path left empty in the original post)
    val InputRDD = sc.textFile("", 2)
    // TODO: process the data with higher-order functions (word count)
    val resultRDD: RDD[(String, Int)] = InputRDD
      .filter(line => line != null && line.trim.length > 0)
      .flatMap(line => line.trim.split("\\s+"))
      .map(word => (word, 1))
      .reduceByKey((tmp, item) => tmp + item)
    resultRDD.foreach(println)

    // TODO: step 1. convert the RDD to RDD[(RowKey, Put)]
    /*
     * HBase table design:
     *   table name:       htb_wordcount
     *   RowKey:           word
     *   column family:    info
     *   column qualifier: count
     * Created in the HBase shell with: create 'htb_wordcount', 'info'
     */
    val putsRDD: RDD[(ImmutableBytesWritable, Put)] = resultRDD
      .map { case (word, count) =>
        val rowkey = new ImmutableBytesWritable(Bytes.toBytes(word))
        // TODO: build the Put object (count is stored as a readable string)
        val put = new Put(rowkey.get())
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(count.toString))
        rowkey -> put
      }
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "node1.itcast.cn")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("zookeeper.znode.parent", "/hbase")
    conf.set(TableOutputFormat.OUTPUT_TABLE, "htb_wordcount")

    // TODO: step 2. save the (RowKey, Put) pairs to HBase via TableOutputFormat
    putsRDD.saveAsNewAPIHadoopFile(
      "datas/hbase/htb_wordcount/", // the output path is required by the API but not used by TableOutputFormat
      classOf[ImmutableBytesWritable],
      classOf[Put],
      classOf[TableOutputFormat[ImmutableBytesWritable]],
      conf
    )

    sc.stop()

  }
}
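
As a side note, the same write can also be expressed with saveAsNewAPIHadoopDataset, which drops the unused output-path argument; the sketch below simply reuses the putsRDD and conf values built above:

import org.apache.hadoop.mapreduce.Job

// Configure the output format on a Hadoop Job instead of passing a dummy path
val job = Job.getInstance(conf)
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Put])
putsRDD.saveAsNewAPIHadoopDataset(job.getConfiguration)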

Spark Reading Data from HBase

package cn.itcast.spark.hbase

import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Custom external HBase data source implementing both data reading and writing.
 */
object _05SparkHBaseTest {

	def main(args: Array[String]): Unit = {

		// Create the SparkSession instance
		val spark: SparkSession = SparkSession.builder()
			.appName(this.getClass.getSimpleName.stripSuffix("$"))
			.master("local[2]")
			.config("spark.sql.shuffle.partitions", "2")
			.getOrCreate()
		import spark.implicits._
		
		// Load data from the HBase table
		val hbaseDF: DataFrame = spark.read
			.format("hbase")
			.option("zkHosts", "node1.itcast.cn")
			.option("zkPort", "2181")
			.option("hbaseTable", "stus")
			.option("family", "info")
			.option("selectFields", "name,age")
			.load()
		// With this self-implemented data source, every field read from the HBase table comes back as String
		hbaseDF.printSchema()
		hbaseDF.show(10, truncate = false)
		
		// Application finished, release resources
		spark.stop()
	}
	
}
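
Because every column comes back as String, a cast is usually needed before doing any numeric work on the loaded DataFrame. A minimal sketch, assuming the name and age fields selected above and reusing hbaseDF and the implicits from the code:

// Cast the String-typed age column to Int before any numeric analysis
val stusDF: DataFrame = hbaseDF.select(
	$"name",
	$"age".cast("int").as("age")
)
stusDF.printSchema()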

Spark Reading Data from Hive

package cn.itcast.spark.hive

import org.apache.spark.sql.SparkSession

/**
 * SparkSQL integrated with Hive: read a Hive table and analyze its data.
 */
object _04SparkSQLHiveTest {

	def main(args: Array[String]): Unit = {

		// TODO: integrate with Hive by setting the Hive MetaStore service address when building the SparkSession
		val spark: SparkSession = SparkSession.builder()
			.appName(this.getClass.getSimpleName.stripSuffix("$"))
			.master("local[2]")
			// Explicitly enable Hive integration
			.enableHiveSupport()
			// Set the Hive MetaStore service address
			.config("hive.metastore.uris", "thrift://node1.itcast.cn:9083")
			.getOrCreate()
		import spark.implicits._
		
		// Approach 1: analyze the data with the DataFrame DSL
		val empDF = spark.read
			.table("db_hive.emp")
		empDF.printSchema()
		empDF.show(10, truncate = false)
		
		println("==================================================")
		
		// Approach 2: write SQL directly
		spark.sql("select * from db_hive.emp").show()
		
		// Application finished, release resources
		spark.stop()
	}
	
}
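
The DSL branch above only prints the table; a short sketch of an actual DSL aggregation follows. It assumes the classic emp table layout with empno, deptno and sal columns, which the post does not confirm:

// Group employees by department and compute headcount and average salary
import org.apache.spark.sql.functions.{avg, count, round}

empDF
	.groupBy($"deptno")
	.agg(
		count($"empno").as("cnt"),
		round(avg($"sal"), 2).as("avg_sal")
	)
	.show()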

Spark Writing Data to Hive

    import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName.stripSuffix("$"))
      // set the number of local threads
      .master("local[2]")
      // integrate with Hive
      .enableHiveSupport()
      // set the Hive MetaStore service address
      .config("hive.metastore.uris", "thrift://node1.itcast.cn:9083")
      // dynamic partitioning requires non-strict mode
      .config("hive.exec.dynamic.partition.mode", "nonstrict")
      // when SparkSQL shuffles, the default partition count is 200; lower it for a local run
      .config("spark.sql.shuffle.partitions", "4")
      .getOrCreate()

    // read the JSON file
    val InputFrame: DataFrame = spark.read
      .option("inferSchema", "true")
      .json("dataset/pmt.json")

    // process the data into resultDF (the processing itself is omitted in the original post)
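    // The lines below are only a hypothetical sketch that produces a resultDF
    // carrying the "data_str" partition column used by the write below; the
    // actual derivation logic is an assumption, not part of the original post.
    import org.apache.spark.sql.functions.{current_timestamp, date_format}
    val resultDF: DataFrame = InputFrame
      .withColumn("data_str", date_format(current_timestamp(), "yyyy-MM-dd"))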

    // write the result into Hive
    resultDF
      .coalesce(1)
      .write
      .format("hive") // 指定为hive数据源,否则报错
      .mode(SaveMode.Append)
      .partitionBy("data_str")
      .saveAsTable("itcast_ads.pmt_ads_info")
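
After the write finishes, the result can be sanity-checked from the same SparkSession; SHOW PARTITIONS lists the partitions produced by the dynamic-partition write (a sketch reusing the spark session from the snippet above):

    // Verify the partitions and sample a few rows of the freshly written table
    spark.sql("SHOW PARTITIONS itcast_ads.pmt_ads_info").show(truncate = false)
    spark.sql("SELECT * FROM itcast_ads.pmt_ads_info LIMIT 10").show(truncate = false)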


Reprinted from: blog.csdn.net/qq_45769990/article/details/116480543