[Big Data] Spark reads data sources such as Hive/HBase/Elasticsearch/Kafka/MySQL

Spark reads the HiveServer2 data source

Environment information

  • HiveServer2 address
  • username
  • password

The code

def main(args: Array[String]): Unit = {

    val Array(url, database, table, username, password) = args

    val sparkConf = new SparkConf().setAppName("Spark Hive Demo (Scala)")

    val spark: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()

    // Read the Hive table through the HiveServer2 JDBC driver
    val rowDataset: Dataset[Row] = spark.read
      .format("jdbc")
      .option("url", url)
      .option("dbtable", s"$database.$table")
      .option("user", username)
      .option("password", password)
      .option("driver", "org.apache.hive.jdbc.HiveDriver")
      .load()
      .filter("`table_name.day`='20210112'")

    rowDataset.show()

    spark.stop()
}
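
If the job can reach the Hive metastore directly (the pom below already includes spark-hive), the same table can also be read with enableHiveSupport() instead of going through the HiveServer2 JDBC driver. A minimal sketch, where demo_db.demo_table and the day filter are placeholders:

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkHiveMetastoreDemo {

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("Spark Hive Metastore Demo (Scala)")

    // enableHiveSupport() requires spark-hive on the classpath and a reachable metastore
    val spark: SparkSession = SparkSession.builder()
      .config(sparkConf)
      .enableHiveSupport()
      .getOrCreate()

    // demo_db.demo_table and the day value are placeholders
    val df: DataFrame = spark.sql("SELECT * FROM demo_db.demo_table WHERE day = '20210112'")
    df.show()

    spark.stop()
  }
}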

Spark reads MySQL data

Environment information

  • MySQL address
  • username
  • password
  • database and table

The code

def main(args: Array[String]): Unit = {

    val Array(url, username, password, table) = args

    val sparkConf = new SparkConf().setAppName("Spark Mysql Demo (Scala)")

    val spark: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()

    val props = new Properties()
    props.setProperty("user", username)
    props.setProperty("password", password)
    val df: DataFrame = spark.read.jdbc(url, table, props)

    val rowNumbers: Long = df.count()
    println("Total number of rows: " + rowNumbers)

    // The columns passed to select are columns of the MySQL table
    df.select("id").where("id >= 3").show()
    // Write data back to MySQL
    // df.write.mode(SaveMode.Append).jdbc(url, "tb_02", props)

    spark.stop()
}
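
By default spark.read.jdbc pulls the whole table through a single connection. When the table has a numeric key, the overload with partitioning bounds splits the read across several parallel connections. A minimal sketch, assuming a numeric id column, placeholder bounds, and the MySQL JDBC driver on the classpath:

import java.util.Properties
import org.apache.spark.sql.{DataFrame, SparkSession}

// Minimal sketch of a partitioned JDBC read; the column name and bounds are placeholders
def readMysqlInParallel(spark: SparkSession, url: String, table: String,
                        username: String, password: String): DataFrame = {

  val props = new Properties()
  props.setProperty("user", username)
  props.setProperty("password", password)

  spark.read.jdbc(
    url,
    table,
    "id",       // partition column, assumed to exist and be numeric
    1L,         // lower bound of the partition column
    1000000L,   // upper bound of the partition column (placeholder)
    8,          // number of partitions, i.e. parallel JDBC connections
    props)
}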

Spark reads Kafka

Environment information

  • broker address
  • topic information
  • batch interval (seconds)
  • consumer group id

The code

  def main(args: Array[String]): Unit = {

    val Array(brokers, topics, interval, groupId) = args

    val sparkConf = new SparkConf().setAppName("Spark Kafka Demo (Scala)")
    val ssc = new StreamingContext(sparkConf, Seconds(interval.toInt))

    // Kafka parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Messages
    val messages = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(topics), kafkaParams)
    )

    // Word count
    val lines = messages.map(_.value)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
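
Because the pom also includes spark-sql-kafka-0-10, the same word count can be written with Structured Streaming instead of DStreams. A minimal sketch, taking the broker list and topic as arguments like the example above:

import org.apache.spark.sql.SparkSession

object StructuredKafkaWordCount {

  def main(args: Array[String]): Unit = {

    val Array(brokers, topics) = args

    val spark = SparkSession.builder().appName("Structured Kafka Demo (Scala)").getOrCreate()
    import spark.implicits._

    // Read Kafka records as a streaming Dataset; the value column arrives as binary
    val lines = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", brokers)
      .option("subscribe", topics)
      .option("startingOffsets", "earliest")
      .load()
      .selectExpr("CAST(value AS STRING)")
      .as[String]

    // Word count over the stream
    val wordCounts = lines.flatMap(_.split(" ")).groupBy("value").count()

    // Print the running counts to the console
    val query = wordCounts.writeStream
      .outputMode("complete")
      .format("console")
      .start()

    query.awaitTermination()
  }
}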

Spark reads HDFS data

Environment information

  • source address
  • number of partitions to read
  • target address

The code

  def main(args: Array[String]): Unit = {

    val Array(src, partition, dest) = args

    val sparkConf: SparkConf = new SparkConf().setAppName("Spark HDFS Demo (Scala)")

    // 1. Create the session
    val session: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()

    // 2. Create an RDD from the SparkContext
    val sc: SparkContext = session.sparkContext

    val file: RDD[String] = sc.textFile(src, partition.toInt)

    file.saveAsTextFile(dest)

    session.stop()
  }
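
The same copy can also be expressed with the DataFrame API, which makes it easy to change the output format. A minimal sketch, with src and dest as placeholder paths and Parquet chosen only for illustration:

import org.apache.spark.sql.SparkSession

object SparkHdfsParquetDemo {

  def main(args: Array[String]): Unit = {

    val Array(src, dest) = args

    val spark = SparkSession.builder().appName("Spark HDFS DataFrame Demo (Scala)").getOrCreate()

    // Read the text files as a single-column DataFrame (column name: "value")
    val df = spark.read.text(src)

    // Write the data back to HDFS as Parquet
    df.write.parquet(dest)

    spark.stop()
  }
}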

Spark reads the HBase data source

Environment information

  • ZooKeeper address
  • the rootDir configured for HBase (hbase.rootdir)
  • HBase master address
  • table name

The code

  def main(args: Array[String]): Unit = {

    val Array(zookeeper, rootdir, master, table) = args

    val sparkConf: SparkConf = new SparkConf().setAppName("Spark HBase Demo (Scala)")

    // Create the SparkSession
    val spark: SparkSession = SparkSession.builder()
      .config(sparkConf)
      .getOrCreate()

    val hbaseConfig: Configuration = HBaseConfiguration.create()
    hbaseConfig.set("hbase.zookeeper.quorum", zookeeper)
    hbaseConfig.set("hbase.rootdir", rootdir)
    hbaseConfig.set("hbase.master", master)

    // Set the table to scan
    hbaseConfig.set(TableInputFormat.INPUT_TABLE, table)

    val stuRDD: RDD[(ImmutableBytesWritable, Result)] = spark.sparkContext.newAPIHadoopRDD(
      hbaseConfig,
      classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    val count: Long = stuRDD.count()
    println("Students RDD Count:" + count)
    stuRDD.cache()

    // Iterate and print the row keys
    stuRDD.foreach {
      case (_, result) =>
        val key: String = Bytes.toString(result.getRow)
        println("Row key:" + key)
    }

    spark.stop()
  }
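
Besides the row key, individual cells can be pulled out of each Result. A minimal sketch, where the column family "info" and qualifier "name" are placeholders for whatever the scanned table actually contains:

import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD

// Minimal sketch: read one cell per row; "info"/"name" are a placeholder family/qualifier
def printNameColumn(rdd: RDD[(ImmutableBytesWritable, Result)]): Unit = {
  rdd.foreach { case (_, result) =>
    val rowKey = Bytes.toString(result.getRow)
    // getValue returns null when the cell is missing, so guard before converting
    val nameBytes = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"))
    val name = if (nameBytes == null) "<missing>" else Bytes.toString(nameBytes)
    println(s"Row key: $rowKey, info:name = $name")
  }
}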

Spark reads the Elasticsearch data source

Environment information

  • ES username
  • ES password
  • ES service address
  • ES cluster name (needs to be specified here because of the cluster's security settings)
  • ES index

The code

  def main(args: Array[String]): Unit = {

    val Array(user, password, esIp, clusterName, index) = args

    val sparkConf: SparkConf = new SparkConf().setAppName("Spark Es Demo (Scala)").setMaster("local[*]")

    sparkConf.set("cluster.name", clusterName)
    sparkConf.set("es.internal.es.cluster.name", clusterName)
    sparkConf.set("es.internal.es.version", "7.12") // Prevents security_exception: action [cluster:monitor/main] is unauthorized
    sparkConf.set("es.index.auto.create", "true")
    sparkConf.set("es.nodes", esIp)
    sparkConf.set("es.port", "9200")
    sparkConf.set("es.mapping.date.rich", "false")
    sparkConf.set("es.index.read.missing.as.empty", "true")
    sparkConf.set("es.net.http.auth.user", user) // ES username
    sparkConf.set("es.net.http.auth.pass", password) // ES password
    sparkConf.set("es.nodes.wan.only", "true")
    sparkConf.set("es.index.read.allow.red.status", "true") // Prevents security_exception: action [cluster:monitor/health] is unauthorized

    val sc = new SparkContext(sparkConf)

    write2Es(sc, index)
    read2Es(sc, index)

    sc.stop()
  }

  def write2Es(sc: SparkContext, index: String): Unit = {

    val numbers: Map[String, String] = Map("jsIp" -> "11111",
      "address" -> "11111", "enterprise" -> "北京",
      "xian" -> "11111", "ip" -> "11111",
      "source" -> "11111", "sheng" -> "11111",
      "phone" -> "11111", "shi" -> "11111",
      "ipLong" -> "333", "time" -> "2022-12-27 09:56:50",
      "qsIp" -> "11111", "contacts" -> "11111",
      "email" -> "[email protected]")
    val rdd: RDD[Map[String, Any]] = sc.makeRDD(Seq(numbers))
    EsSpark.saveToEs(rdd, s"$index/_doc")
    println("--------------------End-----------------")
  }

  def read2Es(sc: SparkContext, index: String): Unit = {

    val rdd: RDD[(String, collection.Map[String, AnyRef])] = EsSpark.esRDD(sc, s"$index/_doc")

    println("------------------rdd.count():" + rdd.count())
    rdd.foreach(line => {
      val key: String = line._1
      val value: collection.Map[String, AnyRef] = line._2
      println("------------------key:" + key)
      for (tmp <- value) {
        val key1: String = tmp._1
        val value1: AnyRef = tmp._2
        println("------------------key1:" + key1)
        println("------------------value1:" + value1)
      }
    })
  }
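
EsSpark.esRDD also accepts a query, so filtering can happen on the Elasticsearch side instead of pulling the whole index into Spark. A minimal sketch, where the enterprise field and its value are only placeholders matching the document written above:

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.elasticsearch.spark.rdd.EsSpark

// Minimal sketch: push a query down to Elasticsearch; the field and value are placeholders
def readWithQuery(sc: SparkContext, index: String): Unit = {
  val query = """{"query": {"match": {"enterprise": "北京"}}}"""
  val rdd: RDD[(String, collection.Map[String, AnyRef])] =
    EsSpark.esRDD(sc, s"$index/_doc", query)

  rdd.foreach { case (id, source) =>
    println(s"doc id: $id, source: $source")
  }
}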

Configure Maven dependencies

<properties>
        <scala.version>2.12</scala.version>
        <spark.version>3.2.1</spark.version>
        <hadoop.version>3.3.1</hadoop.version>
        <jackson.version>2.12.3</jackson.version>
        <s3.version>1.12.77</s3.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-annotations</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
            <version>${s3.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-aws</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-httpclient</groupId>
            <artifactId>commons-httpclient</artifactId>
            <version>3.1</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-hadoop</artifactId>
            <version>7.10.2</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.spark</groupId>
                    <artifactId>spark-*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!--<dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-spark-30_${scala.version}</artifactId>
            <version>7.12.0</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.spark</groupId>
                    <artifactId>spark-*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>-->
        <dependency>
            <groupId>org.apache.hbase.connectors.spark</groupId>
            <artifactId>hbase-spark</artifactId>
            <version>1.0.0</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-hdfs</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.spark</groupId>
                    <artifactId>spark-*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>com.thoughtworks.paranamer</groupId>
            <artifactId>paranamer</artifactId>
            <version>2.8</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.2</version>
        </dependency>
    </dependencies>

I hope this article is helpful to you. Remember to follow, comment, and favorite. Thank you!

Origin blog.csdn.net/u013412066/article/details/129467546