Spark Streaming: reading from Kafka and writing to Elasticsearch

Spark Streaming is still a widely used stream-processing engine, and it has solid API support for both Kafka and Elasticsearch. Below is a Kafka-to-Elasticsearch pipeline I built in a test environment.

A few key points up front:

  • Kafka consumer offsets are stored in MySQL. I started out with the ScalikeJDBC library for database access, but it felt inflexible, so I wrote a plain MySQL JDBC utility class instead. That has its own benefits: transaction control is more flexible, and there is no need to pull in ScalikeJDBC's several jars.
  • After the offsets are loaded from the database, they are validated against the earliest offsets Kafka still holds. Machine resources are tight, so Kafka only retains data for one day; if the Spark Streaming job is down for a while (because of a failure or anything else), the offsets fetched from MySQL on restart may fall outside the range Kafka still has, and in that case they must be replaced with the current earliest offsets. If processing speed is a concern, this check can be commented out and re-enabled only when needed; a minimal sketch of the idea follows this list.
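
A minimal sketch of that bounds check (the map names and the helper are illustrative only; the real implementation is the getCheckedOffset method in the main program below):

import org.apache.kafka.common.TopicPartition

// storedOffsets: offsets loaded from MySQL; earliestOffsets: earliest offsets still held by Kafka.
// Both are hypothetical Map[TopicPartition, Long] values used only for illustration.
def boundsCheck(storedOffsets: Map[TopicPartition, Long],
                earliestOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] =
  storedOffsets.map { case (tp, stored) =>
    // never start earlier than the oldest offset Kafka still retains
    tp -> math.max(stored, earliestOffsets.getOrElse(tp, 0L))
  }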

Step 1: Maven configuration

Tool versions used here:
scala: 2.11.8
spark: 2.3.4
kafka: 0.10.1.0
elasticsearch: 7.0.0
The Maven configuration is as follows:

    <properties>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.3.4</spark.version>
    </properties>

    <dependencies>
        <!-- Scala dependency -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <!-- Spark dependencies -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-spark-20_2.11</artifactId>
            <version>7.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.49</version>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>
    </dependencies>

Step 2: Create the MySQL offset table

CREATE TABLE `kafka_offset` (
  `topic` varchar(255) NOT NULL,
  `groupid` varchar(128) NOT NULL,
  `partition` int(11) NOT NULL,
  `fromoffset` bigint(20) DEFAULT NULL,
  `untiloffset` bigint(20) DEFAULT NULL,
  PRIMARY KEY (`topic`,`groupid`,`partition`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
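
For illustration (the numbers are made up), a row such as (topic1, test, 0, 12000, 13000) means that consumer group test consumed partition 0 of topic1 from offset 12000 (inclusive) up to 13000 (exclusive) in its last completed batch; on restart the job resumes from untiloffset.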

Step 3: JDBC utility class

JDBCUtil.scala

package com.test

import java.sql._

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange

object JDBCUtil {

  /*
  JDBC URL, username and password
   */
  val url="jdbc:mysql://localhost:3306/test?characterEncoding=utf-8&autoReconnect=true&failOverReadOnly=false&useSSL=false&rewriteBatchedStatements=true"
  val username="user"
  val password="12345"

  /*
  Get a connection
   */
  def getConnection:Connection={
    Class.forName("com.mysql.jdbc.Driver")
    DriverManager.getConnection(url,username,password)
  }
  /*
  Close the connection
   */
  def closeConnection(conn:Connection): Unit ={
    if(conn!=null)
      conn.close()
  }
  /*
  Close the prepared statement
   */
  def closePreparedStatement(prepareStatement:PreparedStatement): Unit ={
    if(prepareStatement!=null)
      prepareStatement.close()
  }

  def closeResultSet(rs:ResultSet):Unit={
    if(rs!=null)
      rs.close()
  }

  /**
    * Query the offsets stored for a consumer group, keyed by TopicPartition
    * @param groupid
    * @return
    */
  def selectKafkaOffset(groupid:String):Map[TopicPartition,Long]={
    val conn:Connection=getConnection
    val sql="select * from `kafka_offset` where `groupid`='"+groupid+"' "
    val pstmt:PreparedStatement=conn.prepareStatement(sql)
    val rs:ResultSet=pstmt.executeQuery()
    var map=Map[TopicPartition,Long]()
    while(rs.next()){
      map+=(new TopicPartition(rs.getString("topic"),rs.getInt("partition")) -> rs.getLong("untiloffset"))
    }
    closePreparedStatement(pstmt)
    closeResultSet(rs)
    closeConnection(conn)
    map
  }

  /**
    * Upsert (REPLACE INTO) the offsets of a batch for a consumer group
    * @param offsetRanges
    * @param groupid
    */
  def replaceKafkaOffset(offsetRanges:scala.Array[OffsetRange],groupid:String):Unit={
    val conn:Connection=getConnection
    val sql="replace into `kafka_offset`(`topic`,`groupid`,`partition`,`fromoffset`,`untiloffset`) values (?,?,?,?,?)"
    val preparedStatement:PreparedStatement=conn.prepareStatement(sql)
    for(or <- offsetRanges){
      preparedStatement.setString(1,or.topic)
      preparedStatement.setString(2,groupid)
      preparedStatement.setInt(3,or.partition)
      preparedStatement.setLong(4,or.fromOffset)
      preparedStatement.setLong(5,or.untilOffset)
      preparedStatement.addBatch()
    }
    preparedStatement.executeBatch()
    closePreparedStatement(preparedStatement)
    closeConnection(conn)
  }

}
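
Before wiring the utility class into the streaming job, it can be handy to smoke-test it on its own. A minimal sketch (the group id, topic and offsets below are placeholders, not values taken from the real job):

package com.test

import org.apache.spark.streaming.kafka010.OffsetRange

object JDBCUtilSmokeTest {
  def main(args: Array[String]): Unit = {
    // Read back whatever offsets are currently stored for this group (empty on the first run)
    println(JDBCUtil.selectKafkaOffset("test"))

    // Write a dummy offset range and read it back
    JDBCUtil.replaceKafkaOffset(Array(OffsetRange("topic1", 0, 0L, 100L)), "test")
    println(JDBCUtil.selectKafkaOffset("test"))
  }
}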

Step 4: Main program

package com.test

import com.alibaba.fastjson.JSON
import org.apache.kafka.clients.consumer.{ConsumerRecord, KafkaConsumer}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.elasticsearch.spark.rdd.EsSpark

import scala.collection.JavaConverters._

object KafkaToES {

  //  Check whether the offsets stored in MySQL are still within the range Kafka retains
  def getCheckedOffset(topics:Set[String],kafkaParams:Map[String, Object],fromdbOffset:Map[TopicPartition,Long]):Map[TopicPartition,Long]={
    val kc=new KafkaConsumer[String,String](kafkaParams.asJava)
    val beginOffsetMap=scala.collection.mutable.Map[TopicPartition,Long]()
    for (topic <- topics) {
      kc.partitionsFor(topic).asScala.foreach(partitionInfo => {
        val topicPartition=new TopicPartition(topic,partitionInfo.partition())
        kc.assign(Seq(topicPartition).asJava)
        kc.seekToBeginning(Seq(topicPartition).asJava)
        beginOffsetMap+=(topicPartition -> kc.position(topicPartition))
      })
    }
    kc.close()
    // If a stored offset is older than the earliest offset Kafka still retains,
    // fall back to the earliest offset; otherwise keep the stored one.
    fromdbOffset.map(f => {
      val beginOffset=beginOffsetMap.getOrElse(f._1,0L)
      if(beginOffset > f._2){
        (f._1,beginOffset)
      }else{
        f
      }
    })
  }

  def run():Unit={

    val conf=new SparkConf().setAppName("KafkaToES")
    //  Create the ES index automatically if it does not exist
    conf.set("es.index.auto.create", "true")
    //  ES node addresses
    conf.set("es.nodes","ip1,ip2")
    //  ES credentials; omit these if security is not enabled
    conf.set("es.net.http.auth.user", "user")
    conf.set("es.net.http.auth.pass", "12345")
    //  Use Kryo for Spark serialization
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    //  Executor memory overhead (off-heap); increase it for larger data volumes
    conf.set("spark.executor.memoryOverhead","2048")
    //  Keep each batch from pulling too many records from Kafka.
    //  Upper bound per batch = Kafka partitions * batch interval (seconds) * maxRatePerPartition,
    //  e.g. 6 partitions * 10 s * 1000 = at most 60,000 records per batch.
    conf.set("spark.streaming.backpressure.enabled","true")
    conf.set("spark.streaming.kafka.maxRatePerPartition","1000")

    val sc=new SparkContext(conf)
    val streamContext=new StreamingContext(sc,Seconds(10))

    //  Consumer group id and the topics to read
    val groupid="test"
    val topics=Set("topic1","topic2")

    //  Kafka consumer parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "ip1:9092,ip2:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupid,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean),
      //  Maximum bytes fetched per partition; adjust to the available memory
      "max.partition.fetch.bytes" -> "2097152"
    )

    //  Load the stored offsets from MySQL
    val fromdbOffset:Map[TopicPartition,Long]=JDBCUtil.selectKafkaOffset(groupid)

    var kafkaDStream:InputDStream[ConsumerRecord[String, String]]=null

    if(fromdbOffset.isEmpty){
      //  First start: MySQL holds no offsets yet, so just subscribe
      kafkaDStream=KafkaUtils.createDirectStream[String, String](
        streamContext,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    }else{
      //  MySQL holds offsets: validate them against Kafka, then consume from them
      val checkedOffset=getCheckedOffset(topics,kafkaParams,fromdbOffset)
      kafkaDStream=KafkaUtils.createDirectStream[String,String](
        streamContext,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Assign[String,String](checkedOffset.keys,kafkaParams,checkedOffset)
      )
    }

    //  Process each micro-batch
    kafkaDStream.foreachRDD(rdd => {
      //  Capture this batch's offset ranges
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      val jsonRDD=rdd.map(_.value()).map(l =>{
        //  Business logic goes here
        val jsonObject=JSON.parseObject(l)

        //  Grab the Kafka message; parse extra fields into jsonObject here if needed
        val s=jsonObject.get("message").toString

        jsonObject.toJSONString
      })

      //  Options for the Elasticsearch write
      val mapConf = Map(
        //  Target ES index
        ("es.resource" , "spark-test"),
        //  Bulk write tuning; adjust to your environment
        ("es.batch.size.bytes" , "10mb"),
        ("es.batch.size.entries" , "50000"),
        ("es.batch.write.retry.count", "10"),
        ("es.batch.write.retry.wait", "3000")
      )

      //  Write the batch to ES
      EsSpark.saveJsonToEs(jsonRDD,mapConf)

      //  Commit the offsets to MySQL
      JDBCUtil.replaceKafkaOffset(offsetRanges,groupid)
    })

    //  Start the streaming job
    streamContext.start()
    streamContext.awaitTermination()
  }

  def main(args: Array[String]): Unit = {
    run()
  }
}
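
One thing worth noting about the code above: because the offsets are written to MySQL only after the Elasticsearch bulk write succeeds, a failure between those two steps will replay that batch on restart, so the pipeline is at-least-once. If duplicate documents are a problem, a common option (not part of the original code) is to give each document a deterministic id through the es.mapping.id setting, so replayed batches overwrite instead of duplicating. A hedged sketch, assuming each JSON record carries a unique field named id:

// Alternative to mapConf: map the document's own "id" field to the ES _id.
// The field name "id" is an assumption; use whatever unique key the data actually has.
val idempotentEsConf = Map(
  "es.resource" -> "spark-test",
  "es.mapping.id" -> "id"
)
// EsSpark.saveJsonToEs(jsonRDD, idempotentEsConf)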

Step 5: Troubleshooting

1. org.apache.kafka.clients.consumer.OffsetOutOfRangeException: Offsets out of range with no configured reset policy for partitions
Fix: use the offset validation method (getCheckedOffset) above, or simply switch to a new groupid, which will lose some data.
2. Container killed by YARN for exceeding memory limits. 16.9 GB of 16 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead
Fix: at first I increased spark.yarn.executor.memoryOverhead as the message suggests, but the logs showed that spark.executor.memoryOverhead is the setting that actually takes effect; starting with Spark 2.3, spark.yarn.executor.memoryOverhead is deprecated in favor of spark.executor.memoryOverhead.

Reposted from blog.csdn.net/weixin_42473019/article/details/104742668