Managing Kafka offsets when consuming with Spark Streaming

Contents

pom.xml

1. Automatic offset commits (the default)

2. Manual offset commits

3. Managing offsets in MySQL

3.1 Table DDL

3.2 Configuration file

3.3 Code

4. Managing offsets in Redis


pom.xml 

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.wedoctor.spark</groupId>
    <artifactId>spark-0708</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.2.0</spark.version>
        <hadoop.version>2.8.1</hadoop.version>
        <encoding>UTF-8</encoding>
    </properties>

    <dependencies>
        <!-- Scala dependency -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <!-- Spark core dependency -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <!-- Spark SQL dependency -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.41</version>
        </dependency>

        <!-- Typesafe Config (ConfigFactory) -->
        <dependency>
            <groupId>com.typesafe</groupId>
            <artifactId>config</artifactId>
            <version>1.3.0</version>
        </dependency>

        <!-- Spark Streaming -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <!-- Jedis, the Redis client -->
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.8.1</version>
        </dependency>

        <dependency>
            <groupId>org.scalikejdbc</groupId>
            <artifactId>scalikejdbc_2.11</artifactId>
            <version>2.5.0</version>
        </dependency>

        <!-- scalikejdbc-config_2.11 -->
        <dependency>
            <groupId>org.scalikejdbc</groupId>
            <artifactId>scalikejdbc-config_2.11</artifactId>
            <version>2.5.0</version>
        </dependency>


        <!-- Spark Streaming + Kafka integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <!-- Kafka client -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.10.2.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
        	<!-- Plugin for compiling Java sources -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
            </plugin>
            <!-- Plugin for compiling Scala sources -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>


            <!-- Plugin for building the shaded jar -->
              <!--<plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                    <resource>reference.conf</resource>
                                </transformer>
                                &lt;!&ndash; specify the main class &ndash;&gt;
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass></mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>-->
        </plugins>
    </build>
</project>

1. Automatic offset commits (the default)
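
With enable.auto.commit left at its default of true, the Kafka consumer commits offsets back to Kafka on its own schedule, independently of whether the current batch has finished processing, so a crash between a commit and the end of the batch can lose or re-read data.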

package com.wedoctor.sparkstreaming.kafka_offset_manage

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable

object StreamingKafkaDemo {
  Logger.getLogger("org").setLevel(Level.ERROR)
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)

    val ssc: StreamingContext = new StreamingContext(conf, Seconds(2))
    val topics = Array("first")
    // Kafka consumer parameters
    val kafkaParams = mutable.HashMap[String, Object](
      "bootstrap.servers" -> "hdp-01:9092",
      "key.deserializer" -> classOf[StringDeserializer].getName,
      "value.deserializer" -> classOf[StringDeserializer].getName,
      "group.id" -> "streaming1",
      "auto.offset.reset" -> "earliest",
      // enable.auto.commit defaults to true
      "enable.auto.commit" -> "true"
    )
    val kafkaDs: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc,
      /**
        * Location strategies:
        * PreferBrokers: use when the executors run on the same hosts as the Kafka brokers, so reads are local
        * PreferConsistent: spread the Kafka partitions evenly across all executors; the usual first choice
        */
      LocationStrategies.PreferConsistent,
      // subscribe to the topics
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
    // From here on, the processing logic is essentially the same as for data read from a socket
    kafkaDs.foreachRDD(rdd=>{
      rdd.foreach(t=>println(t.value(),t.offset(),t.partition()))
    })
    ssc.start()
    ssc.awaitTermination()
  }
}

2. Manual offset commits
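
Here enable.auto.commit is set to false and the offsets are committed explicitly with commitAsync once the batch has been processed; the commit still goes to Kafka, but only when the application says so.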

package com.wedoctor.sparkstreaming.kafka_offset_manage

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}

import scala.collection.mutable

object StreamingKafkaOffset {
  Logger.getLogger("org").setLevel(Level.ERROR)
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)

    val ssc: StreamingContext = new StreamingContext(conf, Seconds(2))
    val topics = Array("first")
    // Kafka consumer parameters
    val kafkaParams = mutable.HashMap[String, Object](
      "bootstrap.servers" -> "hdp-01:9092",
      "key.deserializer" -> classOf[StringDeserializer].getName,
      "value.deserializer" -> classOf[StringDeserializer].getName,
      "group.id" -> "streaming1",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> "false"
    )
    val kafkaDs: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc,
      /**
        * Location strategies:
        * PreferBrokers: use when the executors run on the same hosts as the Kafka brokers, so reads are local
        * PreferConsistent: spread the Kafka partitions evenly across all executors; the usual first choice
        */
      LocationStrategies.PreferConsistent,
      // subscribe to the topics
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
    // From here on, the processing logic is essentially the same as for data read from a socket
    kafkaDs.foreachRDD(rdd=>{
      /**
        * Fields of OffsetRange:
        * val topic: String,
        * val partition: Int,
        * val fromOffset: Long,
        * val untilOffset: Long
        */
      // get the consumed offset range of each partition
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      ranges.foreach(println)
      // business logic
      rdd.foreach(t => println(t.value(), t.offset(), t.partition()))

      // commit the offsets manually (asynchronously, back to Kafka) once the batch has been processed
      kafkaDs.asInstanceOf[CanCommitOffsets].commitAsync(ranges)

    })
    ssc.start()
    ssc.awaitTermination()
  }
}

3. Managing offsets in MySQL
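
Instead of committing to Kafka, the offsets are kept in a MySQL table: they are read back at startup and used as the starting offsets, and updated after every processed batch, so a restart resumes exactly where the last successful batch ended.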

3.1 Table DDL

SET FOREIGN_KEY_CHECKS=0;
 
-- ----------------------------
-- Table structure for `mysqlOffset`
-- ----------------------------
DROP TABLE IF EXISTS `mysqlOffset`;
CREATE TABLE `mysqlOffset` (
  `topic` varchar(20) NOT NULL,
  `part` int(11) NOT NULL,
  `offset` int(11) DEFAULT NULL,
  `groupId` varchar(20) NOT NULL,
  PRIMARY KEY (`topic`,`part`,`groupId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 
-- ----------------------------
-- Records of mysqlOffset
-- ----------------------------
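
The composite primary key (topic, part, groupId) is what lets the job use REPLACE INTO as an upsert later on: writing the same key again overwrites the stored offset instead of adding a duplicate row.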

3.2 Configuration file

db.url="jdbc:mysql://bigdata1:3306/test?characterEncoding=utf-8"
db.driver="com.mysql.jdbc.Driver"
db.password="123456"
db.user="root"
db.tablename="streaming_wc"
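
The code in 3.3 loads these settings with Typesafe Config's ConfigFactory.load(), which by default reads an application.conf from the classpath. A minimal sketch, assuming the settings above are saved as src/main/resources/application.conf (the ConfigDemo object name is only for illustration):

import com.typesafe.config.{Config, ConfigFactory}

object ConfigDemo {
  def main(args: Array[String]): Unit = {
    // ConfigFactory.load() reads application.conf from the classpath by default
    val config: Config = ConfigFactory.load()
    val url = config.getString("db.url")           // jdbc:mysql://bigdata1:3306/test?characterEncoding=utf-8
    val user = config.getString("db.user")         // root
    val password = config.getString("db.password")
    println(url, user, password)
  }
}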

3.3 Code

package com.wedoctor.sparkstreaming.kafka_offset_manage

import java.sql.{Connection, DriverManager, ResultSet}

import com.typesafe.config.{Config, ConfigFactory}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}

import scala.collection.mutable

object StreamingMysqlOffset {
  Logger.getLogger("org").setLevel(Level.ERROR)
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)

    val ssc: StreamingContext = new StreamingContext(conf, Seconds(2))
    val topics = Array("first")
    val groupId = "streaming124"
    // Kafka consumer parameters
    val kafkaParams = mutable.HashMap[String, Object](
      "bootstrap.servers" -> "hdp-01:9092",
      "key.deserializer" -> classOf[StringDeserializer].getName,
      "value.deserializer" -> classOf[StringDeserializer].getName,
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> "false"
    )

    // Restore the last committed offsets from MySQL
    val offsetsMap: mutable.HashMap[TopicPartition, Long] = mutable.HashMap[TopicPartition, Long]()
    val config2: Config = ConfigFactory.load()
    val conn = DriverManager.getConnection(config2.getString("db.url"), config2.getString("db.user"), config2.getString("db.password"))
    val query = conn.prepareStatement("select * from mysqlOffset where topic = ? and groupId = ?")
    query.setString(1, topics(0))
    query.setString(2, groupId)
    val rs: ResultSet = query.executeQuery()
    while (rs.next()) {
      val part: Int = rs.getInt("part")
      val curOffset = rs.getLong("offset")
      // use the offset read from MySQL as the starting offset for this partition
      offsetsMap += (new TopicPartition(topics(0), part) -> curOffset)
    }
    rs.close()
    query.close()
    conn.close()


    val kafkaDs: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc,
      LocationStrategies.PreferConsistent,
      // subscribe to the topics
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams,offsetsMap))
    // From here on, the processing logic is essentially the same as for data read from a socket
    kafkaDs.foreachRDD(rdd=>{

      // only process non-empty batches
      if (!rdd.isEmpty()) {
        // get the consumed offset range of each partition
        val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        ranges.foreach(println)
        // business logic
        rdd.foreach(t => println(t.value(), t.offset(), t.partition()))
        // persist the offsets to MySQL after the batch has been processed
        val config = ConfigFactory.load()
        // one connection per batch (this block runs on the driver)
        val conn: Connection = DriverManager.getConnection(config.getString("db.url"), config.getString("db.user"), config.getString("db.password"))
        // REPLACE INTO upserts on the composite primary key (topic, part, groupId)
        val pstm = conn.prepareStatement("replace into mysqlOffset values (?,?,?,?)")
        for (t <- ranges) {
          pstm.setString(1, t.topic)
          pstm.setInt(2, t.partition)
          pstm.setLong(3, t.untilOffset)
          pstm.setString(4, groupId)
          pstm.execute()
        }
        // release resources
        pstm.close()
        conn.close()
      }

    })
    ssc.start()
    ssc.awaitTermination()
  }
}

4. Managing offsets in Redis
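
The offsets are kept in one Redis hash per topic and consumer group: the key is "topic-groupId", each field is a partition number, and its value is the next offset to read. JedisUtils below is a small helper that hands out Jedis connections from a pool.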

package com.wedoctor.sparkstreaming

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

object JedisUtils {

  // reuse one pool for the whole application instead of building a new pool for every connection
  private var pool: JedisPool = _

  def createJedisPool(host: String, port: Int): JedisPool = {
    // connection pool configuration
    val config = new JedisPoolConfig()
    // maximum number of idle connections (the default is 8)
    config.setMaxIdle(1000)
    // maximum number of connections (the default is 8)
    config.setMaxTotal(2000)
    new JedisPool(config, host, port)
  }

  private def getJedis(host: String, port: Int): Jedis = {
    if (pool == null) {
      pool = createJedisPool(host, port)
    }
    pool.getResource
  }

  def apply(host: String, port: Int): Jedis = {
    getJedis(host, port)
  }
}

package com.wedoctor.sparkstreaming.kafka_offset_manage

import java.util
import com.wedoctor.sparkstreaming.JedisUtils
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import redis.clients.jedis.Jedis

import scala.collection.mutable

object StreamingRedisOffset {
  Logger.getLogger("org").setLevel(Level.ERROR)
  def main(args: Array[String]): Unit = {

       val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)

    val ssc: StreamingContext = new StreamingContext(conf, Seconds(2))
    val topics = Array("first")
    val groupId = "streaming125"
    // Kafka consumer parameters
    val kafkaParams = mutable.HashMap[String, Object](
      "bootstrap.servers" -> "hdp-01:9092",
      "key.deserializer" -> classOf[StringDeserializer].getName,
      "value.deserializer" -> classOf[StringDeserializer].getName,
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> "false"
    )

    // Restore the last committed offsets from Redis
    val offsetsMap = mutable.HashMap[TopicPartition, Long]()
    val jedis: Jedis = JedisUtils("hdp-01", 6379)
    // the offsets live in a hash keyed by "topic-groupId": field = partition, value = offset
    val partAndOffset: util.Map[String, String] = jedis.hgetAll(topics(0) + "-" + groupId)
    import scala.collection.JavaConversions._
    for (part <- partAndOffset) {
      offsetsMap += (new TopicPartition(topics(0), part._1.toInt) -> part._2.toLong)
    }
    jedis.close()

    val kafkaDs: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc,
      LocationStrategies.PreferConsistent,
      // subscribe to the topics
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams,offsetsMap))
    // From here on, the processing logic is essentially the same as for data read from a socket
    kafkaDs.foreachRDD(rdd => {
      // only process non-empty batches
      if (!rdd.isEmpty()) {
        // get the consumed offset range of each partition
        val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        ranges.foreach(println)
        // business logic
        rdd.foreach(t => {
          println(t.value(), t.offset(), t.partition())
        })

        // persist the offsets to Redis after the batch has been processed
        // one hash per topic and group: key "topic-groupId", field = partition, value = untilOffset
        val jedis: Jedis = JedisUtils("hdp-01", 6379)
        for (t <- ranges) {
          jedis.hset(t.topic + "-" + groupId, t.partition + "", t.untilOffset + "")
        }
        jedis.close()
      }

    })
    ssc.start()
    ssc.awaitTermination()
  }
}

Reposted from blog.csdn.net/zuochang_liu/article/details/98472314