版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/Lin_wj1995/article/details/80080359
在大数据的场景下,流式处理都会借助 Kafka 作为消息接入的中间件,且 SparkStreaming 中 Direct 方式的优越性,现在可以说都使用 Direct 方式来获取 Kafka 数据
Direct 方式是采用 Kafka 低级的 API 来获取数据,也就是说我们要自己来管理 这个offset
SparkStreaming 中可以用 StreamingContext 的 checkpoint 方法来自动帮我们管理 offset。但是有一些缺点:
- checkpoint 是在每次处理完成后自动帮我们提交的,但是如果我们想实现 at most once 语义时,checkpoint 就不满足
- 当 Spark 版本升级后,新版本不识别老版本 checkpoint 的信息
所以我们可以自己手动来管理 offset 来达到不同语义的要求,下面是将 offset 保存到 zookeeper 的样例代码:
main类:
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.{HasOffsetRanges, OffsetRange, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import sql.StreamingExamples
/**
 * Spark Streaming driver that consumes a Kafka topic through the direct API and
 * keeps the consumed offsets in ZooKeeper (via ZkUtil) instead of relying on
 * Spark checkpoints.
 */
object OffsetTest {
  // Kafka topic to consume.
  val topic = "iso8583-r3p3"
  // Broker list passed to the direct stream as "metadata.broker.list".
  val brokers = "ido001.gzcb.com:9092,ido002.gzcb.com:9092,ido003.gzcb.com:9092"

  /**
   * Application entry point.
   *
   * An explicit `main` is used rather than `extends App`: Spark's documentation
   * advises against `scala.App` because its delayed initialisation may not work
   * correctly with Spark applications.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    StreamingExamples.setStreamingLogLevels()

    val sparkConf = new SparkConf()
      .setAppName("Iso8583_KafkaDirect")
      .setIfMissing("spark.master", "local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(3))

    // Starting offset of every partition, as last stored in ZooKeeper.
    val fromOffSets = ZkUtil.getOffset(topic)
    // Keep only the message payload of each Kafka record.
    val messageHandler = (mmd: MessageAndMetadata[String, String]) => mmd.message()
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers, "group.id" -> "lwj")
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc, kafkaParams, fromOffSets, messageHandler)

    messages.foreachRDD { rdd =>
      // The cast to HasOffsetRanges must happen on the RDD produced directly by
      // the Kafka stream, i.e. in the first operation applied to `messages`.
      // Reading it here keeps the ranges local to the batch instead of sharing
      // a mutable variable between two closures.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      // Encode each range as "topic,partition,untilOffset", the format ZkUtil.setOffset expects.
      val offsets = offsetRanges.map { o =>
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
        s"${o.topic},${o.partition},${o.untilOffset}"
      }

      // TODO: choose when to persist the offsets according to the delivery
      // semantics you need. Saving before the business logic (as done here)
      // means a batch that fails afterwards is not re-read on restart.
      ZkUtil.setOffset(offsets)

      // TODO: business logic
      println("#################")
      //rdd.foreach(println)
      println(rdd.count())
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
ZkUtil 工具类:
import java.util
import java.util.concurrent.CountDownLatch
import kafka.common.TopicAndPartition
import org.apache.zookeeper.Watcher.Event
import org.apache.zookeeper._
/**
 * ZooKeeper helper that stores and retrieves Kafka offsets.
 *
 * Offsets live under `parentPath/<topic>/<partition>`; the data of each
 * partition node is the offset rendered as a decimal string.
 *
 * @author lwj
 * @date 2018/04/25
 */
object ZkUtil extends Watcher with Serializable{
  // Released by process() once the client reports the SyncConnected state.
  protected val countDownLatch: CountDownLatch = new CountDownLatch(1)

  /**
   * Default watcher callback: opens the latch when the session becomes connected.
   *
   * @param event the event delivered by the ZooKeeper client
   */
  override def process(event: WatchedEvent): Unit = {
    if (event.getState eq Event.KeeperState.SyncConnected) {
      countDownLatch.countDown()
    }
  }

  // Upper bound on how long initialisation waits for the session to connect.
  private val connectTimeoutMs = 10000L

  // `this` is passed as the watcher rather than the object's own name, so the
  // registration does not rely on the module reference while it is still
  // being initialised.
  val zk = new ZooKeeper("181.137.128.151:2181,181.137.128.152:2181,181.137.128.153:2181", 5000, this)

  // The ZooKeeper constructor returns before the connection is established;
  // wait for the SyncConnected event before issuing the first request.
  if (!countDownLatch.await(connectTimeoutMs, java.util.concurrent.TimeUnit.MILLISECONDS)) {
    throw new IllegalStateException(s"Could not connect to ZooKeeper within $connectTimeoutMs ms")
  }

  val parentPath = "/lwj"
  // Number of partition nodes created for a topic that has no stored offsets yet.
  // NOTE(review): this is a fixed default; confirm it matches the real partition count of the topic.
  val initPartitions = 3
  // Offset written to freshly created partition nodes.
  val initOffset = "0"

  // Runs as part of the object's initialisation even though nothing calls it
  // explicitly: makes sure the root node exists.
  if (!nodeExists(parentPath)) {
    createNode(parentPath, "0")
  }

  /** Encodes node data with a fixed charset instead of the platform default. */
  private def toBytes(value: String): Array[Byte] =
    value.getBytes(java.nio.charset.StandardCharsets.UTF_8)

  /** Returns true when a node exists at `path` (no watch is registered). */
  private def nodeExists(path: String): Boolean =
    Option(zk.exists(path, false)).isDefined

  /** Creates a persistent, open-ACL node at `path` holding `data`. */
  private def createNode(path: String, data: String): Unit = {
    zk.create(path, toBytes(data), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT)
  }

  /**
   * Reads the stored offset of every partition node of a topic.
   *
   * If the topic node does not exist yet it is created together with
   * `initPartitions` partition nodes, each initialised to `initOffset`.
   *
   * @param topic Kafka topic name
   * @return map from topic/partition to the stored offset
   */
  def getOffset(topic:String): Map[TopicAndPartition, Long] ={
    val zkPath = s"$parentPath/$topic"

    if (!nodeExists(zkPath)) {
      createNode(zkPath, "0")
      (0 until initPartitions).foreach(i => createNode(s"$zkPath/$i", initOffset))
    }

    // Each child node is named after its partition number.
    val children = zk.getChildren(zkPath, false)
    (0 until children.size).map { idx =>
      val child = children.get(idx)
      val offset = new String(zk.getData(s"$zkPath/$child", false, null), java.nio.charset.StandardCharsets.UTF_8)
      new TopicAndPartition(topic, child.toInt) -> offset.toLong
    }.toMap
  }

  /**
   * Stores offsets, creating the topic and partition nodes when they are missing.
   *
   * @param offsets entries formatted as "topic,partition,offset"
   * @throws IllegalArgumentException if an entry has fewer than three comma-separated fields
   */
  def setOffset(offsets : Array[String]): Unit ={
    offsets.foreach { off =>
      off.split(",") match {
        case Array(topic, partition, offset, _*) =>
          val topicPath = s"$parentPath/$topic"
          // A partition node cannot be created under a missing parent.
          if (!nodeExists(topicPath)) {
            createNode(topicPath, "0")
          }
          val partitionPath = s"$topicPath/$partition"
          if (nodeExists(partitionPath)) {
            // Version -1 matches any node version, i.e. an unconditional overwrite.
            zk.setData(partitionPath, toBytes(offset), -1)
          } else {
            createNode(partitionPath, offset)
          }
        case _ =>
          throw new IllegalArgumentException(s"Expected 'topic,partition,offset' but got: $off")
      }
    }
  }
}
以上代码仅供参考,有什么问题或者更好的想法可以留言讨论讨论哈~