Saving Kafka offsets with Spark createDirectStream (Java implementation)

Problem description

Original article: http://blog.csdn.net/xueba207/article/details/50381821

Recently I used Spark Streaming to process Kafka data. Because the volume of business data is large, I chose KafkaUtils' createDirectStream(), which reads directly from the partitions on the Kafka brokers, bypassing ZooKeeper and using no receiver: the Spark tasks connect straight to the Kafka topic partitions, which makes exactly-once semantics possible. However, since this approach does not go through ZooKeeper, the topic offsets are not saved anywhere, so after a job restart consumption can only begin from the latest offsets, and the messages produced while the job was down are lost.

Solution

In general there are two ways to make Spark Streaming save offsets: Spark's checkpoint mechanism, or implementing the offset-saving logic yourself in the application. Both are described below.

The checkpoint mechanism

A Spark Streaming job can use checkpoints to save its execution state; a checkpoint contains all the information held by the streaming context, including the offset of every Kafka topic partition. There are two kinds of checkpointing: checkpointing both data and metadata, and checkpointing metadata only. In most cases saving the metadata is enough, so only metadata checkpointing is covered here.

Flow diagram

Code

package com.nsfocus.bsa.example;

import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.api.java.JavaStreamingContextFactory;
import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

/**
 * Checkpoint example
 *
 * @author Shuai YUAN
 * @date 2015/10/27
 */
public class CheckpointTest {

    private static String CHECKPOINT_DIR = "/checkpoint";

    public static void main(String[] args) {

        // get the JavaStreamingContext from the checkpoint dir, or create it from SparkConf
        JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(CHECKPOINT_DIR, new JavaStreamingContextFactory() {
            public JavaStreamingContext create() {
                return createContext();
            }
        });

        jssc.start();
        jssc.awaitTermination();
    }

    public static JavaStreamingContext createContext() {

        SparkConf sparkConf = new SparkConf().setAppName("tachyon-test-consumer");

        Set<String> topicSet = new HashSet<String>();
        topicSet.add("test_topic");

        HashMap<String, String> kafkaParam = new HashMap<String, String>();
        kafkaParam.put("metadata.broker.list", "test1:9092,test2:9092");

        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));

        // checkpoint metadata to HDFS
        jssc.checkpoint(CHECKPOINT_DIR);

        JavaPairInputDStream<String, String> message =
                KafkaUtils.createDirectStream(
                        jssc,
                        String.class,
                        String.class,
                        StringDecoder.class,
                        StringDecoder.class,
                        kafkaParam,
                        topicSet
                );

        JavaDStream<String> valueDStream = message.map(new Function<Tuple2<String, String>, String>() {
            public String call(Tuple2<String, String> v1) throws Exception {
                return v1._2();
            }
        });
        valueDStream.count().print();

        return jssc;
    }
}

Saving offsets to ZooKeeper yourself

Developers can also implement the logic for saving offsets to ZooKeeper themselves. The RDDs produced by a direct stream can be cast to HasOffsetRanges, which exposes the offset range of every partition; a minimal sketch of that cast follows, and the full listing is given under "Source code" below.
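The sketch below uses the Spark 1.x Kafka 0.8 direct API and the same placeholder broker and topic names as the rest of this post; the class name OffsetRangeSketch is illustrative only. Each micro-batch of a direct stream is backed by a KafkaRDD, so rdd.rdd() can be cast to HasOffsetRanges to read every partition's from/until offset.

package com.xueba207.test;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;

import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;

public class OffsetRangeSketch {
    public static void main(String[] args) {
        JavaStreamingContext jssc = new JavaStreamingContext(
                new SparkConf().setAppName("offset-range-sketch"), new Duration(2000));

        HashMap<String, String> kafkaParam = new HashMap<String, String>();
        kafkaParam.put("metadata.broker.list", "test:9092");

        JavaPairInputDStream<String, String> message = KafkaUtils.createDirectStream(
                jssc, String.class, String.class, StringDecoder.class, StringDecoder.class,
                kafkaParam, new HashSet<String>(Arrays.asList("test_topic")));

        // the underlying RDD of each micro-batch is a KafkaRDD, which implements HasOffsetRanges
        message.foreachRDD(new Function<JavaPairRDD<String, String>, Void>() {
            public Void call(JavaPairRDD<String, String> rdd) throws Exception {
                for (OffsetRange o : ((HasOffsetRanges) rdd.rdd()).offsetRanges()) {
                    System.out.println(o.topic() + "-" + o.partition()
                            + ": " + o.fromOffset() + " -> " + o.untilOffset());
                }
                return null;
            }
        });

        jssc.start();
        jssc.awaitTermination();
    }
}

Note that the cast only works on the stream returned directly by createDirectStream (or inside transform/foreachRDD on that stream); once further transformations have been applied, the resulting RDDs are no longer KafkaRDDs.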

Implementation flow

Source code

A Scala implementation is easy to find online, so here is the Java code.

package com.xueba207.test;

import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaCluster;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;
import scala.Predef;
import scala.Tuple2;
import scala.collection.JavaConversions;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;

/**
 * KafkaOffsetExample
 *
 * @author Shuai YUAN
 * @date 2015/10/28
 */
public class KafkaOffsetExample {

    private static KafkaCluster kafkaCluster = null;

    private static HashMap<String, String> kafkaParam = new HashMap<String, String>();

    private static Broadcast<HashMap<String, String>> kafkaParamBroadcast = null;

    private static scala.collection.immutable.Set<String> immutableTopics = null;

    public static void main(String[] args) {

        SparkConf sparkConf = new SparkConf().setAppName("tachyon-test-consumer");

        Set<String> topicSet = new HashSet<String>();
        topicSet.add("test_topic");

        kafkaParam.put("metadata.broker.list", "test:9092");
        kafkaParam.put("group.id", "com.xueba207.test");

        // convert the Java Map to a Scala immutable.Map
        scala.collection.mutable.Map<String, String> testMap = JavaConversions.mapAsScalaMap(kafkaParam);
        scala.collection.immutable.Map<String, String> scalaKafkaParam =
                testMap.toMap(new Predef.$less$colon$less<Tuple2<String, String>, Tuple2<String, String>>() {
                    public Tuple2<String, String> apply(Tuple2<String, String> v1) {
                        return v1;
                    }
                });

        // init KafkaCluster
        kafkaCluster = new KafkaCluster(scalaKafkaParam);

        scala.collection.mutable.Set<String> mutableTopics = JavaConversions.asScalaSet(topicSet);
        immutableTopics = mutableTopics.toSet();
        scala.collection.immutable.Set<TopicAndPartition> topicAndPartitionSet2 =
                kafkaCluster.getPartitions(immutableTopics).right().get();

        // offsets used to initialize the Kafka direct stream
        Map<TopicAndPartition, Long> consumerOffsetsLong = new HashMap<TopicAndPartition, Long>();

        // no offsets saved yet (first run of this consumer group): default each partition's offset to 0
        if (kafkaCluster.getConsumerOffsets(kafkaParam.get("group.id"), topicAndPartitionSet2).isLeft()) {

            // print the error returned when no offsets exist yet
            System.out.println(kafkaCluster.getConsumerOffsets(kafkaParam.get("group.id"), topicAndPartitionSet2).left().get());

            Set<TopicAndPartition> topicAndPartitionSet1 = JavaConversions.setAsJavaSet(topicAndPartitionSet2);

            for (TopicAndPartition topicAndPartition : topicAndPartitionSet1) {
                consumerOffsetsLong.put(topicAndPartition, 0L);
            }

        }
        // offsets already exist: resume from the saved offsets
        else {

            scala.collection.immutable.Map<TopicAndPartition, Object> consumerOffsetsTemp =
                    kafkaCluster.getConsumerOffsets(kafkaParam.get("group.id"), topicAndPartitionSet2).right().get();

            Map<TopicAndPartition, Object> consumerOffsets = JavaConversions.mapAsJavaMap(consumerOffsetsTemp);

            Set<TopicAndPartition> topicAndPartitionSet1 = JavaConversions.setAsJavaSet(topicAndPartitionSet2);

            for (TopicAndPartition topicAndPartition : topicAndPartitionSet1) {
                Long offset = (Long) consumerOffsets.get(topicAndPartition);
                consumerOffsetsLong.put(topicAndPartition, offset);
            }

        }

        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(5000));
        kafkaParamBroadcast = jssc.sparkContext().broadcast(kafkaParam);

        // create the direct stream, starting from the offsets prepared above
        JavaInputDStream<String> message = KafkaUtils.createDirectStream(
                jssc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                String.class,
                kafkaParam,
                consumerOffsetsLong,
                new Function<MessageAndMetadata<String, String>, String>() {
                    public String call(MessageAndMetadata<String, String> v1) throws Exception {
                        return v1.message();
                    }
                }
        );

        // get the offsets of each partition of the RDD and keep them in offsetRanges
        final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<OffsetRange[]>();
        JavaDStream<String> javaDStream = message.transform(new Function<JavaRDD<String>, JavaRDD<String>>() {
            public JavaRDD<String> call(JavaRDD<String> rdd) throws Exception {
                OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
                offsetRanges.set(offsets);
                return rdd;
            }
        });

        // output: process each batch, then commit its offsets
        javaDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {

            public Void call(JavaRDD<String> v1) throws Exception {
                if (v1.isEmpty()) return null;

                // process the RDD data; here it is saved to HDFS as Parquet files
                // (SQLContextSingleton is a helper that lazily creates one HiveContext; a sketch follows this listing)
                HiveContext hiveContext = SQLContextSingleton.getHiveContextInstance(v1.context());
                DataFrame df = hiveContext.jsonRDD(v1);
                df.save("/offset/test", "parquet", SaveMode.Append);

                for (OffsetRange o : offsetRanges.get()) {

                    // build a Java Map from topic/partition to the offset up to which this batch was processed
                    TopicAndPartition topicAndPartition = new TopicAndPartition(o.topic(), o.partition());
                    Map<TopicAndPartition, Object> topicAndPartitionObjectMap = new HashMap<TopicAndPartition, Object>();
                    topicAndPartitionObjectMap.put(topicAndPartition, o.untilOffset());

                    // convert the Java Map to a Scala immutable.Map
                    scala.collection.mutable.Map<TopicAndPartition, Object> testMap =
                            JavaConversions.mapAsScalaMap(topicAndPartitionObjectMap);
                    scala.collection.immutable.Map<TopicAndPartition, Object> scalatopicAndPartitionObjectMap =
                            testMap.toMap(new Predef.$less$colon$less<Tuple2<TopicAndPartition, Object>, Tuple2<TopicAndPartition, Object>>() {
                                public Tuple2<TopicAndPartition, Object> apply(Tuple2<TopicAndPartition, Object> v1) {
                                    return v1;
                                }
                            });

                    // commit the offsets for this consumer group through kafkaCluster
                    kafkaCluster.setConsumerOffsets(kafkaParamBroadcast.getValue().get("group.id"), scalatopicAndPartitionObjectMap);

//                    System.out.println(
//                            o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset()
//                    );
                }
                return null;
            }
        });

        jssc.start();
        jssc.awaitTermination();
    }

}
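The listing calls SQLContextSingleton.getHiveContextInstance(), a helper that is not shown in the post. Below is a minimal sketch of what such a singleton could look like; the class and method names are simply taken from the call site, and the lazy-singleton pattern follows the common Spark Streaming practice of reusing a single HiveContext across batches.

package com.xueba207.test;

import org.apache.spark.SparkContext;
import org.apache.spark.sql.hive.HiveContext;

/** Lazily creates a single HiveContext per JVM so foreachRDD calls can reuse it. */
class SQLContextSingleton {

    private static transient HiveContext instance = null;

    public static synchronized HiveContext getHiveContextInstance(SparkContext sparkContext) {
        if (instance == null) {
            instance = new HiveContext(sparkContext);
        }
        return instance;
    }
}

Because setConsumerOffsets is called only after the batch has been written out, a batch that was written but whose offsets were not yet committed will be reprocessed after a restart; the overall guarantee of this approach is therefore at-least-once rather than exactly-once.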


Reposted from blog.csdn.net/nieji3057/article/details/81161433