Spark 2.3 + Kafka 0.10 integration: managing offsets manually (storing them in HBase)

 

Code Example:

package cn.com.kong.streaming;


import kafka.utils.ZkUtils;
import org.I0Itec.zkclient.ZkClient;
import org.I0Itec.zkclient.ZkConnection;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.*;
import scala.Tuple2;
import scala.collection.JavaConverters;
import scala.collection.Seq;

import java.io.IOException;
import java.util.*;

/**
 * create 'stream_kafka_offsets', {NAME=>'offsets', TTL=>2592000}
 */
public class KafkaOffsetToHBase {

    public static void main(String[] args) throws InterruptedException, IOException {

//        if (args.length < 6) {
//            System.err.println("Usage: KafkaOffsetToHBase <batch-duration-in-seconds> <kafka-bootstrap-servers> " +
//                    "<kafka-topics> <kafka-consumer-group-id> <hbase-table-name> <kafka-zookeeper-quorum>");
//            System.exit(1);
//        }
        long batchDuration = 10L;
        String bootstrapServers = "172.30.xx.xx:9092,172.30.x.xx:9092";
        String consumerGroupID = "group01";
        String topic = "topic02";
        String hbaseTableName = "stream_kafka_offsets";
        String zkQuorum = "172.30.x.xx:2181,172.30.x.xx:2181,172.30.x.xxx:2181";
        String zkKafkaRootDir = "brokers";
        int zkSessionTimeOut = 10000;
        int zkConnectionTimeOut = 10000;

        HashSet<String> topicsSet = new HashSet<>(Arrays.asList(topic.split(",")));
        // Kafka consumer configuration parameters
        HashMap<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG,bootstrapServers);
        kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG,consumerGroupID);
        kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,"earliest");
        kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,false);

        SparkConf sparkConf = new SparkConf().setAppName("KafkaOffsetToHBase").setMaster("local[3]")
                .set("spark.io.compression.codec","snappy");

        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(batchDuration));
        jssc.sparkContext().setLogLevel("WARN");

        Map<TopicPartition, Long> fromOffsetsMap = getLastCommittedOffsets(topic, consumerGroupID, hbaseTableName, zkQuorum, zkKafkaRootDir,
                zkSessionTimeOut, zkConnectionTimeOut);

        JavaInputDStream<ConsumerRecord<String, String>> inputDStream = KafkaUtils.createDirectStream(jssc, LocationStrategies.PreferConsistent()
                , ConsumerStrategies.Subscribe(topicsSet, kafkaParams, fromOffsetsMap));

        inputDStream.foreachRDD((javaRDD, time) -> {
            if(!javaRDD.isEmpty()){
                OffsetRange[] offsetRanges = ((HasOffsetRanges) javaRDD.rdd()).offsetRanges();
                for (OffsetRange offset : offsetRanges) {
                    System.out.println("topic:"+offset.topic()+",partition:"+offset.partition()+",fromOffset:"
                            offset.fromOffset + () + ", untilOffset:" + offset.untilOffset ()); 
                } 
                // processing logic 
                JavaRDD <String> newRDD = javaRDD.map (Record -> record.value ());
                 Long COUNT = newRDD. COUNT (); 

                // save the offset to HBase
                 // note here: If the offset calculation results are stored separately, may appear to save the results a success, but failed to offset save
                 // then the next start time, there will be some consumption data is repeated, it is necessary to make an idempotent 
                saveOffsets (Topic, consumerGroupID, offsetRanges, hbaseTableName, Time); 
                System.out.println ( "number of messages already processed article:" + COUNT); 
            } 
        });
 
        jssc.start();
        jssc.awaitTermination();
    }


    /**
     * Save the offsets of the current batch to HBase.
     * @param topic_name
     * @param group_id
     * @param offsetRanges
     * @param hbaseTableName
     * @param batchTime
     */
    private static void saveOffsets(String topic_name, String group_id, OffsetRange[] offsetRanges
            , String hbaseTableName, Time batchTime) throws IOException {

        Configuration hbaseConf = HBaseConfiguration.create();
        hbaseConf.set("hbase.zookeeper.property.clientPort","2181");
        hbaseConf.set("hbase.zookeeper.quorum","172.30.x.xx,172.30.x.xxx,172.30.x.xx");
        Connection connection = ConnectionFactory.createConnection(hbaseConf);
        Table table = connection.getTable(TableName.valueOf(hbaseTableName));
        String rowKey = topic_name+":"+group_id+":"+batchTime.milliseconds();
        Put put = new Put(rowKey.getBytes());

        for (OffsetRange offset : offsetRanges) {
            put.addColumn(Bytes.toBytes("offsets"),Bytes.toBytes(String.valueOf(offset.partition()))
                    ,Bytes.toBytes(String.valueOf(offset.untilOffset())));
        }
        table.put(put);
        connection.close();
    }

    /**
     * Read the last committed offsets for the given topic and consumer group from HBase,
     * using ZooKeeper to look up the topic's current number of partitions.
     * @param topic_name
     * @param group_id
     * @param hbaseTableName
     * @param zkQuorum
     * @param zkRootDir
     * @param sessionTimeOut
     * @param connectionTimeOut
     * @return the partition-to-offset map to start consuming from
     * @throws IOException
     */
    private static Map<TopicPartition, Long> getLastCommittedOffsets(String topic_name, String group_id
            , String hbaseTableName, String zkQuorum, String zkRootDir, int sessionTimeOut
            , int connectionTimeOut) throws IOException {
        Configuration hbaseConf = HBaseConfiguration.create();
        hbaseConf.set("hbase.zookeeper.property.clientPort","2181");
        hbaseConf.set("hbase.zookeeper.quorum","172.30.x.xx,172.30.x.xx,172.30.x.xx");

        String zkUrl = zkQuorum + "/" + zkRootDir;
        Tuple2<ZkClient, ZkConnection> zkClientAndConnection = ZkUtils.createZkClientAndConnection(zkUrl
                , sessionTimeOut, connectionTimeOut);

        ZkUtils zkUtils = new ZkUtils(zkClientAndConnection._1, zkClientAndConnection._2, false);
        Seq<String> seq_topic = JavaConverters.asScalaIteratorConverter(Collections.singletonList(topic_name)
                .iterator()).asScala().toSeq();
        int zKNumberOfPartitionsForTopic  = zkUtils.getPartitionsForTopics(seq_topic).get(topic_name)
                .toList().head().size();

        //Connect to HBase to retrieve last committed offsets
        Connection conn = ConnectionFactory.createConnection(hbaseConf);
        Table table = conn.getTable(TableName.valueOf(hbaseTableName));
        String startRow = topic_name + ":" + group_id + ":" + System.currentTimeMillis();
        String stopRow = topic_name + ":" + group_id + ":" + 0;
        Scan scan = new Scan();
        ResultScanner scanner = table.getScanner(scan.setStartRow(startRow.getBytes()).setStopRow(
                stopRow.getBytes()).setReversed(true));

        Result result = scanner.next();
        //Set the number of partitions discovered for a topic in HBase to 0
        int hbaseNumberOfPartitionsForTopic = 0;
        if (result != null){
            //If the result from hbase scanner is not null, set number of partitions from hbase to the number of cells
            hbaseNumberOfPartitionsForTopic = result.listCells().size();
        }


        Map<TopicPartition, Long> fromOffsets = new HashMap<>();
        if(hbaseNumberOfPartitionsForTopic == 0){
            // initialize fromOffsets to beginning
            for (int partition = 0; partition < zKNumberOfPartitionsForTopic; partition++) {
                fromOffsets.put(new TopicPartition(topic_name, partition), 0L);
            }
        } else if (zKNumberOfPartitionsForTopic > hbaseNumberOfPartitionsForTopic) {
            // Handle the scenario where new partitions have been added to an existing Kafka topic:
            // ZooKeeper now reports more partitions for the topic than were saved in HBase.
            for (int partition = 0; partition < hbaseNumberOfPartitionsForTopic; partition++) {
                // read the offset recorded in HBase for each existing partition
                String offset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"), Bytes.toBytes(
                        String.valueOf(partition))));
                fromOffsets.put(new TopicPartition(topic_name, partition), Long.valueOf(offset));
            }
            for (int partition = hbaseNumberOfPartitionsForTopic; partition < zKNumberOfPartitionsForTopic; partition++) {
                fromOffsets.put(new TopicPartition(topic_name, partition), 0L);
            }
        } else {
            // Initialize fromOffsets from the last run:
            // no new partitions, so after a restart read the offsets recorded in HBase.
            for (int partition = 0; partition < hbaseNumberOfPartitionsForTopic; partition++) {
                String offset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"), Bytes.toBytes(
                        String.valueOf(partition))));
                fromOffsets.put(new TopicPartition(topic_name, partition), Long.valueOf(offset));
                System.out.println("assembled fromOffset from the last run --topic_name:" + topic_name
                        + ",partition:" + partition + ",fromOffset:" + offset);
            }
        }
        scanner.close();
        conn.close();
        return fromOffsets;
    }

}
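The comment inside foreachRDD about results and offsets being saved separately is worth a concrete illustration. Below is a hypothetical sketch (not part of the original job; the table name stream_kafka_results and its column layout are assumptions made for illustration) of one way to make the result write idempotent: key the result row by the batch's offset range instead of the batch time, so a batch that is replayed after a failed offset save overwrites the same HBase row rather than producing a duplicate. It could be added as another private method of the class above.

    /**
     * Hypothetical helper: write a per-batch result keyed by the batch's offset range,
     * so replaying the same offsets is idempotent. Table and column names are assumptions.
     */
    private static void saveBatchResultIdempotently(String topicName, String groupId,
            OffsetRange[] offsetRanges, long count, Connection connection) throws IOException {
        Table table = connection.getTable(TableName.valueOf("stream_kafka_results")); // assumed table
        // The row key depends only on the offsets of the batch, not on wall-clock time,
        // so a replayed batch maps onto the same row.
        StringBuilder key = new StringBuilder(topicName).append(":").append(groupId);
        for (OffsetRange offset : offsetRanges) {
            key.append(":").append(offset.partition()).append("-").append(offset.fromOffset());
        }
        Put put = new Put(Bytes.toBytes(key.toString()));
        put.addColumn(Bytes.toBytes("results"), Bytes.toBytes("count"), Bytes.toBytes(String.valueOf(count)));
        table.put(put);
        table.close();
    }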

 

Official documentation on storing offsets: http://spark.apache.org/docs/latest/streaming-kafka-0-10-integration.html#storing-offsets
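That page also shows a lighter-weight alternative to an external store: letting Kafka itself keep the offsets by calling commitAsync once the outputs of a batch have completed. A minimal sketch against the inputDStream created above (this replaces the HBase bookkeeping rather than complementing it):

        inputDStream.foreachRDD(rdd -> {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();

            // ... output operations for this batch ...

            // Commits happen only after the batch's outputs, unlike enable.auto.commit,
            // but they are still not transactional with the results.
            ((CanCommitOffsets) inputDStream.inputDStream()).commitAsync(offsetRanges);
        });

Commits are asynchronous and Kafka offset storage is not transactional, so exactly-once delivery still requires idempotent outputs, just as with the HBase approach.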

Origin: www.cnblogs.com/zz-ksw/p/12521972.html