53. Input DStreams: Kafka as a real data source

First, the Receiver-based approach

1. Overview

The Receiver-based approach:

The Receiver is implemented with Kafka's high-level Consumer API. The data the receiver pulls from Kafka is stored in the memory of the Spark Executors,
and Spark Streaming then launches jobs to process that data.

However, with the default configuration this approach can lose data when a failure occurs at the underlying level. To guarantee zero data loss, you must enable Spark Streaming's
write-ahead log mechanism (Write Ahead Log, WAL). With WAL enabled, the received Kafka data is synchronously written to a write-ahead log on a distributed file system (such as HDFS). Therefore,
even if an underlying node fails, the data can be recovered from the write-ahead log.
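
As a rough sketch (assuming the same imports as the program in section 2 below; the checkpoint path is only a placeholder), the WAL could be enabled like this:

SparkConf conf = new SparkConf()
        .setMaster("local[2]")
        .setAppName("KafkaReceiverWordCount")
        // enable the write-ahead log for receiver-based input streams
        .set("spark.streaming.receiver.writeAheadLog.enable", "true");
JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
// the WAL is stored under the checkpoint directory, so a fault-tolerant
// checkpoint location (e.g. on HDFS) must also be configured
jssc.checkpoint("hdfs://namenode:9000/spark-streaming/checkpoint");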




How do you connect to a Kafka data source?

1. Add the dependency in Maven:
groupId = org.apache.spark
artifactId = spark-streaming-kafka_2.10
version = 1.5.1
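
Expressed as a pom.xml entry (a sketch, using exactly the coordinates above):

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka_2.10</artifactId>
    <version>1.5.1</version>
</dependency>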

2. Create the input DStream with the third-party utility class:
 JavaPairReceiverInputDStream<String, String> kafkaStream =
     KafkaUtils.createStream(streamingContext, [ZK quorum], [consumer group id], [per-topic number of Kafka partitions to consume]);




Points to note:

1. The topic partitions in Kafka have nothing to do with the RDD partitions in Spark. So increasing the per-topic partition count in KafkaUtils.createStream()
only increases the number of threads a single Receiver uses to read those partitions; it does not increase Spark's parallelism when processing the data.

2. Multiple Kafka input DStreams can be created, with different consumer groups and topics, to receive data in parallel through multiple receivers.

3. If the write-ahead log mechanism is enabled on a fault-tolerant file system such as HDFS, the received data is also copied into the write-ahead log. In that case,
set the persistence level in KafkaUtils.createStream() to StorageLevel.MEMORY_AND_DISK_SER (see the sketch after this list).
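
A sketch illustrating points 1 and 3, assuming the same ZK quorum, group id and topic as the program below, plus imports for org.apache.spark.storage.StorageLevel and JavaPairDStream (the repartition count of 4 is just an example):

Map<String, Integer> topicThreadMap = new HashMap<String, Integer>();
topicThreadMap.put("WordCount", 1);

// overload of createStream that takes an explicit persistence level;
// MEMORY_AND_DISK_SER avoids an extra in-memory copy when the WAL is enabled
JavaPairReceiverInputDStream<String, String> kafkaStream = KafkaUtils.createStream(
        jssc,
        "192.168.1.135:2181,192.168.1.136:2181,192.168.1.137:2181",
        "DefaultConsumerGroup",
        topicThreadMap,
        StorageLevel.MEMORY_AND_DISK_SER());

// per point 1, adding receiver threads does not raise processing parallelism;
// repartition the received stream if downstream stages need more partitions
JavaPairDStream<String, String> repartitioned = kafkaStream.repartition(4);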




Kafka commands:
bin/kafka-topics.sh --zookeeper 192.168.1.107:2181,192.168.1.108:2181,192.168.1.109:2181 --topic TestTopic --replication-factor 1 --partitions 1 --create

bin/kafka-console-producer.sh --broker-list 192.168.1.107:9092,192.168.1.108:9092,192.168.1.109:9092 --topic TestTopic



ZK quorum:
192.168.1.191:2181,192.168.1.192:2181,192.168.1.193:2181


2. Java version

package cn.spark.study.streaming;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import scala.Tuple2;

/**
 * Real-time wordcount program using the Kafka receiver-based approach
 * @author Administrator
 *
 */
public class KafkaReceiverWordCount {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setMaster("local[2]")
                .setAppName("KafkaWordCount");  
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
        
        // use KafkaUtils.createStream() to create the Kafka input data stream
        Map<String, Integer> topicThreadMap = new HashMap<String, Integer>();
        // number of threads used to pull data for the topic
        topicThreadMap.put("WordCount", 1);
        
        // four parameters are passed here; first: the StreamingContext;
        // second: the ZK quorum; third: the consumer group id (any name will do);
        // fourth: per-topic number of Kafka partitions to consume
        JavaPairReceiverInputDStream<String, String> lines = KafkaUtils.createStream(
                jssc,
                "192.168.1.135:2181,192.168.1.136:2181,192.168.1.137:2181", 
                "DefaultConsumerGroup", 
                topicThreadMap);
        
        // wordcount logic
        JavaDStream<String> words = lines.flatMap(
                
                new FlatMapFunction<Tuple2<String,String>, String>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Iterable<String> call(Tuple2<String, String> tuple)
                            throws Exception {
                        return Arrays.asList(tuple._2.split(" "));  
                    }
                    
                });
        
        JavaPairDStream<String, Integer> pairs = words.mapToPair(
                
                new PairFunction<String, String, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Integer> call(String word)
                            throws Exception {
                        return new Tuple2<String, Integer>(word, 1);
                    }
                    
                });
        
        JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(
                
                new Function2<Integer, Integer, Integer>() {
            
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Integer call(Integer v1, Integer v2) throws Exception {
                        return v1 + v2;
                    }
                    
                });
        
        wordCounts.print();  
        
        jssc.start();
        jssc.awaitTermination();
        jssc.close();
    }
    
}





## Run the program

## Create a new topic
[root@spark1 kafka]# bin/kafka-topics.sh --zookeeper 192.168.1.135:2181,192.168.1.136:2181,192.168.1.137:2181 --topic WordCount --replication-factor 1 --partitions 1 --create

## start a producer, type in some data, and watch the word counts printed on the program side
[root@spark1 kafka]# bin/kafka-console-producer.sh --broker-list 192.168.1.135:9092,192.168.1.136:9092,192.168.1.137:9092 --topic WordCount


Second, the Direct approach

1. Overview

This new, receiver-free direct approach was introduced in Spark 1.3 to provide a more robust mechanism. Instead of using a Receiver to receive the data, this approach periodically queries Kafka
for the latest offset of each topic+partition, and uses those offsets to define the offset range of each batch. When the job that processes the data starts, it uses Kafka's simple consumer API to fetch the data in the specified offset range.

This approach has the following advantages:

1. Simplified parallel reads: to read multiple partitions, you no longer need to create multiple input DStreams and union them. Spark creates as many RDD partitions as there are Kafka partitions and reads
the data from Kafka in parallel, so there is a one-to-one mapping between Kafka partitions and RDD partitions.

2. High performance: to guarantee zero data loss with the receiver-based approach, the WAL mechanism must be enabled. That is actually inefficient, because the data is copied twice: Kafka itself already has a highly reliable mechanism
and replicates the data, and it is copied again into the WAL. The direct approach does not rely on a Receiver and does not need the WAL; as long as Kafka keeps replicas of the data, it can be recovered from Kafka's replicas.

3. Exactly-once semantics:
    The receiver-based approach uses Kafka's high-level API to store the consumed offsets in ZooKeeper, which is the traditional way to consume Kafka data. Combined with the WAL mechanism it can achieve
    highly reliable, zero-data-loss delivery, but it cannot guarantee that the data is processed exactly once; it may be processed twice, because Spark and ZooKeeper may fall out of sync.
    The direct approach uses Kafka's simple API, and Spark Streaming itself tracks the consumed offsets and stores them in its checkpoints. Since Spark alone keeps them in sync, it can guarantee that the data
    is consumed once and only once.

 JavaPairInputDStream<String, String> directKafkaStream =
     KafkaUtils.createDirectStream(streamingContext,
         [key class], [value class], [key decoder class], [value decoder class],
         [map of Kafka parameters], [set of topics to consume]);
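
Because the direct approach stores the consumed offsets in Spark Streaming's own checkpoints, the checkpoint directory has to survive driver restarts for the offsets to be recovered. A minimal sketch (the HDFS path is a placeholder, and Function0 comes from org.apache.spark.api.java.function):

final String checkpointDir = "hdfs://namenode:9000/spark-streaming/direct-checkpoint";
// recreate the context (and its tracked offsets) from the checkpoint on restart,
// or build a fresh one on the first run
JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(
        checkpointDir,
        new Function0<JavaStreamingContext>() {
            @Override
            public JavaStreamingContext call() throws Exception {
                SparkConf conf = new SparkConf()
                        .setMaster("local[2]")
                        .setAppName("KafkaDirectWordCount");
                JavaStreamingContext newJssc =
                        new JavaStreamingContext(conf, Durations.seconds(5));
                newJssc.checkpoint(checkpointDir);
                // the KafkaUtils.createDirectStream(...) wiring from the program
                // below would be set up here before returning the context
                return newJssc;
            }
        });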



Kafka commands:
bin/kafka-topics.sh --zookeeper 192.168.1.107:2181,192.168.1.108:2181,192.168.1.109:2181 --topic TestTopic --replication-factor 1 --partitions 1 --create

bin/kafka-console-producer.sh --broker-list 192.168.1.107:9092,192.168.1.108:9092,192.168.1.109:9092 --topic TestTopic


ZK quorum: 192.168.1.191:2181,192.168.1.192:2181,192.168.1.193:2181

Kafka parameter: metadata.broker.list (the broker list passed to createDirectStream)

2. Java version

package cn.spark.study.streaming;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import kafka.serializer.StringDecoder;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import scala.Tuple2;

/**
 * Real-time wordcount program using the Kafka Direct approach
 * @author Administrator
 *
 */
public class KafkaDirectWordCount {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setMaster("local[2]") 
                .setAppName("KafkaDirectWordCount");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
        
        // first, create the Kafka parameters map
        Map<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("metadata.broker.list",
                "192.168.1.135:9092,192.168.1.136:9092,192.168.1.137:9092");
        
        // then, create a set holding the topics to read from;
        // multiple topics can be added here to read them in parallel
        Set<String> topics = new HashSet<String>();
        topics.add("WordCount");
        
        // create the input DStream
        JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(
                jssc, 
                String.class, 
                String.class, 
                StringDecoder.class, 
                StringDecoder.class, 
                kafkaParams, 
                topics);
        
        // run the wordcount operations
        JavaDStream<String> words = lines.flatMap(
                
                new FlatMapFunction<Tuple2<String,String>, String>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Iterable<String> call(Tuple2<String, String> tuple)
                            throws Exception {
                        return Arrays.asList(tuple._2.split(" "));  
                    }
                    
                });
        
        JavaPairDStream<String, Integer> pairs = words.mapToPair(
                
                new PairFunction<String, String, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Integer> call(String word) throws Exception {
                        return new Tuple2<String, Integer>(word, 1);
                    }
                    
                });
        
        JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(
                
                new Function2<Integer, Integer, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Integer call(Integer v1, Integer v2) throws Exception {
                        return v1 + v2;
                    }
                    
                });
        
        wordCounts.print();
        
        jssc.start();
        jssc.awaitTermination();
        jssc.close();
    }
    
}


Origin www.cnblogs.com/weiyiming007/p/11345788.html