版权声明:未经同意,不得转载。 https://blog.csdn.net/qq_36235275/article/details/82501802
- 引入jar包依赖
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
- 编写scala
//Stream2Kafka
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
*
*/
/**
 * Spark Streaming job: consumes records from the Kafka topic "source" via a
 * direct stream, prefixes every value with "hehe--", and writes the result to
 * the topic "target" through a pooled KafkaProducer.
 *
 * Uses an explicit main() instead of the App trait, which has
 * initialization-order pitfalls for non-trivial entry points.
 */
object Stream2Kafka {

  def main(args: Array[String]): Unit = {
    // Local mode with 3 threads; 5-second micro-batches.
    val conf = new SparkConf().setAppName("kafka").setMaster("local[3]")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Input and output topics.
    val fromTopic = "source"
    val toTopic = "target"

    // Kafka broker address list, used for both consuming and producing.
    val brokers = "master:9092,slave1:9092,slave3:9092,slave2:9092"

    // Kafka consumer configuration.
    val kafkaParams = Map[String, Object](
      // Bootstrap servers for the initial connection to the cluster.
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
      // Key and value deserializer classes.
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      // Consumer group this consumer belongs to.
      ConsumerConfig.GROUP_ID_CONFIG -> "kafka",
      // When there is no committed offset (or it is out of range),
      // reset to the latest offset.
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest",
      // Do not auto-commit offsets in the background.
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
    )

    // Direct stream connected to Kafka. PreferConsistent spreads partitions
    // evenly across available executors; Subscribe uses a fixed topic set.
    val inputStream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(fromTopic), kafkaParams)
    )

    inputStream.map(record => s"hehe--${record.value}").foreachRDD { rdd =>
      // Write each partition back to Kafka through the shared producer pool.
      rdd.foreachPartition { items =>
        val kafkaProxyPool = KafkaPool(brokers)
        val kafkaProxy = kafkaProxyPool.borrowObject()
        try {
          for (item <- items) {
            kafkaProxy.kafkaClient.send(new ProducerRecord[String, String](toTopic, item))
          }
        } finally {
          // Always return the proxy, even if send() throws; otherwise the
          // pooled object leaks and the pool eventually starves.
          kafkaProxyPool.returnObject(kafkaProxy)
        }
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
//Kafka连接池
import org.apache.commons.pool2.impl.{DefaultPooledObject, GenericObjectPool}
import org.apache.commons.pool2.{BasePooledObjectFactory, PooledObject}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig}
import org.apache.kafka.common.serialization.StringSerializer
//因为要将Scala的集合类型转换成Java的
import scala.collection.JavaConversions._
/**
 * Thin wrapper around a KafkaProducer[String, String] so producer instances
 * can be managed by a commons-pool2 object pool.
 *
 * @param broker comma-separated Kafka bootstrap-server list
 */
class KafkaProxy(broker: String) {
  // Producer configuration. Kept as a public Scala Map for backward
  // compatibility with any existing callers that read it.
  val conf = Map(
    // Bootstrap servers for the initial connection to the cluster.
    ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> broker,
    // Key and value serializer classes.
    ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[StringSerializer],
    ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[StringSerializer]
  )

  // Copy the config into java.util.Properties explicitly instead of relying
  // on the deprecated scala.collection.JavaConversions implicit conversion.
  private val props = new java.util.Properties()
  conf.foreach { case (k, v) => props.put(k, v) }

  // The underlying producer; records are sent through this client.
  val kafkaClient = new KafkaProducer[String, String](props)

  /** Releases the underlying producer's sockets and buffers. */
  def close(): Unit = kafkaClient.close()
}
//创建一个创建KafkaProxy的工厂
/**
 * commons-pool2 factory that creates, wraps, and disposes of pooled
 * [[KafkaProxy]] instances.
 *
 * @param broker comma-separated Kafka bootstrap-server list passed to each proxy
 */
class KafkaProxyFactory(broker: String) extends BasePooledObjectFactory[KafkaProxy] {
  // Create a new proxy (and its underlying KafkaProducer) for the pool.
  override def create(): KafkaProxy = new KafkaProxy(broker)

  // Wrap the proxy in the pool's bookkeeping envelope.
  override def wrap(t: KafkaProxy): PooledObject[KafkaProxy] =
    new DefaultPooledObject[KafkaProxy](t)

  // Close the underlying producer when the pool destroys/evicts an instance;
  // without this override every discarded proxy leaks sockets and buffers.
  override def destroyObject(p: PooledObject[KafkaProxy]): Unit =
    p.getObject.kafkaClient.close()
}
/**
 * Per-JVM singleton pool of [[KafkaProxy]] producers, created lazily on first
 * use so each Spark executor builds its producers locally instead of trying to
 * serialize them from the driver.
 */
object KafkaPool {
  // Singleton pool instance; @volatile so the double-checked locking below is
  // safe when several partition tasks race on first use in one executor JVM.
  @volatile private var kafkaPool: GenericObjectPool[KafkaProxy] = null

  /**
   * Returns the shared pool, creating it on first call.
   *
   * NOTE(review): the broker list is only honoured by the very first caller;
   * subsequent calls reuse the existing pool regardless of the argument.
   *
   * @param broker comma-separated Kafka bootstrap-server list
   */
  def apply(broker: String): GenericObjectPool[KafkaProxy] = {
    if (kafkaPool == null) {
      synchronized {
        if (kafkaPool == null) {
          kafkaPool = new GenericObjectPool[KafkaProxy](new KafkaProxyFactory(broker))
        }
      }
    }
    kafkaPool
  }
}
- 启动zookeeper
zkServer.sh start
- 每个节点启动kafka
kafka-server-start.sh /opt/apps/Kafka/kafka_2.11-2.0.0/config/server.properties &
- 创建两个主题
[root@master ~]# kafka-topics.sh --create --zookeeper master:2181,slave1:2181,slave2:2181,slave3:2181,slave4:2181 --replication-factor 2 --partitions 2 --topic source
[root@master ~]# kafka-topics.sh --create --zookeeper master:2181,slave1:2181,slave2:2181,slave3:2181,slave4:2181 --replication-factor 2 --partitions 2 --topic target
- 启动producer 写入数据到source
[root@master ~]# kafka-console-producer.sh --broker-list master:9092,slave1:9092,slave2:9092,slave3:9092,slave4:9092 --topic source
- 启动consumer 监听target的数据
[root@master ~]# kafka-console-consumer.sh --bootstrap-server master:9092,slave1:9092,slave2:9092,slave3:9092,slave4:9092 --topic target
最终的流程是:先创建 source、target 两个主题;用 kafka-console-producer 向 source 写入数据;SparkStreaming 以直连方式(按 ConsumerConfig 配置反序列化)从 source 消费;对每条数据做处理后,通过连接池中按 ProducerConfig 配置的 KafkaProducer 序列化并写入 target;最后 kafka-console-consumer 监听 target,即可看到处理后的输出。