package kafka

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._

import scala.collection.mutable

// Prerequisites: 1. start ZooKeeper and Kafka. 2. start Kafka Connect (source side). 3. run this file.

/**
 * Word count over Kafka topics via the spark-streaming-kafka-0-10 direct stream.
 *
 * Usage: DirectKafka <brokers> <topics>
 *   - brokers: comma-separated list of Kafka bootstrap servers
 *   - topics:  comma-separated list of topics to consume from
 */
object DirectKafka {

  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println(
        s"""
           |Usage: DirectKafkaWordCount <brokers> <topics>
           | <brokers> is a list of one or more Kafka brokers
           | <topics> is a list of one or more kafka topics to consume from
           |
        """.stripMargin)
      System.exit(1)
    }

    val Array(brokers, topics) = args

    // `val`, not `var`: the conf is never reassigned.
    val conf = new SparkConf()
      .setAppName("DirectKafka")
      .setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(2))

    val topicsSet = topics.split(",").toSet

    // All four consumer parameters are required by the 0-10 connector; omitting any of
    // them fails at runtime. An immutable Map literal replaces the mutable HashMap +
    // put() sequence — `ConsumerStrategies.Subscribe` accepts any collection.Map.
    val kafkaParams = Map[String, String](
      "bootstrap.servers"  -> brokers,
      "group.id"           -> "group1",
      "key.deserializer"   -> "org.apache.kafka.common.serialization.StringDeserializer",
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
    )

    val messages = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams)
    )

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_.value)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation; awaitTermination blocks until the context is stopped.
    ssc.start()
    ssc.awaitTermination()
  }
}
The Scala-version suffix of the Maven artifact (e.g. `_2.12`) must match the Scala version your project is compiled with; otherwise the code fails at runtime with binary-incompatibility errors:
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10 --> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-streaming-kafka-0-10_2.12</artifactId> <version>2.4.0</version> </dependency>