编写流计算程序

#!/usr/bin/env python3

from__future__import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if__name__=“main”:
if len(sys.argv)!=3:
print(“Usage:KafkaWordCount.py”,file=sys.stderror)
exit(-1)
sc = SparkContext(appName=“PythonStreamingKafkaWordCount”)
ssc=StreamingContext(sc,1)
zkQuorum, topic =sys.argv[1:]#这是运行linux传进来的参数,第一个默认是self,第二个是zookeeper地址,第三个是topic名称
kvs = KafkaUtils.createStream(ssc,zkQuorum,“spark-streaming-consumer”,{topic,1})#构建输入源,第三个参数是消费者的族group,最后是topic的名称和分区的个数
lines =kvs.map(lambda x:x[1])
counts = lines.flatMap(lambda x: x.split(" ")).map(lambda y:(y,1)).reduceByKey(lambda a,b:a+b)
counts.pprint()
ssc.start()
ssc.awaitTermination()

新建终端来进行流计算
第一步进入上述代码目录
cd /usr/local/spark/mycode/streaming/kafka
/usr/local/spark/bin/spark-submit KafkaWordCount.py localhost:2181 wordsendertest  # 第一个参数是zookeeper服务器地址,第二个是订阅的主题的名称
然后在数据源终端敲入单词,就会统计出结果了

发布了25 篇原创文章 · 获赞 0 · 访问量 370

猜你喜欢

转载自blog.csdn.net/qq_45371603/article/details/104653591
今日推荐