● 将代码打成jar包上传至linux
package com.ws.saprk
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object StreamingTextFile {

  /**
   * Streaming word count: watches a directory and counts words in newly
   * written files, printing per-batch counts every 5 seconds.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StreamingTextFile")
    val ssc  = new StreamingContext(conf, Seconds(5))

    // Pitfall: this cannot be run locally on Windows. On Linux, only data
    // appended to the directory through a stream is picked up, e.g.:
    //   echo xxxxx >> /root/test/game.log
    // Files already present in the directory before the job starts are
    // ignored — only newly streamed data is recognized!
    val lines: DStream[String] = ssc.textFileStream("/root/test/")

    // Split each line on spaces, then count occurrences of each word
    // within the current micro-batch.
    val wordCounts = lines
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)

    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
● spark-submit 运行jar包
#注意:--master 这里如果使用主机名简写(qjw-01)会有解析问题,建议直接使用完整 IP 地址
[root@qjw-01 spark-2.1.3]# ./bin/spark-submit --master spark://192.168.0.21:7077 --class com.ws.saprk.StreamingTextFile /root/ws.jar
● 写入数据
[root@qjw-01 ~]# echo 1 2 3 4 5 6 7 8 9 1 2 43 5 6 5 >> /root/test/i.log
● 结果
-------------------------------------------
Time: 1539098465000 ms
-------------------------------------------
-------------------------------------------
Time: 1539098470000 ms
-------------------------------------------
(4,1)
(8,1)
(6,2)
(2,2)
(7,1)
(5,3)
(9,1)
(3,1)
(1,2)
(43,1)
-------------------------------------------