join
两个流之间的 join 需要两个流的批次大小一致,这样才能做到同时触发计算。计算过程就是对当前批次的两个流中各自的 RDD 进行 join,与两个 RDD 的 join 效果相同。
linux>nc -lk 1111
linux>telnet 192.168.58.200 1111
linux>nc -lk 2222
linux>telnet 192.168.58.200 2222
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{
Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{
DStream, ReceiverInputDStream}
object JoinTest {
def main(args: Array[String]): Unit = {
//1.创建 SparkConf
val sparkConf: SparkConf = new
SparkConf().setMaster("local[*]").setAppName("JoinTest")
//2.创建 StreamingContext
val ssc = new StreamingContext(sparkConf, Seconds(5))
//3.从端口获取数据创建流
val lineDStream1: ReceiverInputDStream[String] =
ssc.socketTextStream("192.168.58.200", 1111)
val lineDStream2: ReceiverInputDStream[String] =
ssc.socketTextStream("192.168.58.200", 2222)
//4.将两个流转换为 KV 类型
val wordToOneDStream: DStream[(String, Int)] = lineDStream1.flatMap(_.split(" ")).map((_, 1))
val wordToADStream: DStream[(String, String)] = lineDStream2.flatMap(_.split(" ")).map((_, "a"))
//5.流的 JOIN
val joinDStream: DStream[(String, (Int, String))] =
wordToOneDStream.join(wordToADStream)
//6.打印
joinDStream.print()
//7.启动任务
ssc.start()
ssc.awaitTermination()
}
}
优雅关闭
流式任务需要 7*24 小时执行,但是有时涉及到升级代码需要主动停止程序,但是分布式程序,没办法做到一个个进程去杀死,所有需要配置优雅的关闭。
使用外部文件系统来控制内部程序关闭。
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{
FileSystem, Path}
import org.apache.spark.streaming.{
StreamingContext, StreamingContextState}
class MonitorStop(ssc: StreamingContext) extends Runnable {
override def run(): Unit = {
val fs: FileSystem = FileSystem.get(new URI("hdfs://192.168.58.200:9000"), new
Configuration(), "atguigu")
while (true) {
try
Thread.sleep(5000)
catch {
case e: InterruptedException =>
e.printStackTrace()
}
val state: StreamingContextState = ssc.getState
val bool: Boolean = fs.exists(new Path("hdfs://192.168.58.200:9000/StopSpark"))
if (bool) {
if (state == StreamingContextState.ACTIVE) {
ssc.stop(stopSparkContext = true, stopGracefully = true)
System.exit(0)
}
}
}
}
}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{
DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{
Seconds, StreamingContext}
object SparkTest {
def createSSC(): _root_.org.apache.spark.streaming.StreamingContext = {
val update: (Seq[Int], Option[Int]) => Some[Int] = (values: Seq[Int], status:
Option[Int]) => {
//当前批次内容的计算
val sum: Int = values.sum
//取出状态信息中上一次状态
val lastStatu: Int = status.getOrElse(0)
Some(sum + lastStatu)
}
val sparkConf: SparkConf = new
SparkConf().setMaster("local[4]").setAppName("SparkTest")
//设置优雅的关闭
sparkConf.set("spark.streaming.stopGracefullyOnShutdown", "true")
val ssc = new StreamingContext(sparkConf, Seconds(5))
ssc.checkpoint("./ck")
val line: ReceiverInputDStream[String] = ssc.socketTextStream("192.168.58.200", 1111)
val word: DStream[String] = line.flatMap(_.split(" "))
val wordAndOne: DStream[(String, Int)] = word.map((_, 1))
val wordAndCount: DStream[(String, Int)] = wordAndOne.updateStateByKey(update)
wordAndCount.print()
ssc
}
def main(args: Array[String]): Unit = {
val ssc: StreamingContext = StreamingContext.getActiveOrCreate("./ck", () =>
createSSC())
new Thread(new MonitorStop(ssc)).start()
ssc.start()
ssc.awaitTermination()
}
}