build.sbt
name := "scala-spark-streaming-app"
version := "1.0"
scalaVersion := "2.10.4"
libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.1.0"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.1.0"
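With this build file in place, each of the example programs below can be launched from the project root with sbt; since the project contains several main classes, a plain "sbt run" will prompt for which one to start. This assumes the standard sbt layout, with sources under src/main/scala and resources under src/main/resources.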
Message producer

import java.io.PrintWriter
import java.net.ServerSocket

import scala.util.Random

object StreamingProducer {

  def main(args: Array[String]) {
    val random = new Random()

    // Maximum number of events to generate per second
    val MaxEvents = 6

    // Read the list of possible user names
    val namesResource = this.getClass.getResourceAsStream("/names.csv")
    val names = scala.io.Source.fromInputStream(namesResource)
      .getLines()
      .toList
      .head
      .split(",")
      .toSeq

    // Generate a sequence of possible products
    val products = Seq(
      "iPhone Cover" -> 9.99,
      "Headphones" -> 5.49,
      "Samsung Galaxy Cover" -> 8.95,
      "iPad Cover" -> 7.49
    )

    // Generate a number of random product events
    def generateProductEvents(n: Int) = {
      (1 to n).map { i =>
        val (product, price) = products(random.nextInt(products.size))
        val user = random.shuffle(names).head
        (user, product, price)
      }
    }

    // Create a network producer
    val listener = new ServerSocket(9999)
    println("Listening on port: 9999")

    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from: " + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream(), true)

          while (true) {
            Thread.sleep(1000)
            val num = random.nextInt(MaxEvents)
            val productEvents = generateProductEvents(num)
            productEvents.foreach { event =>
              out.write(event.productIterator.mkString(","))
              out.write("\n")
            }
            out.flush()
            println(s"Created $num events...")
          }
          socket.close()
        }
      }.start()
    }
  }
}
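The producer expects a names.csv resource on the classpath (with the standard sbt layout, under src/main/resources) whose first line is a comma-separated list of user names. To sanity-check the producer without Spark, a plain socket client is enough; the following is a minimal sketch (not part of the original listing, with a hypothetical ProducerCheck name), assuming the producer is already listening on port 9999:

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket

// Hypothetical helper, not from the original code: connect to the producer and echo events
object ProducerCheck {
  def main(args: Array[String]) {
    val socket = new Socket("localhost", 9999)
    val reader = new BufferedReader(new InputStreamReader(socket.getInputStream))
    // Print the first ten "user,product,price" records, then disconnect
    for (_ <- 1 to 10) println(reader.readLine())
    socket.close()
  }
}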
// Create a simple streaming program that prints each batch of the raw event stream
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SimpleStreamingApp {

  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
    val stream = ssc.socketTextStream("localhost", 9999)
    stream.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
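With the producer running in one terminal and SimpleStreamingApp in another, print() emits the first ten elements of each 10-second batch, so the output is simply the raw "user,product,price" lines as they arrive.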
// A more complex streaming app that computes metrics on each batch of the DStream and prints the results
import java.text.SimpleDateFormat
import java.util.Date

import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingAnalyticsApp {

  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
    val stream = ssc.socketTextStream("localhost", 9999)

    // Create a stream of (user, product, price) events from the raw text elements
    val events = stream.map { record =>
      val event = record.split(",")
      (event(0), event(1), event(2))
    }

    // Use foreachRDD to apply an arbitrary processing function to each RDD in the stream
    events.foreachRDD { (rdd, time) =>
      val numPurchases = rdd.count()
      val uniqueUsers = rdd.map { case (user, _, _) => user }.distinct().count()
      val totalRevenue = rdd.map { case (_, _, price) => price.toDouble }.sum()
      val productsByPopularity = rdd
        .map { case (user, product, price) => (product, 1) }
        .reduceByKey(_ + _)
        .collect()
        .sortBy(-_._2)
      val mostPopular = productsByPopularity(0)

      val formatter = new SimpleDateFormat
      val dateStr = formatter.format(new Date(time.milliseconds))
      println(s"== Batch start time: $dateStr ==")
      println("Total purchases: " + numPurchases)
      println("Unique users: " + uniqueUsers)
      println("Total revenue: " + totalRevenue)
      println("Most popular product: %s with %d purchases".format(mostPopular._1, mostPopular._2))
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
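Note that the function passed to foreachRDD executes on the driver once per batch: count, distinct().count() and sum() are Spark actions, and collect() pulls the per-product counts back to the driver before sorting. That is fine for small per-batch aggregates like these; also, because productsByPopularity(0) indexes the array directly, an empty batch would throw here, so a hardened version would guard against empty RDDs.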
// Stateful streaming computation
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingStateApp {
  import org.apache.spark.streaming.StreamingContext._

  // Fold the current batch's (product, price) events for a user into the
  // running (number of purchases, total revenue) state
  def updateState(prices: Seq[(String, Double)], currentTotal: Option[(Int, Double)]) = {
    val currentRevenue = prices.map(_._2).sum
    val currentNumberPurchases = prices.size
    val state = currentTotal.getOrElse((0, 0.0))
    Some((currentNumberPurchases + state._1, currentRevenue + state._2))
  }

  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))

    // Stateful operations require a checkpoint directory to be set
    ssc.checkpoint("/tmp/sparkstreaming/")
    val stream = ssc.socketTextStream("localhost", 9999)

    // Create a stream of events from the raw text elements
    val events = stream.map { record =>
      val event = record.split(",")
      (event(0), event(1), event(2).toDouble)
    }
    val users = events.map { case (user, product, price) => (user, (product, price)) }
    val revenuePerUser = users.updateStateByKey(updateState)
    revenuePerUser.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
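updateStateByKey works on a DStream of key-value pairs, which is why the events are first re-keyed by user, and it needs the ssc.checkpoint call above because the state is carried forward from batch to batch. Each interval, revenuePerUser.print() shows records of the form (user, (totalPurchases, totalRevenue)) accumulated over the lifetime of the application, not just the current batch.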