build.sbt
name := "scala-spark-streaming-app"
version := "1.0"
scalaVersion := "2.10.4"
libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.1.0"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.1.0"
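With this build file in place, each of the example programs below can be launched from the project root with sbt; since the project contains several main classes, a plain "sbt run" will prompt for which one to start. This assumes the standard sbt layout, with sources under src/main/scala and resources under src/main/resources.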
Message producer

import java.io.PrintWriter
import java.net.ServerSocket

import scala.util.Random

object StreamingProducer {

  def main(args: Array[String]) {
    val random = new Random()

    // Maximum number of events to generate per second
    val MaxEvents = 6

    // Read the list of possible user names
    val namesResource = this.getClass.getResourceAsStream("/names.csv")
    val names = scala.io.Source.fromInputStream(namesResource)
      .getLines()
      .toList
      .head
      .split(",")
      .toSeq

    // Generate a sequence of possible products
    val products = Seq(
      "iPhone Cover" -> 9.99,
      "Headphones" -> 5.49,
      "Samsung Galaxy Cover" -> 8.95,
      "iPad Cover" -> 7.49
    )

    // Generate a number of random product events
    def generateProductEvents(n: Int) = {
      (1 to n).map { i =>
        val (product, price) = products(random.nextInt(products.size))
        val user = random.shuffle(names).head
        (user, product, price)
      }
    }

    // Create a network producer
    val listener = new ServerSocket(9999)
    println("Listening on port: 9999")

    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from: " + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream(), true)

          while (true) {
            Thread.sleep(1000)
            val num = random.nextInt(MaxEvents)
            val productEvents = generateProductEvents(num)
            productEvents.foreach { event =>
              out.write(event.productIterator.mkString(","))
              out.write("\n")
            }
            out.flush()
            println(s"Created $num events...")
          }
          socket.close()
        }
      }.start()
    }
  }
}
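The producer expects a names.csv resource on the classpath (with the standard sbt layout, under src/main/resources) whose first line is a comma-separated list of user names. To sanity-check the producer without Spark, a plain socket client is enough; the following is a minimal sketch (not part of the original listing, with a hypothetical ProducerCheck name), assuming the producer is already listening on port 9999:

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket

// Hypothetical helper, not from the original code: connect to the producer and echo events
object ProducerCheck {
  def main(args: Array[String]) {
    val socket = new Socket("localhost", 9999)
    val reader = new BufferedReader(new InputStreamReader(socket.getInputStream))
    // Print the first ten "user,product,price" records, then disconnect
    for (_ <- 1 to 10) println(reader.readLine())
    socket.close()
  }
}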
// Create a simple streaming program that prints each batch of the raw event stream
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SimpleStreamingApp {

  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
    val stream = ssc.socketTextStream("localhost", 9999)
    stream.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
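With the producer running in one terminal and SimpleStreamingApp in another, print() emits the first ten elements of each 10-second batch, so the output is simply the raw "user,product,price" lines as they arrive.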
// A more complex streaming app that computes metrics on each batch of the DStream and prints the results
import java.text.SimpleDateFormat
import java.util.Date

import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingAnalyticsApp {

  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
    val stream = ssc.socketTextStream("localhost", 9999)

    // Create a stream of (user, product, price) events from the raw text elements
    val events = stream.map { record =>
      val event = record.split(",")
      (event(0), event(1), event(2))
    }

    // Use foreachRDD to apply an arbitrary processing function to each RDD in the stream
    events.foreachRDD { (rdd, time) =>
      val numPurchases = rdd.count()
      val uniqueUsers = rdd.map { case (user, _, _) => user }.distinct().count()
      val totalRevenue = rdd.map { case (_, _, price) => price.toDouble }.sum()
      val productsByPopularity = rdd
        .map { case (user, product, price) => (product, 1) }
        .reduceByKey(_ + _)
        .collect()
        .sortBy(-_._2)
      val mostPopular = productsByPopularity(0)

      val formatter = new SimpleDateFormat
      val dateStr = formatter.format(new Date(time.milliseconds))
      println(s"== Batch start time: $dateStr ==")
      println("Total purchases: " + numPurchases)
      println("Unique users: " + uniqueUsers)
      println("Total revenue: " + totalRevenue)
      println("Most popular product: %s with %d purchases".format(mostPopular._1, mostPopular._2))
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
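Note that the function passed to foreachRDD executes on the driver once per batch: count, distinct().count() and sum() are Spark actions, and collect() pulls the per-product counts back to the driver before sorting. That is fine for small per-batch aggregates like these; also, because productsByPopularity(0) indexes the array directly, an empty batch would throw here, so a hardened version would guard against empty RDDs.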
// Stateful streaming computation
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingStateApp {
  import org.apache.spark.streaming.StreamingContext._

  // Fold the current batch's (product, price) events for a user into the
  // running (number of purchases, total revenue) state
  def updateState(prices: Seq[(String, Double)], currentTotal: Option[(Int, Double)]) = {
    val currentRevenue = prices.map(_._2).sum
    val currentNumberPurchases = prices.size
    val state = currentTotal.getOrElse((0, 0.0))
    Some((currentNumberPurchases + state._1, currentRevenue + state._2))
  }

  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))

    // Stateful operations require a checkpoint directory to be set
    ssc.checkpoint("/tmp/sparkstreaming/")
    val stream = ssc.socketTextStream("localhost", 9999)

    // Create a stream of events from the raw text elements
    val events = stream.map { record =>
      val event = record.split(",")
      (event(0), event(1), event(2).toDouble)
    }
    val users = events.map { case (user, product, price) => (user, (product, price)) }
    val revenuePerUser = users.updateStateByKey(updateState)
    revenuePerUser.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
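updateStateByKey works on a DStream of key-value pairs, which is why the events are first re-keyed by user, and it needs the ssc.checkpoint call above because the state is carried forward from batch to batch. Each interval, revenuePerUser.print() shows records of the form (user, (totalPurchases, totalRevenue)) accumulated over the lifetime of the application, not just the current batch.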