Source code on GitHub: https://github.com/lidonglin-bit/Spark-Streaming
I. Preparing the Data
1. How the data is generated
Data is generated continuously in code and written to Kafka.
Spark Streaming then consumes the data from Kafka and analyzes it according to the requirements.
2. Data generation module
Format of the simulated data:
timestamp, area, city, user id, ad id
1566035129449,华南,深圳,101,2
- Step 1: Start the cluster (start ZooKeeper and Kafka).
- Step 2: Create the topic: create the topic ads_log1602 in Kafka (a sketch follows the dependency block below).
- Step 3: Continuously produce data into that topic.
Create a module named spark-realtime.
Add the dependency:
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.11.0.0</version>
</dependency>
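Step 2 is usually done with the kafka-topics.sh script that ships with Kafka. As an alternative, here is a minimal sketch that creates the topic from code using the kafka-clients dependency above; the broker addresses, the partition count (3) and the replication factor (2) are assumptions to adjust to your cluster, and the object name CreateTopic is just for illustration:

import java.util.{Collections, Properties}
import org.apache.kafka.clients.admin.{AdminClient, AdminClientConfig, NewTopic}

object CreateTopic {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    // assumed broker addresses, same as the producer used later
    props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "hadoop102:9092,hadoop103:9092,hadoop104:9092")
    val admin = AdminClient.create(props)
    // ads_log1602 with 3 partitions and replication factor 2 (both assumed values)
    admin.createTopics(Collections.singleton(new NewTopic("ads_log1602", 3, 2.toShort))).all().get()
    admin.close()
  }
}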
- 1. Utility class: RandomNumUtil
Used to generate random numbers.
import java.util.Random
import scala.collection.mutable
/**
 * Utility class for generating random integers
 */
object RandomNumUtil {
  val random = new Random()

  /**
   * Returns a random integer in [from, to]
   *
   * @param from
   * @param to
   * @return
   */
  def randomInt(from: Int, to: Int): Int = {
    if (from > to) throw new IllegalArgumentException(s"from = $from must not be greater than to = $to")
    // nextInt(to - from + 1) is in [0, to - from]; adding from shifts it to [from, to]
    random.nextInt(to - from + 1) + from
  }

  /**
   * Returns a random Long in [from, to]
   *
   * @param from
   * @param to
   * @return
   */
  def randomLong(from: Long, to: Long): Long = {
    if (from > to) throw new IllegalArgumentException(s"from = $from must not be greater than to = $to")
    random.nextLong().abs % (to - from + 1) + from
  }

  /**
   * Generates a list of random values
   *
   * @param from
   * @param to
   * @param count
   * @param canRepeat whether duplicate values are allowed
   */
  def randomMultiInt(from: Int, to: Int, count: Int, canRepeat: Boolean = true): List[Int] = {
    if (canRepeat) {
      (1 to count).map(_ => randomInt(from, to)).toList
    } else {
      // without repetition the range must contain at least `count` distinct values,
      // otherwise the loop below would never terminate
      require(to - from + 1 >= count, s"range [$from, $to] has fewer than $count distinct values")
      val set: mutable.Set[Int] = mutable.Set[Int]()
      while (set.size < count) {
        set += randomInt(from, to)
      }
      set.toList
    }
  }

  def main(args: Array[String]): Unit = {
    println(randomMultiInt(1, 15, 10))
    println(randomMultiInt(1, 15, 10, false))
  }
}
- 2. Utility class: RandomOptions
import scala.collection.mutable.ListBuffer
/**
 * Builds a RandomOptions object from the given values and their weights.
 * getRandomOption then returns one of the predefined values at random,
 * with a probability proportional to its weight.
 */
object RandomOptions {
  def apply[T](opts: (T, Int)*): RandomOptions[T] = {
    val randomOptions = new RandomOptions[T]()
    randomOptions.totalWeight = opts.foldLeft(0)(_ + _._2) // total weight of all options
    opts.foreach {
      case (value, weight) => randomOptions.options ++= (1 to weight).map(_ => value)
    }
    randomOptions
  }

  def main(args: Array[String]): Unit = {
    // quick test
    val opts = RandomOptions(("张三", 10), ("李四", 30), ("ww", 20))
    println(opts.getRandomOption())
    println(opts.getRandomOption())
  }
}

// e.g. engineer 10, programmer 10, teacher 20
class RandomOptions[T] {
  var totalWeight: Int = _
  var options = ListBuffer[T]()

  /**
   * Returns a random option, weighted as defined in apply
   *
   * @return
   */
  def getRandomOption() = {
    options(RandomNumUtil.randomInt(0, totalWeight - 1))
  }
}
- 3. Case class: CityInfo
/**
 * City information
 *
 * @param city_id   city id
 * @param city_name city name
 * @param area      the region the city belongs to
 */
case class CityInfo(city_id: Long,
                    city_name: String,
                    area: String)
- 4. Generating mock data: MockRealtime
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import scala.collection.mutable.ArrayBuffer

/**
 * Generates real-time mock data
 */
object MockRealtime {
  /*
  Data format:
  timestamp area city userid adid
  a point in time, an area, a city, a user, an ad
  */
  def mockRealTimeData(): ArrayBuffer[String] = {
    // holds the mock real-time records
    val array = ArrayBuffer[String]()
    // city info and the weight of each city
    val randomOpts = RandomOptions(
      (CityInfo(1, "北京", "华北"), 30),
      (CityInfo(2, "上海", "华东"), 30),
      (CityInfo(3, "广州", "华南"), 10),
      (CityInfo(4, "深圳", "华南"), 20),
      (CityInfo(5, "杭州", "华中"), 10))
    (1 to 50).foreach {
      i => {
        val timestamp = System.currentTimeMillis()
        val cityInfo = randomOpts.getRandomOption()
        val area = cityInfo.area
        val city = cityInfo.city_name
        val userid = RandomNumUtil.randomInt(100, 105)
        val adid = RandomNumUtil.randomInt(1, 5)
        array += s"$timestamp,$area,$city,$userid,$adid"
        Thread.sleep(10)
      }
    }
    array
  }

  def createKafkaProducer: KafkaProducer[String, String] = {
    val props: Properties = new Properties
    // Kafka broker host names and ports
    props.put("bootstrap.servers", "hadoop102:9092,hadoop103:9092,hadoop104:9092")
    // key serializer
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    // value serializer
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    new KafkaProducer[String, String](props)
  }

  def main(args: Array[String]): Unit = {
    val topic = "ads_log1602"
    val producer: KafkaProducer[String, String] = createKafkaProducer
    while (true) {
      mockRealTimeData().foreach {
        msg => {
          producer.send(new ProducerRecord(topic, msg))
          Thread.sleep(100)
        }
      }
      Thread.sleep(1000)
    }
  }
}
Start MockRealtime and verify that Kafka receives the data.
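One way to check is Kafka's console consumer; another is a small throwaway consumer, sketched below with the kafka-clients dependency already in the module. The broker addresses and the group id are assumptions, and the object name MockDataCheck is just for illustration:

import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer

import scala.collection.JavaConverters._

object MockDataCheck {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "hadoop102:9092,hadoop103:9092,hadoop104:9092")
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("group.id", "mock-data-check") // throwaway group id, used only for this check
    props.put("auto.offset.reset", "latest")
    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Collections.singletonList("ads_log1602"))
    while (true) {
      // poll(long) is the 0.11 API; it blocks for at most 100 ms
      val records = consumer.poll(100)
      records.asScala.foreach(r => println(r.value()))
    }
  }
}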
3. Reading data from Kafka (part 1)
For learning purposes, we use a different Spark Streaming and Kafka integration version here:
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.1.1</version>
</dependency>
</dependencies>
Create the util package.
If you are not sure how to use spark-streaming-kafka-0-10_2.11, see the official integration guide:
http://spark.apache.org/docs/2.4.4/streaming-kafka-0-10-integration.html
- MyKafkaUtils
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
object MyKafkaUtils {
  val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092",
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> "bigdata1602",
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (true: java.lang.Boolean)
  )

  /**
   * Returns a DStream read from Kafka, built from the given parameters
   *
   * @param ssc
   * @param topics
   * @return
   */
  def getkafkaStream(ssc: StreamingContext, topics: String*) = {
    KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent, // the standard location strategy
      Subscribe[String, String](topics.toIterable, kafkaParams)
    ).map(_.value())
  }
}
Create the app package.
- Create the App trait
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import util.MyKafkaUtils

trait App {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("App")
    val ssc = new StreamingContext(conf, Seconds(3))
    val sourceStream = MyKafkaUtils.getkafkaStream(ssc, "ads_log1602")
    sourceStream.print(1000)
    ssc.start()
    ssc.awaitTermination()
  }
}
- Create AreaTopApp
object AreaTopApp extends App {
}
- Test
Run AreaTopApp.
4. Reading data from Kafka (part 2)
Create the bean package.
Create AdsInfo:
import java.sql.Timestamp
import java.text.SimpleDateFormat
import java.util.Date
case class AdsInfo(ts: Long,
                   area: String,
                   city: String,
                   userId: String,
                   adsId: String,
                   var timestamp: Timestamp = null,
                   var dayString: String = null, // e.g. 2019-12-18
                   var hmString: String = null) { // e.g. 11:20
  timestamp = new Timestamp(ts)
  val date = new Date(ts)
  dayString = new SimpleDateFormat("yyyy-MM-dd").format(date)
  hmString = new SimpleDateFormat("HH:mm").format(date)
}
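A quick sanity check of the derived fields, using the sample record from the data-generation section. The object name AdsInfoTest is just for illustration, and the printed values assume the JVM default timezone is UTC+8 (Asia/Shanghai):

object AdsInfoTest {
  def main(args: Array[String]): Unit = {
    val info = AdsInfo(1566035129449L, "华南", "深圳", "101", "2")
    println(info.timestamp) // the raw Timestamp built from ts
    println(info.dayString) // 2019-08-17 with a UTC+8 default timezone
    println(info.hmString)  // 17:45 with a UTC+8 default timezone
  }
}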
- Completing the App trait in the app package
import bean.AdsInfo
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import util.MyKafkaUtils

trait App {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("App")
    val ssc = new StreamingContext(conf, Seconds(3))
    ssc.checkpoint("ck1602")
    val sourceStream = MyKafkaUtils.getkafkaStream(ssc, "ads_log1602")
    val adsInfoStream: DStream[AdsInfo] = sourceStream.map(s => {
      val split = s.split(",")
      AdsInfo(split(0).toLong, split(1), split(2), split(3), split(4))
    })
    doSomething(adsInfoStream)
    ssc.start()
    ssc.awaitTermination()
  }

  def doSomething(adsInfoStream: DStream[AdsInfo]): Unit
}
- Testing AreaTopApp
import bean.AdsInfo
import org.apache.spark.streaming.dstream.DStream
object AreaTopApp extends App {
  override def doSomething(adsInfoStream: DStream[AdsInfo]): Unit = {
    adsInfoStream.print(1000)
  }
}
- Result
II. Implementing the Requirements
1. Top 3 popular ads per area per day
import bean.AdsInfo
import org.apache.spark.streaming.dstream.DStream
object AreaTopApp extends App {
  override def doSomething(adsInfoStream: DStream[AdsInfo]): Unit = {
    val dayAreaGrouped = adsInfoStream.map(adsInfo => ((adsInfo.dayString, adsInfo.area, adsInfo.adsId), 1))
      // first compute the click count per day, per area, per ad
      .updateStateByKey((seq: Seq[Int], opt: Option[Int]) => {
        Some(seq.sum + opt.getOrElse(0))
      })
      // re-key by (day, area)
      .map {
        case ((day, area, ads), count) => ((day, area), (ads, count))
      }
      .groupByKey()

    // within each group, sort by count descending and take the top 3
    val result = dayAreaGrouped.map {
      case (key, it) =>
        (key, it.toList.sortBy(-_._2).take(3))
    }
    result.print()
  }
}
/*
Top 3 popular ads per area per day:
1. Compute the click count per day, per area, per ad:
   ((day, area, ads), 1) => updateStateByKey
2. Group by (day, area)
3. Sort within each group and take the top 3
*/
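The heart of step 1 is updateStateByKey, which keeps a running total per key across batches: the Seq holds the new values for a key in the current batch, and the Option holds the previous state. A minimal sketch of how the update function behaves on plain collections (the object name UpdateStateDemo is just for illustration):

object UpdateStateDemo {
  def main(args: Array[String]): Unit = {
    val update: (Seq[Int], Option[Int]) => Option[Int] =
      (seq, opt) => Some(seq.sum + opt.getOrElse(0))

    println(update(Seq(1, 1, 1), None))  // Some(3): first batch, no previous state
    println(update(Seq(1, 1), Some(3)))  // Some(5): the running total carries forward
    println(update(Seq(), Some(5)))      // Some(5): no new clicks, the state is kept
  }
}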
Result
2. Writing the data to Redis
Skeleton of the code that writes to Redis:
result.foreachRDD(rdd => {
  rdd.foreachPartition(it => {
    // 1. open a connection to Redis
    // 2. write the data to Redis
    // 3. close the connection to Redis
  })
})
Deciding how to store the data in Redis:
- Redis configuration
- Data types
- Basic operations
127.0.0.1:6379> keys *
(empty list or set)
127.0.0.1:6379> sadd "a" 97 98
(integer) 2
127.0.0.1:6379> keys *
1) "a"
127.0.0.1:6379> SMEMBERS a
1) "97"
2) "98"
127.0.0.1:6379> HSET "hash1" "dog" 100
(integer) 1
127.0.0.1:6379> HSET "hash1" "cat" 200
(integer) 1
127.0.0.1:6379> keys *
1) "hash1"
2) "a"
127.0.0.1:6379> HGET "hash1" dog
"100"
127.0.0.1:6379> HGETALL "hash1"
1) "dog"
2) "100"
3) "cat"
4) "200"
- Choosing the data type
----
((2020-09-07,华北),List((3,13), (2,8), (1,6)))
((2020-09-07,华南),List((5,14), (1,9), (4,6)))
((2020-09-07,华东),List((2,13), (1,10), (5,7)))
((2020-09-07,华中),List((1,6), (5,3), (3,2)))
----
Which data type to use: one key per day.

key                       value type
"area:ads:count" + day    hash
    field                 value
    area                  JSON string
    "华中"                {3:13, 2:8, 1:6}
Method 1: write the data to Redis
- Add the dependency
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.3</version>
</dependency>
- Create RedisUtil (a Redis client) in util
import redis.clients.jedis.{JedisPool, JedisPoolConfig}

object RedisUtil {
  private val conf = new JedisPoolConfig
  conf.setMaxTotal(100) // at most 100 connections in the pool
  conf.setMaxIdle(10) // at most 10 idle connections
  conf.setMinIdle(10) // at least 10 idle connections
  conf.setBlockWhenExhausted(true) // block and wait when the pool is exhausted
  conf.setMaxWaitMillis(10000) // wait at most 10 s
  conf.setTestOnBorrow(true)
  conf.setTestOnReturn(true)
  val pool = new JedisPool(conf, "hadoop102", 6379)

  def getClient = pool.getResource
}

/*
1. Create clients from a connection pool
2. Or create a client directly
*/
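A quick way to verify the pool works, assuming Redis is running on hadoop102:6379 as configured above (the object name RedisUtilTest is just for illustration):

object RedisUtilTest {
  def main(args: Array[String]): Unit = {
    val client = RedisUtil.getClient
    println(client.ping()) // expect "PONG"
    client.close() // returns the connection to the pool
  }
}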
- Completing AreaTopApp in the app package
import bean.AdsInfo
import org.apache.spark.streaming.dstream.DStream
import org.json4s.jackson.JsonMethods
import util.RedisUtil
object AreaTopApp extends App {
  override def doSomething(adsInfoStream: DStream[AdsInfo]): Unit = {
    val dayAreaGrouped = adsInfoStream.map(adsInfo => ((adsInfo.dayString, adsInfo.area, adsInfo.adsId), 1))
      // 1. first compute the click count per day, per area, per ad
      .updateStateByKey((seq: Seq[Int], opt: Option[Int]) => {
        Some(seq.sum + opt.getOrElse(0))
      })
      // 2. re-key by (day, area)
      .map {
        case ((day, area, ads), count) => ((day, area), (ads, count))
      }
      .groupByKey()

    // 3/4. within each group, sort by count descending and take the top 3
    val result = dayAreaGrouped.map {
      case (key, it) =>
        (key, it.toList.sortBy(-_._2).take(3))
    }

    // 5. write the result to Redis
    result.foreachRDD(rdd => {
      rdd.foreachPartition((it: Iterator[((String, String), List[(String, Int)])]) => {
        // 1. open a connection to Redis
        val client = RedisUtil.getClient
        // 2. write the data to Redis
        it.foreach {
          // ((2020-09-07,华北),List((3,13), (2,8), (1,6)))
          case ((day, area), adsCountList) =>
            val key = "area:ads:count" + day
            val field = area
            // convert the list to a JSON string with json4s
            // (JsonDSL works for collections of simple values; case classes need extra handling)
            import org.json4s.JsonDSL._
            val value = JsonMethods.compact(JsonMethods.render(adsCountList))
            client.hset(key, field, value)
        }
        // 3. close the connection to Redis
        client.close() // actually returns this client to the pool
      })
    })
  }
}
- Check the result in Redis
To display Chinese correctly, start the client with redis-cli --raw.
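Besides redis-cli, the hash can be read back from code; a small sketch using the RedisUtil pool from above (the date in the key is just an example, use the day the job actually ran, and the object name ReadBackFromRedis is just for illustration):

import scala.collection.JavaConverters._

object ReadBackFromRedis {
  def main(args: Array[String]): Unit = {
    val client = RedisUtil.getClient
    // hgetAll returns every field (area) and its JSON value for that day's key
    val areaToJson = client.hgetAll("area:ads:count" + "2020-09-07")
    areaToJson.asScala.foreach { case (area, json) => println(s"$area -> $json") }
    client.close()
  }
}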
Method 2: write the data to Redis (via an implicit conversion)
- Add the dependency
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.3</version>
</dependency>
- Create RealUtil in util
import org.apache.spark.streaming.dstream.DStream
import org.json4s.jackson.JsonMethods
object RealUtil {
  implicit class MyRedis(stream: DStream[((String, String), List[(String, Int)])]) {
    def saveToRedis = {
      stream.foreachRDD(rdd => {
        rdd.foreachPartition((it: Iterator[((String, String), List[(String, Int)])]) => {
          // 1. open a connection to Redis
          val client = RedisUtil.getClient
          // 2. write the data to Redis
          it.foreach {
            // ((2020-09-07,华北),List((3,13), (2,8), (1,6)))
            case ((day, area), adsCountList) =>
              val key = "area:ads:count" + day
              val field = area
              // convert the list to a JSON string with json4s
              // (JsonDSL works for collections of simple values; case classes need extra handling)
              import org.json4s.JsonDSL._
              val value = JsonMethods.compact(JsonMethods.render(adsCountList))
              client.hset(key, field, value)
          }
          // 3. close the connection to Redis
          client.close() // actually returns this client to the pool
        })
      })
    }
  }
}
- Completing AreaTopApp in the app package
import bean.AdsInfo
import org.apache.spark.streaming.dstream.DStream
import org.json4s.jackson.JsonMethods
import util.RedisUtil
object AreaTopApp extends App {
  override def doSomething(adsInfoStream: DStream[AdsInfo]): Unit = {
    val dayAreaGrouped = adsInfoStream.map(adsInfo => ((adsInfo.dayString, adsInfo.area, adsInfo.adsId), 1))
      // 1. first compute the click count per day, per area, per ad
      .updateStateByKey((seq: Seq[Int], opt: Option[Int]) => {
        Some(seq.sum + opt.getOrElse(0))
      })
      // 2. re-key by (day, area)
      .map {
        case ((day, area, ads), count) => ((day, area), (ads, count))
      }
      .groupByKey()

    // 3/4. within each group, sort by count descending and take the top 3
    val result: DStream[((String, String), List[(String, Int)])] = dayAreaGrouped.map {
      case (key, it) =>
        (key, it.toList.sortBy(-_._2).take(3))
    }

    // 5. write the result to Redis via the implicit saveToRedis method
    import util.RealUtil._
    result.saveToRedis
  }
}
- Open the Redis client
redis-cli --raw
Query a few times to see whether the values change.