Flink: computing real-time hot items (Top N)

Wrap the incoming data in a case class, open a sliding window, pre-aggregate and cache each item's count in state, emit the counts when a window closes, then key the results by window end time and compute the Top N.
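The Kafka topic hot_Item is expected to carry comma-separated records in the order userId,itemId,categoryId,behavior,timestamp (timestamp in seconds), matching the parsing code below; an illustrative record (values made up) could be:

543462,1715,1464116,pv,1511658000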

package com.yan



import java.util.Properties

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer

// case class for the raw input records
case class UserBehavior(userId: Long, itemId: Long, categoryId: Long, behavior: String, timestamp: Long)
// case class for the per-window aggregation result (timestamp is the window end time)
case class itemViewCount(itemId: Long, timestamp: Long, count: Long)

object hot_item {

  def main(args: Array[String]): Unit = {

    // todo 1. set up the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    // todo 2. source: read from Kafka
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "node01:9092")
    properties.setProperty("group.id", "consumer-group")
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("auto.offset.reset", "latest")

    val dataStream = env.addSource(new FlinkKafkaConsumer[String]("hot_Item", new SimpleStringSchema(), properties)).map(data => {

      val dataArray = data.split(",")
      // (userId: Long, itemId: Long, categoryId: Long, behavior: String, timestamp: Long)
      // wrap the fields into a UserBehavior instance
      UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toLong, dataArray(3).trim, dataArray(4).trim.toLong)
    })
      .assignAscendingTimestamps(_.timestamp * 1000L)
    // todo 3. transform (process the data)
    // keep only "pv" (page view) events
    val processStream = dataStream.filter(_.behavior == "pv")
      // key by itemId
      .keyBy(_.itemId)
      // open a sliding window; the output type is itemViewCount
      .timeWindow(Time.hours(1), Time.minutes(5))
      // todo pre-aggregate inside the window
      .aggregate(new myAgg(), new windowResult())
      // key by window end time
      .keyBy(_.timestamp)
      // compute the top 3 items
      .process(new TopN_items(3))


    // todo 4. sink
    processStream.print()
    env.execute()


  }
}

// todo custom pre-aggregation function: counts events per item within the window
class myAgg() extends AggregateFunction[UserBehavior, Long, Long] {

  // increment the counter for every incoming element
  override def add(value: UserBehavior, accumulator: Long) = accumulator + 1L

  // initial value of the accumulator
  override def createAccumulator() = 0L

  // final output value
  override def getResult(accumulator: Long) = accumulator

  // merge two partial counts
  override def merge(a: Long, b: Long) = a + b
}

// aggregate function for computing an average (defined here but not used in the job above)
class avr_Agg() extends AggregateFunction[UserBehavior, (Long, Long), Double] {

  override def add(value: UserBehavior, accumulator: (Long, Long)) = (accumulator._1 + value.timestamp, accumulator._2 + 1)

  override def createAccumulator() = (0L, 0L)

  // convert to Double before dividing to avoid integer division
  override def getResult(accumulator: (Long, Long)) = accumulator._1.toDouble / accumulator._2

  override def merge(a: (Long, Long), b: (Long, Long)) = (a._1 + b._1, a._2 + b._2)
}
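// A minimal usage sketch (not part of the original job, illustrative only): avr_Agg could be
// plugged into the same windowed stream, producing one Double per item and window, e.g.
//   dataStream.filter(_.behavior == "pv")
//     .keyBy(_.itemId)
//     .timeWindow(Time.hours(1), Time.minutes(5))
//     .aggregate(new avr_Agg())   // yields DataStream[Double]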

// window function: combines the pre-aggregated count with the window information
class windowResult() extends WindowFunction[Long, itemViewCount, Long, TimeWindow] {

  override def apply(key: Long, window: TimeWindow, input: Iterable[Long], out: Collector[itemViewCount]): Unit = {

    // emit an itemViewCount: key is the itemId, window.getEnd is the window end time,
    // and the single pre-aggregated element is the count
    out.collect(itemViewCount(key, window.getEnd, input.iterator.next()))
  }
}

// todo compute the Top N
class TopN_items(topSize: Int) extends KeyedProcessFunction[Long, itemViewCount, String] {

  // list state holding all itemViewCount records of the current window
  private var itemState: ListState[itemViewCount] = _

  // initialize the state handle
  override def open(parameters: Configuration) = {

    itemState = getRuntimeContext.getListState(new ListStateDescriptor[itemViewCount]("itemState", classOf[itemViewCount]))
  }

  // buffer each element and register a timer that fires shortly after the window end
  override def processElement(value: itemViewCount, ctx: KeyedProcessFunction[Long, itemViewCount, String]#Context, out: Collector[String]) = {

    // store every record in the list state
    itemState.add(value)
    // register an event-time timer to emit the result
    ctx.timerService().registerEventTimeTimer(value.timestamp + 10)
  }

  // when the timer fires, sort the buffered counts and emit the Top N
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, itemViewCount, String]#OnTimerContext, out: Collector[String]) = {

    // copy all records out of the state into a local buffer
    val allItem: ListBuffer[itemViewCount] = new ListBuffer[itemViewCount]
    // todo the implicit conversion below is needed to iterate over the Java Iterable
    import scala.collection.JavaConversions._
    for (item <- itemState.get()) {
      allItem += item
    }
    // sort by count in descending order and keep the first topSize entries
    val sorted = allItem.sortBy(_.count)(Ordering.Long.reverse).take(topSize)
    // release the state
    itemState.clear()
    // format the result and emit it
    val result = new StringBuilder()
    result.append("time : ").append(timestamp - 10).append("\n")
    for (i <- sorted.indices) {
      val currentItem = sorted(i)
      result.append("NO: ").append(i + 1).append(" itemId: ").append(currentItem.itemId).append(" views: ").append(currentItem.count).append("\n")
    }
    result.append("======================")
    Thread.sleep(500)
    out.collect(result.toString())
  }
}
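With illustrative input, the printed result for one window would look roughly like the following (item IDs and counts are made up; the time value is the window end in milliseconds):

time : 1511661000000
NO: 1 itemId: 1715 views: 42
NO: 2 itemId: 3031 views: 31
NO: 3 itemId: 2338 views: 27
======================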


Reposted from blog.csdn.net/weixin_44429965/article/details/108041147