Flink: computing a real-time top-N of popular items

The raw events are wrapped into case classes, a sliding window is opened, and a pre-aggregate counts views per item. The per-window counts are keyed by window-end timestamp, buffered in keyed state, and — once an event-time timer fires — sorted by count to emit the top N items.

package com.yan



import java.util.Properties

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer

// Input record: one user-behavior event read from Kafka.
// NOTE(review): timestamp appears to be in seconds — the pipeline below
// multiplies it by 1000 before assigning event time; confirm with producer.
case class UserBehavior(userId: Long, itemId: Long, categoryId: Long, behavior: String, timestamp: Long)
// Aggregated result: the view count of one item within one window
// (timestamp holds the window-end time).
// NOTE(review): name should be UpperCamelCase (ItemViewCount), but renaming
// would touch every usage below; left as-is.
case class itemViewCount(itemId: Long, timestamp: Long, count: Long)

object hot_item {

  /**
   * Entry point: builds and runs the hot-items streaming job.
   *
   * Pipeline: Kafka source -> parse CSV into UserBehavior -> filter "pv"
   * events -> key by itemId -> 1h sliding window (5 min slide) with a
   * count pre-aggregate -> key by window end -> top-3 per window -> print.
   */
  def main(args: Array[String]): Unit = {

    // 1. Execution environment: parallelism 1 for deterministic console
    //    output, event-time semantics for the windows below.
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // 2. Kafka consumer configuration (source).
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "node01:9092")
    properties.setProperty("group.id", "consumer-group")
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("auto.offset.reset", "latest")

    val dataStream = env.addSource(new FlinkKafkaConsumer[String]("hot_Item", new SimpleStringSchema(), properties)).map(data => {
      // Each record is a CSV line:
      //   userId,itemId,categoryId,behavior,timestamp
      val dataArray = data.split(",")
      // FIX: the id fields are declared Long — parse with toLong instead of
      // toInt, which silently capped ids at Int range.
      UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toLong, dataArray(3).trim, dataArray(4).trim.toLong)
    })
      // Timestamps are in seconds; Flink expects milliseconds. Input is
      // assumed to be in ascending event-time order (no watermark lag).
      .assignAscendingTimestamps(_.timestamp * 1000L)

    // 3. Transform: keep only page-view ("pv") events.
    val processStream = dataStream.filter(_.behavior == "pv")
      // Key by item id so each item is counted independently.
      .keyBy(_.itemId)
      // 1-hour window sliding every 5 minutes.
      .timeWindow(Time.hours(1), Time.minutes(5))
      // Incremental pre-aggregate (count) + window function that attaches
      // the key and window-end time, producing itemViewCount records.
      .aggregate(new myAgg(), new windowResult())
      // Group all items of the same window together (key = window end).
      .keyBy(_.timestamp)
      // Emit the three most-viewed items of each window.
      .process(new TopN_items(3))

    // 4. Sink: print to stdout.
    processStream.print()
    env.execute()
  }
}

//todo  自定义预聚合
// Incremental pre-aggregate: counts the elements of a window per key.
// The accumulator is a plain Long counter, so Flink only keeps one number
// per in-flight window instead of buffering every element.
class myAgg() extends AggregateFunction[UserBehavior, Long, Long] {

  /** Counter starts at zero. */
  override def createAccumulator(): Long = 0L

  /** Every incoming element bumps the counter by one. */
  override def add(value: UserBehavior, accumulator: Long): Long = 1L + accumulator

  /** The window result is the final counter value. */
  override def getResult(accumulator: Long): Long = accumulator

  /** Two partial counts combine by addition (used for session/merging windows). */
  override def merge(a: Long, b: Long): Long = b + a
}

//统计平均值
// Computes the average event timestamp of the elements in a window.
// Accumulator: (sum of timestamps, element count).
class avr_Agg() extends AggregateFunction[UserBehavior, (Long, Long), Double] {

  /** Add one element: accumulate its timestamp and bump the count. */
  override def add(value: UserBehavior, accumulator: (Long, Long)): (Long, Long) =
    (accumulator._1 + value.timestamp, accumulator._2 + 1)

  /** Empty accumulator: zero sum, zero count. */
  override def createAccumulator(): (Long, Long) = (0L, 0L)

  // BUG FIX: the original computed Long / Long (integer division) and only
  // then widened the truncated quotient to Double, losing the fractional
  // part. Convert the numerator first. Also guard the empty-window case so
  // we never divide by zero.
  override def getResult(accumulator: (Long, Long)): Double =
    if (accumulator._2 == 0L) 0.0 else accumulator._1.toDouble / accumulator._2

  /** Merge two partial accumulators component-wise. */
  override def merge(a: (Long, Long), b: (Long, Long)): (Long, Long) =
    (a._1 + b._1, a._2 + b._2)
}

// Wraps the pre-aggregated count together with its key and the window-end
// timestamp into an itemViewCount record for the downstream top-N step.
class windowResult() extends WindowFunction[Long, itemViewCount, Long, TimeWindow] {

  override def apply(key: Long, window: TimeWindow, input: Iterable[Long], out: Collector[itemViewCount]): Unit = {
    // The upstream AggregateFunction emits exactly one value per window,
    // so the iterable contains a single pre-computed count.
    val count = input.iterator.next()
    out.collect(itemViewCount(itemId = key, timestamp = window.getEnd, count = count))
  }
}

//todo  求topN
// Per window (key = window-end timestamp), buffers every itemViewCount in
// keyed state and, once an event-time timer fires, emits the topSize items
// with the highest view counts as a formatted string.
class TopN_items(topSize: Int) extends KeyedProcessFunction[Long, itemViewCount, String] {

  // Keyed list state holding all counts of the current window until the
  // timer fires.
  private var itemState: ListState[itemViewCount] = _

  /** Acquire the state handle from the runtime context. */
  override def open(parameters: Configuration): Unit = {
    itemState = getRuntimeContext.getListState(
      new ListStateDescriptor[itemViewCount]("itemState", classOf[itemViewCount]))
  }

  /** Buffer the element and schedule the per-window output timer. */
  override def processElement(value: itemViewCount, ctx: KeyedProcessFunction[Long, itemViewCount, String]#Context, out: Collector[String]): Unit = {
    itemState.add(value)
    // Fire 10 ms after the window-end timestamp — by then all results for
    // this window have arrived. Re-registering the same timestamp for the
    // same key coalesces into a single timer.
    ctx.timerService().registerEventTimeTimer(value.timestamp + 10)
  }

  /** Timer callback: sort the buffered counts and emit the top N. */
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, itemViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
    // FIX: the original used the deprecated implicit
    // scala.collection.JavaConversions; use explicit JavaConverters/.asScala.
    import scala.collection.JavaConverters._

    // Drain the keyed state into a local buffer.
    val allItem: ListBuffer[itemViewCount] = new ListBuffer[itemViewCount]
    for (item <- itemState.get().asScala) {
      allItem += item
    }
    // Sort descending by count and keep the configured top N.
    val sorted = allItem.sortBy(_.count)(Ordering.Long.reverse).take(topSize)
    // Release the state — this window is done.
    itemState.clear()

    // Format the result. timestamp - 10 undoes the offset added when the
    // timer was registered, recovering the window-end time.
    val result = new StringBuilder()
    result.append("time : ").append(timestamp - 10).append("\n")
    for (i <- sorted.indices) {
      val currentItem = sorted(i)
      result.append("NO: ").append(i + 1).append(" 商品ID为").append(currentItem.itemId).append("浏览量为: ").append(currentItem.count).append("\n")
    }
    result.append("======================")
    // FIX: removed Thread.sleep(500) — blocking inside onTimer stalls the
    // task thread and backpressures the whole pipeline; pacing console
    // output is not this operator's job.
    out.collect(result.toString())
  }
}

Original source: blog.csdn.net/weixin_44429965/article/details/108041147