Aggregation transformation operators: keyBy and reduce

import org.apache.flink.api.common.functions.ReduceFunction
import org.apache.flink.api.java.functions.KeySelector
import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, createTypeInformation}

/**
 * DATE:2022/10/4 13:22
 * AUTHOR:GX
 */
object TransformationAggTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream = env.addSource(new ClickSource)
    // 1.1 Records sharing a key always land in the same partition; distinct keys
    //     may still end up in the same partition.
    // keyBy turns a DataStream into a KeyedStream, enabling stateful processing.
    // Aggregation operators should only run on streams with a bounded key space:
    // an unbounded number of keys would keep consuming state memory.
    //    stream.keyBy(_.user)
    //      .maxBy("timestamp")
    //      .print()
    // 1.2 Same idea, but with an explicit KeySelector implementation.
    //    stream.keyBy(new MyKeySelector)
    //      .maxBy("timestamp")
    //      .print()
    // 2. reduce aggregation: find the currently most active user.
    stream
      .map(event => (event.user, 1L)) // one (user, 1) pair per click
      .keyBy(_._1)                    // group by user name
      .reduce(new MyReduce)           // running click count per user
      .keyBy(_ => true)               // funnel all records into a single group
      //      .maxBy("_2")
      .reduce((best, current) => if (current._2 >= best._2) current else best) // keep the larger count
      .print()

    env.execute()
  }

  /** Extracts the user name of an Event as its grouping key. */
  class MyKeySelector extends KeySelector[Event, String] {
    override def getKey(value: Event): String = value.user
  }

  /** Sums per-user counts: keeps the first key and adds both counters. */
  class MyReduce extends ReduceFunction[(String, Long)] {
    override def reduce(value1: (String, Long), value2: (String, Long)): (String, Long) = {
      val (user, count) = value1
      (user, count + value2._2)
    }
  }
}

Source: blog.csdn.net/GX_0824/article/details/127161921