import org.apache.flink.api.common.functions.ReduceFunction
import org.apache.flink.api.java.functions.KeySelector
import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, createTypeInformation}

/**
 * Demonstrates Flink keyed aggregation: a rolling reduce that counts clicks
 * per user, followed by a second reduce that keeps the most active user seen
 * so far and prints it as the stream evolves.
 *
 * DATE: 2022/10/4 13:22
 * AUTHOR: GX
 */
object TransformationAggTest {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val clicks = env.addSource(new ClickSource)

    // 1.1 Records with the same key always land in the same partition; records
    //     with different keys may share a partition.
    //     keyBy: DataStream -> KeyedStream. Keyed aggregation is stateful, so it
    //     should only be used on streams with a bounded key space — an unbounded
    //     number of keys would keep consuming state/memory.
    // clicks.keyBy(_.user)
    //   .maxBy("timestamp")
    //   .print()

    // 1.2 Same as 1.1, but with an explicit KeySelector implementation.
    // clicks.keyBy(new MyKeySelector)
    //   .maxBy("timestamp")
    //   .print()

    // 2. Reduce aggregation: find the currently most active user.
    //    Step 1 — rolling per-user click counts, keyed by user name.
    val perUserCounts = clicks
      .map(event => (event.user, 1L))
      .keyBy(_._1)
      .reduce(new MyReduce)

    //    Step 2 — route every record to a single group via a constant key, then
    //    keep whichever (user, count) pair has the highest count so far.
    perUserCounts
      .keyBy(_ => true)
      // .maxBy("_2")
      .reduce((best, candidate) => if (candidate._2 >= best._2) candidate else best)
      .print()

    env.execute()
  }

  /** Extracts the user name of an Event as its key. */
  class MyKeySelector extends KeySelector[Event, String] {
    override def getKey(value: Event): String = value.user
  }

  /** Sums the counts of two (user, count) pairs, keeping the first pair's user. */
  class MyReduce extends ReduceFunction[(String, Long)] {
    override def reduce(value1: (String, Long), value2: (String, Long)): (String, Long) =
      (value1._1, value1._2 + value2._2)
  }
}
Aggregation transformation operators (reduce aggregation)
Related reading
Origin blog.csdn.net/GX_0824/article/details/127161921
Ranking