Windowed stream
WindowedStream is purely an API-level construct; at runtime the WindowedStream is collapsed together with the KeyedStream and the operation over the window into one single operation.
aggregate
Aggregates the windowed (keyed) stream by a field name or by a position (for tuples).
/**
 * Aggregates the window contents on the field with the given name.
 *
 * The name is translated to a positional index via `fieldNames2Indices` and the
 * call is forwarded to the positional overload.
 *
 * @param aggregationType the kind of aggregation to apply
 * @param field           name of the field to aggregate on
 * @return the stream produced by the positional `aggregate` overload
 */
private def aggregate(aggregationType: AggregationType, field: String): DataStream[T] = {
  val indices = fieldNames2Indices(getInputType(), Array(field))
  aggregate(aggregationType, indices(0))
}

/**
 * Aggregates the window contents on the field at the given position.
 *
 * `SUM` is served by a `SumAggregator`; every other aggregation type is served by a
 * `ComparableAggregator`. The chosen aggregator is applied through the Java stream's
 * `reduce`.
 *
 * @param aggregationType the kind of aggregation to apply
 * @param position        index of the field to aggregate on
 * @return the reduced stream, viewed as a `DataStream[T]`
 */
def aggregate(aggregationType: AggregationType, position: Int): DataStream[T] = {
  // The aggregators are instantiated for Product, so the Java stream is viewed as one.
  val windowed = javaStream.asInstanceOf[JavaWStream[Product, K, W]]
  val inputType = windowed.getInputType
  val config = windowed.getExecutionEnvironment.getConfig

  val reducer =
    if (aggregationType == AggregationType.SUM) {
      new SumAggregator(position, inputType, config)
    } else {
      new ComparableAggregator(position, inputType, aggregationType, true, config)
    }

  val reduced = new DataStream[Product](windowed.reduce(reducer))
  reduced.asInstanceOf[DataStream[T]]
}
The abstract base class for aggregation functions
/**
 * Abstract base for aggregations expressed as a {@link ReduceFunction}.
 *
 * @param <T> type of the elements being reduced
 */
public abstract class AggregationFunction<T> implements ReduceFunction<T> {

    private static final long serialVersionUID = 1L;

    /**
     * Aggregation types that can be used on a windowed stream or keyed stream.
     */
    public enum AggregationType {
        SUM,
        MIN,
        MAX,
        MINBY,
        MAXBY
    }
}
The abstract class is implemented by SumAggregator and ComparableAggregator.
For example, SumAggregator implements the ReduceFunction interface (by inheriting from the abstract class AggregationFunction).
public class SumAggregator<T> extends AggregationFunction<T> { private static final long serialVersionUID = 1L; private final FieldAccessor<T, Object> fieldAccessor; private final SumFunction adder; private final TypeSerializer<T> serializer; private final boolean isTuple; public SumAggregator(int pos, TypeInformation<T> typeInfo, ExecutionConfig config) { fieldAccessor = FieldAccessorFactory.getAccessor(typeInfo, pos, config); adder = SumFunction.getForClass(fieldAccessor.getFieldType().getTypeClass()); if (typeInfo instanceof TupleTypeInfo) { isTuple = true; serializer = null; } else { isTuple = false; this.serializer = typeInfo.createSerializer(config); } } public SumAggregator(String field, TypeInformation<T> typeInfo, ExecutionConfig config) { fieldAccessor = FieldAccessorFactory.getAccessor(typeInfo, field, config); adder = SumFunction.getForClass(fieldAccessor.getFieldType().getTypeClass()); if (typeInfo instanceof TupleTypeInfo) { isTuple = true; serializer = null; } else { isTuple = false; this.serializer = typeInfo.createSerializer(config); } } @Override @SuppressWarnings("unchecked") public T reduce(T value1, T value2) throws Exception { if (isTuple) { Tuple result = ((Tuple) value1).copy(); return fieldAccessor.set((T) result, adder.add(fieldAccessor.get(value1), fieldAccessor.get(value2))); } else { T result = serializer.copy(value1); return fieldAccessor.set(result, adder.add(fieldAccessor.get(value1), fieldAccessor.get(value2))); } } }
Commonly used derived methods
sum
/**
 * Sums the window elements on the field at the given position.
 *
 * Delegates to `aggregate` with `AggregationType.SUM`.
 *
 * @param position index of the field to sum
 * @return the stream returned by `aggregate`
 */
def sum(position: Int): DataStream[T] = {
  aggregate(AggregationType.SUM, position)
}
maxBy
/**
 * Applies the `MAXBY` aggregation to the window elements on the field at the given position.
 *
 * Delegates to `aggregate` with `AggregationType.MAXBY`.
 *
 * @param position index of the field to compare on
 * @return the stream returned by `aggregate`
 */
def maxBy(position: Int): DataStream[T] = {
  aggregate(AggregationType.MAXBY, position)
}
Example code snippet
sum
// Lower-case each element, split it on non-word characters, drop empty tokens and
// pair every token with 1; then key by tuple field 0, window by count and sum field 1.
val counts: DataStream[(String, Int)] = text
  .flatMap(line => line.toLowerCase().split("\\W+"))
  .filter(word => word.nonEmpty)
  .map(word => (word, 1))
  .keyBy(0)
  .countWindow(windowSize, slideSize)
  .sum(1)
maxBy
// Lower-case each element, split it on non-word characters, drop empty tokens and
// pair every token with 1; then key by tuple field 0, window by count and apply maxBy on field 1.
val counts: DataStream[(String, Int)] = text
  .flatMap(line => line.toLowerCase().split("\\W+"))
  .filter(word => word.nonEmpty)
  .map(word => (word, 1))
  .keyBy(0)
  .countWindow(windowSize, slideSize)
  .maxBy(1)
reduce
Applies the given reduce function separately to the set of elements that share the same key.
/**
 * Reduces the window contents with the given Scala function.
 *
 * The function is passed through `clean`, wrapped in a `ScalaReduceFunction` and handed to
 * the `reduce` overload that accepts that wrapper.
 *
 * @param function the function combining two elements into one
 * @return the stream returned by the delegated `reduce` call
 * @throws NullPointerException if `function` is null
 */
def reduce(function: (T, T) => T): DataStream[T] = {
  if (function == null) {
    throw new NullPointerException("Reduce function must not be null.")
  }

  reduce(new ScalaReduceFunction[T](clean(function)))
}
ScalaReduceFunction also implements the ReduceFunction interface
/**
 * Adapts a plain Scala function `(T, T) => T` to the `ReduceFunction` interface.
 *
 * @param function the function invoked for every `reduce` call
 * @tparam T type of the elements being reduced
 */
final class ScalaReduceFunction[T](private[this] val function: (T, T) => T)
    extends ReduceFunction[T] {

  /** Forwards both arguments to the wrapped function and returns its result. */
  @throws(classOf[Exception])
  override def reduce(a: T, b: T): T = function.apply(a, b)
}
The underlying call in the Java source
// file: org.apache.flink.streaming.api.datastream.WindowedStream public <R> SingleOutputStreamOperator<R> reduce( ReduceFunction<T> reduceFunction, WindowFunction<T, R, K, W> function, TypeInformation<R> resultType) { if (reduceFunction instanceof RichFunction) { throw new UnsupportedOperationException("ReduceFunction of reduce can not be a RichFunction."); } //clean the closures function = input.getExecutionEnvironment().clean(function); reduceFunction = input.getExecutionEnvironment().clean(reduceFunction); final String opName = generateOperatorName(windowAssigner, trigger, evictor, reduceFunction, function); KeySelector<T, K> keySel = input.getKeySelector(); OneInputStreamOperator<T, R> operator; if (evictor != null) { @SuppressWarnings({"unchecked", "rawtypes"}) TypeSerializer<StreamRecord<T>> streamRecordSerializer = (TypeSerializer<StreamRecord<T>>) new StreamElementSerializer(input.getType().createSerializer(getExecutionEnvironment().getConfig())); ListStateDescriptor<StreamRecord<T>> stateDesc = new ListStateDescriptor<>("window-contents", streamRecordSerializer); operator = new EvictingWindowOperator<>(windowAssigner, windowAssigner.getWindowSerializer(getExecutionEnvironment().getConfig()), keySel, input.getKeyType().createSerializer(getExecutionEnvironment().getConfig()), stateDesc, new InternalIterableWindowFunction<>(new ReduceApplyWindowFunction<>(reduceFunction, function)), trigger, evictor, allowedLateness, lateDataOutputTag); } else { ReducingStateDescriptor<T> stateDesc = new ReducingStateDescriptor<>("window-contents", reduceFunction, input.getType().createSerializer(getExecutionEnvironment().getConfig())); operator = new WindowOperator<>(windowAssigner, windowAssigner.getWindowSerializer(getExecutionEnvironment().getConfig()), keySel, input.getKeyType().createSerializer(getExecutionEnvironment().getConfig()), stateDesc, new InternalSingleValueWindowFunction<>(function), trigger, allowedLateness, lateDataOutputTag); } return 
input.transform(opName, resultType, operator); }
Example code snippet
// Key by tuple field 0, apply a time window built from a 2500 ms and a 500 ms duration,
// keep the first field of the left element while adding up the second fields,
// and send the result to an empty sink.
stream
  .keyBy(0)
  .timeWindow(Time.of(2500, TimeUnit.MILLISECONDS), Time.of(500, TimeUnit.MILLISECONDS))
  .reduce((left, right) => (left._1, left._2 + right._2))
  .addSink(new SinkFunction[(Long, Long)] {})
process
233
apply
233