Flink 基础示例 —— 窗口的处理

窗口流

WindowedStream 只是一个 API 层面的构造:在运行时,WindowedStream 会与 KeyedStream 以及作用在窗口上的操作合并为一个算子。

aggregate

用于按字段名或者按位置(元组下标)对窗口中的元素进行聚合

  /**
   * Applies the given aggregation to the field identified by name.
   *
   * The field name is resolved to a positional index through
   * `fieldNames2Indices` against the input type, and the call is then
   * delegated to the position-based `aggregate` overload.
   *
   * @param aggregationType the kind of aggregation to apply
   * @param field           name of the field to aggregate on
   * @return the stream returned by the position-based `aggregate`
   */
  private def aggregate(aggregationType: AggregationType, field: String): DataStream[T] = {
    // A single name is passed in, so only the first resolved index is used.
    val indices = fieldNames2Indices(getInputType(), Array(field))
    aggregate(aggregationType, indices(0))
  }

  /**
   * Applies the given aggregation to the field at `position`.
   *
   * `AggregationType.SUM` is served by a `SumAggregator`; every other
   * aggregation type is served by a `ComparableAggregator`. The chosen
   * aggregator is passed to `reduce` on the underlying Java stream.
   *
   * @param aggregationType the kind of aggregation to apply
   * @param position        index of the field to aggregate on
   * @return the reduced stream, cast back to `DataStream[T]`
   */
  def aggregate(aggregationType: AggregationType, position: Int): DataStream[T] = {
    // The aggregators are instantiated against Product, so the Java stream is
    // viewed as a stream of Product here and cast back to T at the very end.
    val productStream = javaStream.asInstanceOf[JavaWStream[Product, K, W]]
    val inputType = productStream.getInputType
    val config = productStream.getExecutionEnvironment.getConfig

    val reducer =
      if (aggregationType == AggregationType.SUM) {
        new SumAggregator(position, inputType, config)
      } else {
        // NOTE(review): the literal `true` is presumably ComparableAggregator's
        // "first" flag -- confirm against its constructor.
        new ComparableAggregator(position, inputType, aggregationType, true, config)
      }

    val reduced = new DataStream[Product](productStream.reduce(reducer))
    reduced.asInstanceOf[DataStream[T]]
  }

聚合函数抽象类

/**
 * Abstract base for aggregation functions. It declares {@link ReduceFunction}
 * as its interface and leaves the actual reduce logic to subclasses.
 *
 * @param <T> type of the elements being aggregated
 */
public abstract class AggregationFunction<T> implements ReduceFunction<T> {

	private static final long serialVersionUID = 1L;

	/** Aggregation types that can be used on a windowed stream or keyed stream. */
	public enum AggregationType {
		SUM,
		MIN,
		MAX,
		MINBY,
		MAXBY
	}
}

该抽象类的实现有 SumAggregator,ComparableAggregator

比如 SumAggregator:它继承了抽象类 AggregationFunction,从而间接实现了 ReduceFunction 接口

/**
 * An {@link AggregationFunction} that adds up one field of two elements.
 *
 * <p>The field is read and written through a {@link FieldAccessor}, and the
 * addition itself is delegated to a {@link SumFunction} chosen from the
 * field's type class. The first input is copied before the sum is written
 * into it, so {@code reduce} does not write into {@code value1} itself.
 *
 * @param <T> type of the elements being aggregated
 */
public class SumAggregator<T> extends AggregationFunction<T> {

	private static final long serialVersionUID = 1L;

	private final FieldAccessor<T, Object> fieldAccessor;
	private final SumFunction adder;
	private final TypeSerializer<T> serializer;
	private final boolean isTuple;

	/**
	 * Creates an aggregator for the field at the given position.
	 *
	 * @param pos      position of the field to sum
	 * @param typeInfo type information of the elements
	 * @param config   execution config used to create the accessor and serializer
	 */
	public SumAggregator(int pos, TypeInformation<T> typeInfo, ExecutionConfig config) {
		this.fieldAccessor = FieldAccessorFactory.getAccessor(typeInfo, pos, config);
		this.adder = SumFunction.getForClass(fieldAccessor.getFieldType().getTypeClass());
		this.isTuple = typeInfo instanceof TupleTypeInfo;
		// Tuples are copied via Tuple#copy(), so a serializer is only created for other types.
		this.serializer = isTuple ? null : typeInfo.createSerializer(config);
	}

	/**
	 * Creates an aggregator for the field with the given name.
	 *
	 * @param field    name of the field to sum
	 * @param typeInfo type information of the elements
	 * @param config   execution config used to create the accessor and serializer
	 */
	public SumAggregator(String field, TypeInformation<T> typeInfo, ExecutionConfig config) {
		this.fieldAccessor = FieldAccessorFactory.getAccessor(typeInfo, field, config);
		this.adder = SumFunction.getForClass(fieldAccessor.getFieldType().getTypeClass());
		this.isTuple = typeInfo instanceof TupleTypeInfo;
		// Tuples are copied via Tuple#copy(), so a serializer is only created for other types.
		this.serializer = isTuple ? null : typeInfo.createSerializer(config);
	}

	/**
	 * Returns a copy of {@code value1} whose aggregated field holds the sum of
	 * that field in {@code value1} and {@code value2}.
	 */
	@Override
	@SuppressWarnings("unchecked")
	public T reduce(T value1, T value2) throws Exception {
		// Copy first, then compute the sum, then write it into the copy.
		final T copy;
		if (isTuple) {
			Tuple tupleCopy = ((Tuple) value1).copy();
			copy = (T) tupleCopy;
		} else {
			copy = serializer.copy(value1);
		}
		return fieldAccessor.set(copy, adder.add(fieldAccessor.get(value1), fieldAccessor.get(value2)));
	}
}

常见派生方法

sum

/**
 * Applies the `AggregationType.SUM` aggregation to the field at `position`.
 *
 * @param position index of the field to sum
 * @return the stream returned by `aggregate`
 */
def sum(position: Int): DataStream[T] = {
  aggregate(AggregationType.SUM, position)
}

maxBy

/**
 * Applies the `AggregationType.MAXBY` aggregation to the field at `position`.
 *
 * @param position index of the field to compare on
 * @return the stream returned by `aggregate`
 */
def maxBy(position: Int): DataStream[T] = {
  aggregate(AggregationType.MAXBY, position)
}

示例代码片段

sum

// Word count per count window: split each line into lower-cased words, drop
// empty tokens, pair every word with 1, key by the word (tuple field 0) and
// sum the counts (tuple field 1).
val counts: DataStream[(String, Int)] = text
  .flatMap(line => line.toLowerCase().split("\\W+"))
  .filter(word => word.nonEmpty)
  .map(word => (word, 1))
  .keyBy(0)
  .countWindow(windowSize, slideSize)
  .sum(1)

maxBy

// Same pipeline as the sum example, but the window is aggregated with
// maxBy on tuple field 1 instead of sum.
val counts: DataStream[(String, Int)] = text
  .flatMap(line => line.toLowerCase().split("\\W+"))
  .filter(word => word.nonEmpty)
  .map(word => (word, 1))
  .keyBy(0)
  .countWindow(windowSize, slideSize)
  .maxBy(1)

reduce

与 aggregate 类似,reduce 也是对每个 key 在窗口内的元素集合分别应用给定的处理函数

/**
 * Reduces the stream with a Scala function.
 *
 * The function is passed through `clean`, wrapped in a
 * `ScalaReduceFunction`, and handed to the `reduce` overload that accepts
 * that wrapper.
 *
 * @param function the function combining two elements into one
 * @return the stream returned by the delegated `reduce` call
 * @throws NullPointerException if `function` is null
 */
def reduce(function: (T, T) => T): DataStream[T] = {
  if (function == null) {
    throw new NullPointerException("Reduce function must not be null.")
  }
  reduce(new ScalaReduceFunction[T](clean(function)))
}

ScalaReduceFunction 同样实现了 ReduceFunction 接口

/**
 * Adapter that exposes a plain Scala function `(T, T) => T` as a
 * `ReduceFunction[T]`.
 *
 * @param function the Scala function that `reduce` delegates to
 * @tparam T type of the elements being reduced
 */
final class ScalaReduceFunction[T](private[this] val function: (T, T) => T)
    extends ReduceFunction[T] {

  /** Delegates directly to the wrapped function. */
  @throws(classOf[Exception])
  override def reduce(a: T, b: T): T = function(a, b)
}

底层调用(Java API 中的实现)

// file: org.apache.flink.streaming.api.datastream.WindowedStream

/**
 * Builds the window operator for a reduce followed by a window function and
 * applies it to the input stream.
 *
 * <p>Without an evictor, the reduce function is handed to a
 * {@link ReducingStateDescriptor} and the window function is wrapped in an
 * {@link InternalSingleValueWindowFunction}. With an evictor, the window
 * contents are described by a {@link ListStateDescriptor} of stream records,
 * and both functions are combined in a {@link ReduceApplyWindowFunction}.
 *
 * @param reduceFunction the reduce function; must not be a {@link RichFunction}
 * @param function       the window function
 * @param resultType     type information of the result elements
 * @return the result of {@code input.transform(...)} for the built operator
 * @throws UnsupportedOperationException if {@code reduceFunction} is a {@link RichFunction}
 */
public <R> SingleOutputStreamOperator<R> reduce(
		ReduceFunction<T> reduceFunction,
		WindowFunction<T, R, K, W> function,
		TypeInformation<R> resultType) {

	if (reduceFunction instanceof RichFunction) {
		throw new UnsupportedOperationException("ReduceFunction of reduce can not be a RichFunction.");
	}

	// Clean the closures before the functions are used for naming and wiring.
	function = input.getExecutionEnvironment().clean(function);
	reduceFunction = input.getExecutionEnvironment().clean(reduceFunction);

	final String operatorName = generateOperatorName(windowAssigner, trigger, evictor, reduceFunction, function);
	final KeySelector<T, K> keySelector = input.getKeySelector();

	final OneInputStreamOperator<T, R> windowOperator;

	if (evictor == null) {
		// No evictor: the state descriptor is built from the reduce function itself.
		ReducingStateDescriptor<T> reducingState = new ReducingStateDescriptor<>(
			"window-contents",
			reduceFunction,
			input.getType().createSerializer(getExecutionEnvironment().getConfig()));

		windowOperator = new WindowOperator<>(
			windowAssigner,
			windowAssigner.getWindowSerializer(getExecutionEnvironment().getConfig()),
			keySelector,
			input.getKeyType().createSerializer(getExecutionEnvironment().getConfig()),
			reducingState,
			new InternalSingleValueWindowFunction<>(function),
			trigger,
			allowedLateness,
			lateDataOutputTag);
	} else {
		// Evictor present: the state is a list of stream records, and the reduce
		// function is applied through ReduceApplyWindowFunction.
		@SuppressWarnings({"unchecked", "rawtypes"})
		TypeSerializer<StreamRecord<T>> recordSerializer =
			(TypeSerializer<StreamRecord<T>>) new StreamElementSerializer(input.getType().createSerializer(getExecutionEnvironment().getConfig()));

		ListStateDescriptor<StreamRecord<T>> listState =
			new ListStateDescriptor<>("window-contents", recordSerializer);

		windowOperator = new EvictingWindowOperator<>(
			windowAssigner,
			windowAssigner.getWindowSerializer(getExecutionEnvironment().getConfig()),
			keySelector,
			input.getKeyType().createSerializer(getExecutionEnvironment().getConfig()),
			listState,
			new InternalIterableWindowFunction<>(new ReduceApplyWindowFunction<>(reduceFunction, function)),
			trigger,
			evictor,
			allowedLateness,
			lateDataOutputTag);
	}

	return input.transform(operatorName, resultType, windowOperator);
}

示例代码片段

// Key by tuple field 0, open a time window of 2500 ms with a 500 ms slide,
// keep the key and add up field 1, then send the result to a sink.
val windowLength = Time.of(2500, TimeUnit.MILLISECONDS)
val slideInterval = Time.of(500, TimeUnit.MILLISECONDS)

stream
  .keyBy(0)
  .timeWindow(windowLength, slideInterval)
  .reduce((left, right) => (left._1, left._2 + right._2))
  .addSink(new SinkFunction[(Long, Long)] {})

process

(待补充)

apply

(待补充)

猜你喜欢

转载自www.cnblogs.com/lemos/p/12640664.html