Flink Learning (2): Flink Programming Basic API

1. Basic dependencies

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.14.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.12</artifactId>
            <version>1.14.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.12</artifactId>
            <version>1.14.4</version>
        </dependency>

If you want to use the Scala API, replace:

  • flink-java with flink-scala_2.12
  • flink-streaming-java_2.12 with flink-streaming-scala_2.12 (see the snippet below)
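
For reference, a sketch of the Scala-API dependencies, keeping the same 1.14.4 version used above:

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.12</artifactId>
            <version>1.14.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.12</artifactId>
            <version>1.14.4</version>
        </dependency>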

2. Programming abstraction: Flink DataStream

  • A DataStream represents a data stream, which can be bounded or unbounded
  • A DataStream is similar to a Java collection, but it is immutable: data cannot be added to or removed from it
  • The data in a DataStream can only be transformed by operators, which turn one DataStream into another DataStream
  • A DataStream is obtained either from a source operator or by transforming an existing DataStream, as the short sketch below illustrates
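
A minimal sketch of that last point (the element values are made up for illustration); every operator call returns a new DataStream instead of modifying the one it is called on:

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// a DataStream created by a source operator
DataStreamSource<String> source = env.fromElements("a", "b", "c");
// a transformation produces another DataStream; `source` itself is unchanged
SingleOutputStreamOperator<String> upper = source.map(String::toUpperCase);
upper.print();
env.execute();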

3. Flink programming template

Whether a Flink program is simple or complex, it consists of the following parts (a minimal skeleton follows the list):

  • Obtain an execution environment env, the entry point of the program
  • Load or create a DataStream through a source component
  • Express the computation logic by calling operators on the DataStream
  • Specify where the results are written through a sink operator
  • Trigger execution by submitting the program on env
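
A skeleton of those five steps (host and port are placeholders; the WordCount examples below fill in real logic):

// 1. get the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// 2. load / create a DataStream through a source component
DataStreamSource<String> lines = env.socketTextStream("localhost", 9000);
// 3. express the computation logic with operators
SingleOutputStreamOperator<String> upper = lines.map(String::toUpperCase);
// 4. specify where the result goes with a sink operator
upper.print();
// 5. trigger execution
env.execute();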

4. Getting Started with Flink: WordCount

4.1. Stream processing

package com.flink.slot;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Word count over a socket text stream
 */
public class WordCount {

	public static void main(String[] args) throws Exception {

		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		// set the global parallelism
		env.setParallelism(2);
		// get the data source
		DataStreamSource<String> dataStreamSource = env.socketTextStream("192.168.141.180", 9000);
		// computation logic: word count
		// new FlatMapFunction<String, Tuple2<String,Integer>>()
		// given one input line (String wordLine), return multiple Tuple2<String, Integer> records
		SingleOutputStreamOperator<Tuple2<String, Integer>> words = dataStreamSource
				.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {

					@Override
					public void flatMap(String wordLine, Collector<Tuple2<String, Integer>> collector)
							throws Exception {

						String[] split = wordLine.split("\\s+");
						for (String word : split) {
							collector.collect(Tuple2.of(word, 1));
						}
					}
				}).setParallelism(2);// the parallelism can also be set per operator

		// aggregate on the resulting stream, i.e. group by word
		// build the grouping key: the KeySelector specifies which field of the Tuple2 is used as the key
		KeyedStream<Tuple2<String, Integer>, String> keyedStream = words
				.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {

					@Override
					public String getKey(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
						return stringIntegerTuple2.getField(0);
					}
				});

		// accumulate the second field of the Tuple2
		SingleOutputStreamOperator<Tuple2<String, Integer>> streamOperator = keyedStream.sum(1);
		// sink: write out the data
		streamOperator.print();
		// submit the job
		env.execute();
	}
}

4.2. Batch processing

package com.sff.flink;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public class _03_Batch_WordCount {

	public static void main(String[] args) throws Exception {

		// difference 1: create an ExecutionEnvironment, the batch-processing API
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);

		// difference 2: get the data source
		DataSource<String> dataSource = env
				.readTextFile("D:\\Resource\\FrameMiddleware\\FlinkNew\\filedata\\batchfile.txt");

		// computation logic: flatMap to split, groupBy to group, sum to aggregate, print() as the sink
		dataSource.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {

			@Override
			public void flatMap(String words, Collector<Tuple2<String, Integer>> collector) throws Exception {

				String[] split = words.split("\\s+");
				for (String word : split) {
					collector.collect(Tuple2.of(word, 1));
				}
			}
		}).groupBy(0).sum(1).print();

		// difference 3: no explicit job submission is needed
	}
}

4.3. Processing batch data with the streaming API

package com.sff.flink;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class _04_StreamBatchWordCount {

	public static void main(String[] args) throws Exception {

		// create the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);

		// get the data source
		DataStreamSource<String> streamSource = env
				.readTextFile("D:\\Resource\\FrameMiddleware\\FlinkNew\\filedata\\batchfile.txt");

		// computation logic on streamSource
		streamSource.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {

			@Override
			public void flatMap(String words, Collector<Tuple2<String, Integer>> collector) throws Exception {

				String[] split = words.split("\\s+");
				for (String word : split) {
					collector.collect(Tuple2.of(word, 1));
				}
			}
		}).keyBy(new KeySelector<Tuple2<String, Integer>, Object>() {

			@Override
			public Object getKey(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
				return stringIntegerTuple2.f0;
			}
		}).sum(1).print();

		// streaming jobs must be submitted explicitly
		env.execute();
	}
}

4.4. Flink streaming and batch integration

Compared with the first two styles (4.1 stream processing and 4.2 batch processing), the unified stream-batch style lets the user write a single set of code; under the hood it can run as a streaming job or be automatically switched to batch execution.

// let the runtime choose the processing mode automatically
env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
// force streaming execution
env.setRuntimeMode(RuntimeExecutionMode.STREAMING);
// force batch execution
env.setRuntimeMode(RuntimeExecutionMode.BATCH);
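
A small sketch of where the mode is usually set, assuming a bounded file source; with AUTOMATIC, Flink runs the job in batch mode when every source is bounded, and in streaming mode otherwise:

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// choose the execution mode right after creating the environment
env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
// a bounded source, so AUTOMATIC will execute this job as a batch job
DataStreamSource<String> lines = env.readTextFile("D:\\Resource\\FrameMiddleware\\FlinkNew\\filedata\\batchfile.txt");
lines.print();
env.execute();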

4.5. Notes on the lambda style

public class _04_StreamBatchWordCount__lamdba3 {

	public static void main(String[] args) throws Exception {

		// create the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);

		// get the data source
		DataStreamSource<String> streamSource = env
				.readTextFile("D:\\Resource\\FrameMiddleware\\FlinkNew\\filedata\\batchfile.txt");

		// computation logic on streamSource
		// flatten the data
		SingleOutputStreamOperator<Tuple2<String, Integer>> singleOutputStreamOperator = streamSource
				.flatMap((String words, Collector<Tuple2<String, Integer>> collector) -> {

					String[] split = words.split("\\s+");
					for (String word : split) {
						collector.collect(Tuple2.of(word, 1));
					}
				});

		// group the data
		KeyedStream<Tuple2<String, Integer>, String> keyedStream = singleOutputStreamOperator
				.keyBy(stringIntegerTuple2 -> stringIntegerTuple2.f0);

		// sum the given field within each group
		SingleOutputStreamOperator<Tuple2<String, Integer>> sum = keyedStream.sum(1);

		// sink: print
		sum.print();

		// streaming jobs must be submitted explicitly
		env.execute();
	}
}

// The code above fails because of type erasure
Exception in thread "main" org.apache.flink.api.common.functions.InvalidTypesException: The return type of function 'main(_04_StreamBatchWordCount__lamdba3.java:28)' could not be determined automatically, due to type erasure. You can give type information hints by using the returns(...) method on the result of the transformation call, or by letting your function implement the 'ResultTypeQueryable' interface.

// How to fix it
// option 1
singleOutputStreamOperator.returns(new TypeHint<Tuple2<String, Integer>>() {
});
// option 2
singleOutputStreamOperator.returns(TypeInformation.of(new TypeHint<Tuple2<String, Integer>>() {
}));

// option 3
singleOutputStreamOperator.returns(Types.TUPLE(Types.STRING, Types.INT));
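
Putting it together, a sketch of the lambda version with the type hint (option 3) chained directly onto the flatMap call:

SingleOutputStreamOperator<Tuple2<String, Integer>> words = streamSource
		.flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
			for (String word : line.split("\\s+")) {
				out.collect(Tuple2.of(word, 1));
			}
		})
		.returns(Types.TUPLE(Types.STRING, Types.INT)); // explicit type info defeats erasure

words.keyBy(t -> t.f0).sum(1).print();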

4.6. Starting with the local web UI

Add the dependency:

<dependency>
   <groupId>org.apache.flink</groupId>
   <artifactId>flink-runtime-web_2.12</artifactId>
   <version>1.14.4</version>
</dependency>
// create the environment (the programming entry point) with an embedded web UI
Configuration configuration = new Configuration();
configuration.setInteger(RestOptions.PORT, 8877);
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);

5. Basic source operator

Sources can be divided into two broad categories: those used in test scenarios and those used in production.

5.1 Test scenarios

		// commonly used ways to create a source in test scenarios
		// option 1: from a list of elements
		// DataStreamSource<Integer> streamSource = env.fromElements(1, 3, 5, 7, 9);

		// option 2: from a collection
		// DataStreamSource<Integer> streamSource = env.fromCollection(Arrays.asList(1,
		// 3, 5, 7, 9));

		// option 3: from a generated sequence
		// DataStreamSource<Long> streamSource = env.generateSequence(1, 100);

		// option 4: from a socket
		// DataStreamSource<String> streamSource =
		// env.socketTextStream("192.168.141.180", 9000);

		// option 5: from a file
		String filePath = "D:\\Resource\\FrameMiddleware\\FlinkNew\\filedata\\batchfile.txt";
 		//DataStreamSource<String> streamSource = env.readTextFile(filePath);
 		 
		// option 6: from a file with an input format (TextInputFormat, OrcInputFormat, ...), read once or continuously
		// DataStreamSource<String> streamSource = env.readTextFile(filePath);
		// DataStreamSource<String> streamSource = env.readFile(new TextInputFormat(null), filePath,
		//		FileProcessingMode.PROCESS_CONTINUOUSLY, 1000);
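
All of the options above are shown commented out; a minimal runnable sketch using option 1 looks like this:

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// a bounded source built from a few test elements
DataStreamSource<Integer> streamSource = env.fromElements(1, 3, 5, 7, 9);
streamSource.map(x -> x * 10).print();
env.execute();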

5.2 Production Scenario

In production, Flink is usually combined with a distributed message middleware so that it can ingest data efficiently, and Kafka is the most common choice.

Flink uses Kafka as a data source

Add the dependency:

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>

Using the old version of the API (FlinkKafkaConsumer)

/**
 * Source operator study 1: the old Kafka connector
 */
public class _01_SourceOperator_kafak_after_1_14 {

	public static void main(String[] args) throws Exception {

		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);

		// set the Kafka consumer properties
		Properties properties = new Properties();
		// bootstrap.servers: broker addresses
		properties.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "CentOSA:9092,CentOSB:9092,CentOSC:9092");
		// auto.offset.reset: the offset reset strategy
		// earliest: if the group has never consumed, start from the beginning; otherwise resume from the last consumed position
		// latest: if the group has never consumed, start from the latest offset; otherwise resume from the last consumed position
		properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
		// group.id: the consumer group
		properties.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "test3");
		// commit offsets automatically
		properties.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
		// topic, deserializer, Kafka properties
		FlinkKafkaConsumer<String> kafkaConsumer = new FlinkKafkaConsumer<String>("flinkdemo", new SimpleStringSchema(),
				properties);

		DataStreamSource<String> dataStreamSource = env.addSource(kafkaConsumer);
		// exactly-once cannot be guaranteed this way
		dataStreamSource.map(x -> "flink learning: " + x).print();
		// dataStreamSource.print();
		env.execute();
	}
}

Using the new version of the API (KafkaSource, 1.14 and later)


/**
 * Source operator study 1: the new Kafka connector (1.14 and later)
 */
public class _01_SourceOperator_kafak_before_1_14 {

	public static void main(String[] args) throws Exception {

		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);

		KafkaSource<String> kafkaSource = KafkaSource.<String>builder()
				.setBootstrapServers("CentOSA:9092,CentOSB:9092,CentOSC:9092")
				.setValueOnlyDeserializer(new SimpleStringSchema()).setTopics("flinkdemo").setGroupId("test1")
				.setStartingOffsets(OffsetsInitializer.earliest()) // starting offset
				// .setBounded(OffsetsInitializer.committedOffsets()) // rarely used: stop at the given offset and exit,
				// which effectively turns the job into a batch/backfill job
				// .setUnbounded(OffsetsInitializer.earliest()) // rarely used: keep the stream unbounded, but stop
				// reading at the given offset without exiting
				.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")
				// the source keeps the offsets (topic, partition, offset) in operator state;
				// KafkaSource prefers the offsets in its own state over those stored on the Kafka broker side
				.build();
		// the watermark strategy is an important Flink/Kafka mechanism discussed later
//		WatermarkStrategy<String> watermarkStrategy = WatermarkStrategy.<String>forBoundedOutOfOrderness(Duration.ZERO)
//				.withTimestampAssigner(new SerializableTimestampAssigner<String>() {
//					@Override
//					public long extractTimestamp(String s, long l) {
//						String[] split = s.split(",");
//						return Long.parseLong(split[3]);
//					}
//				});
		DataStreamSource<String> dataStreamSource = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(),
				"kafka-source");
		dataStreamSource.map(x -> "flink learning: " + x).print();
		env.execute();
	}
}

Summary

The two versions of the Kafka connector above expose two different source APIs:

  • Before 1.14
DataStreamSource<String> dataStreamSource = env.addSource(kafkaConsumer);
  • After 1.14 (included)
DataStreamSource<String> dataStreamSource = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(),"kafka-source");

Here kafkaConsumer is a SourceFunction implementation and kafkaSource is a Source implementation; either one can be used to obtain a DataStreamSource.

5.3 Custom Source

A custom source is written mainly by implementing the SourceFunction interface.

SourceFunction is the most basic variant: its run method is where the data is produced, and Flink calls this method to obtain the records.

1. Non-parallel SourceFunction

public class _04_CustomSourceFunction_1 {

	public static void main(String[] args) throws Exception {

		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);
		DataStreamSource<Person> dataStreamSource = env.addSource(new ISourceFunction());
		dataStreamSource.map(x -> "flink learning: " + x).print();
		env.execute();
	}
}

class ISourceFunction implements SourceFunction<Person> {

	public static Integer index = 0;
	volatile boolean runFlag = true;

	@Override
	public void run(SourceContext<Person> sourceContext) throws Exception {
		Person person = null;
		while (runFlag) {
			index++;
			String name = UUID.randomUUID().toString();
			int anInt = new Random().nextInt(100);
			person = new Person(index, name, anInt, System.currentTimeMillis());
			System.out.println(Thread.currentThread());
			sourceContext.collect(person);
			Thread.sleep(1000);
		}

	}

	@Override
	public void cancel() {
		runFlag = false;
	}
}

2. Non-parallel RichSourceFunction

The Rich* variants additionally provide lifecycle methods (open/close) and access to the runtime context of the task.

public class _04_CustomSourceFunction_2 {

	public static void main(String[] args) throws Exception {

		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);
		DataStreamSource<Person> dataStreamSource = env.addSource(new IRichSourceFunction());
		dataStreamSource.map(x -> "flink learning: " + x).print();
		env.execute();
	}
}

class IRichSourceFunction extends RichSourceFunction<Person> {
	public static Integer index = 0;
	volatile boolean runFlag = true;

	/**
	 * Initialization of the source component
	 * 
	 * @param parameters
	 * @throws Exception
	 */
	@Override
	public void open(Configuration parameters) throws Exception {
		System.out.println("IRichSourceFunction open");
		RuntimeContext runtimeContext = getRuntimeContext();
		super.open(parameters);
	}

	/**
	 * Core work: the source produces its data here
	 * 
	 * @param sourceContext
	 * @throws Exception
	 */
	@Override
	public void run(SourceContext<Person> sourceContext) throws Exception {
		Person person = null;
		while (runFlag) {
			index++;
			String name = UUID.randomUUID().toString();
			int anInt = new Random().nextInt(100);
			person = new Person(index, name, anInt, System.currentTimeMillis());
			sourceContext.collect(person);
			Thread.sleep(1000);
		}
	}

	/**
	 * Called when the job is cancelled
	 */
	@Override
	public void cancel() {
		System.out.println("IRichSourceFunction cancel");
		runFlag = false;
	}

	/**
	 * Shutdown of the source component
	 * 
	 * @throws Exception
	 */
	@Override
	public void close() throws Exception {
		System.out.println("IRichSourceFunction close");
		super.close();
	}
}

3. Parallel ParallelSourceFunction

The parallelism of this source can be set to a value greater than one.

public class _04_CustomSourceFunction_3 {

	public static void main(String[] args) throws Exception {

		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);
		DataStreamSource<Person> dataStreamSource = env.addSource(new IParallelSourceFunction());
		dataStreamSource.setParallelism(3);
		dataStreamSource.map(x -> "flink learning: " + x).print();
		env.execute();
	}
}

class IParallelSourceFunction implements ParallelSourceFunction<Person> {
	public static Integer index = 0;
	volatile boolean runFlag = true;

	/**
	 * Core work: the source produces its data here
	 * 
	 * @param sourceContext
	 * @throws Exception
	 */
	@Override
	public void run(SourceContext<Person> sourceContext) throws Exception {
		Person person = null;
		while (runFlag) {
			index++;
			String name = UUID.randomUUID().toString();
			int anInt = new Random().nextInt(100);
			person = new Person(index, name, anInt, System.currentTimeMillis());
			sourceContext.collect(person);
			Thread.sleep(1000);
		}
	}

	@Override
	public void cancel() {
		runFlag = false;
	}

}

4. Parallel RichParallelSourceFunction

public class _04_CustomSourceFunction_4 {

   public static void main(String[] args) throws Exception {

      // get the environment
      StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
      env.setParallelism(3);
      DataStreamSource<Person> dataStreamSource = env.addSource(new IRichParallelSourceFunction());
      dataStreamSource.map(x -> "flink learning: " + x).print();
      env.execute();
   }
}

class IRichParallelSourceFunction extends RichParallelSourceFunction<Person> {
   public static Integer index = 0;
   volatile boolean runFlag = true;

   /**
    * Initialization of the source component
    * 
    * @param parameters
    * @throws Exception
    */
   @Override
   public void open(Configuration parameters) throws Exception {
      System.out.println("IRichParallelSourceFunction open");
      RuntimeContext runtimeContext = getRuntimeContext();
      super.open(parameters);
   }

   /**
    * Core work: the source produces its data here
    * 
    * @param sourceContext
    * @throws Exception
    */
   @Override
   public void run(SourceContext<Person> sourceContext) throws Exception {
      Person person = null;
      while (runFlag) {
         index++;
         String name = UUID.randomUUID().toString();
         int anInt = new Random().nextInt(100);
         person = new Person(index, name, anInt, System.currentTimeMillis());
         sourceContext.collect(person);
         Thread.sleep(1000);
      }
   }

   /**
    * Called when the job is cancelled
    */
   @Override
   public void cancel() {
      System.out.println("IRichParallelSourceFunction cancel");
      runFlag = false;
   }

   /**
    * Shutdown of the source component
    * 
    * @throws Exception
    */
   @Override
   public void close() throws Exception {
      System.out.println("IRichParallelSourceFunction close");
      super.close();
   }
}

6. Basic transformation operator

6.1 Mapping operator

map (DataStream ==> DataStream)

Each input record is mapped to exactly one output record: x -> x'

public class _01_MapOperator {

	public static void main(String[] args) throws Exception {
		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);
		DataStreamSource<String> streamSource = env.fromElements("ab", "asdasd", "asda", "asda,asdas");
		SingleOutputStreamOperator<String> outputStreamOperator = streamSource.map(x -> "string from file:" + x);
		SingleOutputStreamOperator<String> dataStream = outputStreamOperator.map(x -> x.toUpperCase());
		dataStream.print();
		env.execute();
	}
}
// output
STRING FROM FILE:AB
STRING FROM FILE:ASDASD
STRING FROM FILE:ASDA
STRING FROM FILE:ASDA,ASDAS

flatMap (DataStream ==> DataStream)

Each input record is expanded into zero or more output records: x -> x1, x2, ..., xn

public class _02_FlatMapOperator {

	public static void main(String[] args) throws Exception {
		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);
		DataStreamSource<String> streamSource = env.fromElements("ab", "asdasd", "asda", "asda,asdas");
		SingleOutputStreamOperator<String> dataStream_2 = streamSource.flatMap(new FlatMapFunction<String, String>() {
			@Override
			public void flatMap(String s, Collector<String> collector) throws Exception {
				for (String s1 : s.split(",")) {
					collector.collect(s1);
				}
			}
		});
		dataStream_2.print();
		env.execute();
	}
}
// output
ab
asdasd
asda
asda  // the last input element is split into two elements
asdas

project (DataStream ==> DataStream)

This operator can only be used on Tuple data types; it selects a subset of the tuple's fields.

If the Tuple is thought of as a whole table row, project is like selecting a few of its columns.

public class _03_ProjectOperator {

	public static void main(String[] args) throws Exception {
		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1);
		DataStreamSource<Tuple4<String, String, String, String>> dataStreamSource = env.fromElements(
				Tuple4.of("a1", "b1", "c1", "d1"), Tuple4.of("a2", "b2", "c2", "d2"), Tuple4.of("a3", "b3", "c3", "d3"),
				Tuple4.of("a4", "b4", "c4", "d4"));
		SingleOutputStreamOperator<Tuple> dataSource2 = dataStreamSource.project(1, 3);
		dataSource2.print();
		env.execute();
	}
}

// take the elements at index 1 and 3 (indices start at 0)
(b1,d1)
(b2,d2)
(b3,d3)
(b4,d4)

6.2 Filter operator

filter (DataStream ==> DataStream)

Each record is mapped to true or false (x -> true/false); only the records for which the predicate returns true are kept.

public class _04_FilterOperator {

   public static void main(String[] args) throws Exception {
      StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
      env.setParallelism(1);
      DataStreamSource<String> streamSource = env.fromElements("ab", "asdasd", "asda", "asda,asdas");
      SingleOutputStreamOperator<String> dataStream = streamSource.filter(x -> x.length() > 3);
      dataStream.print();
      env.execute();
   }
}

6.3 Grouping operator

keyBy: group by key (DataStream ==> KeyedStream)

When processing streaming data with Flink, it is often necessary to group the records, that is, to split the data into groups according to the value of one or more fields and then aggregate or process each group. In Flink, the keyBy operator implements this grouping.

Specifically, keyBy accepts one or more keys (or a key selector) as its parameter and groups the data according to the values of those keys. After keyBy, Flink partitions the stream by key: all records with the same key are routed to the same partition (the same parallel subtask), while records with different keys may end up in different partitions. This guarantees that all data of one group is processed in one place, which makes the subsequent aggregation or processing operations possible.

public class _05_KeyByOperator {
	public static void main(String[] args) throws Exception {
		// get the Flink execution environment
		final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		// build a data stream; each element holds a string and an integer
		DataStream<Tuple2<String, Integer>> dataStream = env.fromElements(Tuple2.of("foo", 1), Tuple2.of("bar", 2),
				Tuple2.of("foo", 3), Tuple2.of("bar", 4));
		// partition by the string field and print the subtask index that handles each element
		dataStream.keyBy(value -> value.f0).map(new RichMapFunction<Tuple2<String, Integer>, String>() {
			private int partitionId;
			@Override
			public void open(Configuration parameters) throws Exception {
				super.open(parameters);
				partitionId = getRuntimeContext().getIndexOfThisSubtask();
			}
			@Override
			public String map(Tuple2<String, Integer> value) throws Exception {
				return "Partition " + partitionId + ": " + value.toString();
			}
		}).print();
		// run the job
		env.execute("KeyBy Example");
	}
}
//
10> Partition 9: (bar,2)
10> Partition 9: (bar,4)
4> Partition 3: (foo,1)
4> Partition 3: (foo,3)

6.4 Rolling aggregation operator

What is rolling aggregation?

The characteristic of rolling aggregation is that the aggregation result is continuously updated while the data stream is still being produced, which enables real-time analysis and processing. In Flink, rolling aggregation is expressed with the reduce operator or with the built-in aggregations on a KeyedStream.

The built-in implementations include sum, max, maxBy, min and minBy.

Example: the sum operator, counting users per gender

public class _05_SumOperator {
	public static void main(String[] args) throws Exception {
		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		User user = new User("路人甲", 100, "男");
		User user1 = new User("路人A", 15, "男");
		User user2 = new User("路人D", 17, "男");
		User user5 = new User("路人D", 17, "女");
		User user3 = new User("路人乙", 13, "女");
		User user4 = new User("路人C", 18, "女");
		DataStreamSource<User> dataStreamSource = env.fromElements(user, user1, user2, user3, user4, user5);
		// keyedStream.print();
		// number of users per gender group
		//
		SingleOutputStreamOperator<Tuple2<String, Integer>> streamOperator = dataStreamSource
				.map(x -> Tuple2.of(x.getGender(), 1)) // (gender, 1) tuple
				.returns(Types.TUPLE(Types.STRING, Types.INT))// nested generics need an explicit type hint
				.keyBy(0) // group by the field at index 0
				.sum(1);// sum the field at index 1
		streamOperator.print();
		env.execute();
	}
}

// 
1> (男,1)
1> (男,2)
1> (男,3)   ===> final result for this group
9> (女,1)
9> (女,2)
9> (女,3)   ===> final result for this group

Example: the max operator, grouping users by gender and finding the maximum age

public class _06_MaxOperator {
	public static void main(String[] args) throws Exception {
		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		User user = new User("路人甲", 10, "男");
		User user1 = new User("路人A", 150, "男");
		User user2 = new User("路人D", 17, "男");
		User user5 = new User("路人F", 29, "女");
		User user3 = new User("路人乙", 13, "女");
		User user4 = new User("路人C", 18, "女");
		User user6 = new User("路人E", 18, "女");
		DataStreamSource<User> dataStreamSource = env.fromElements(user, user1, user2, user3, user4, user5, user6);
		SingleOutputStreamOperator<User> age = dataStreamSource.keyBy(User::getGender).max("age");
		age.print();
		env.execute();
	}
}

// 
1> User(name=路人甲, age=10, gender=男)
9> User(name=路人乙, age=13, gender=女)
9> User(name=路人乙, age=18, gender=女)
1> User(name=路人甲, age=150, gender=男)
9> User(name=路人乙, age=29, gender=女)
1> User(name=路人甲, age=150, gender=男)  ==>> the name field keeps the first value seen; only the aggregated value (age) is updated afterwards, so only the age field is accurate
9> User(name=路人乙, age=29, gender=女)   ==>> likewise, only the age field is accurate

Example: the maxBy operator, grouping users by gender and finding the user with the largest age

public class _07_MaxByOperator {
	public static void main(String[] args) throws Exception {
		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(1);
		User user = new User("路人甲", 10, "男");
		User user1 = new User("路人A", 150, "男");
		User user2 = new User("路人D", 17, "男");
		User user5 = new User("路人F", 29, "女");
		User user3 = new User("路人乙", 13, "女");
		User user4 = new User("路人C", 18, "女");
		User user6 = new User("路人E", 18, "女");
		DataStreamSource<User> dataStreamSource = env.fromElements(user, user1, user2, user3, user4, user5, user6);
		SingleOutputStreamOperator<User> age = dataStreamSource.keyBy(User::getGender).maxBy("age");
		age.print();
		env.execute();
	}
}
//
User(name=路人甲, age=10, gender=男)
User(name=路人A, age=150, gender=男)
User(name=路人A, age=150, gender=男) ==>> the user with the largest age in this group
User(name=路人乙, age=13, gender=女)
User(name=路人C, age=18, gender=女)
User(name=路人F, age=29, gender=女)
User(name=路人F, age=29, gender=女)   ==>> the user with the largest age in this group

**Difference:** max only updates the aggregated field; the other fields keep the values of the first record seen in the group and are therefore not reliable. maxBy, in contrast, returns the entire record of the element that holds the maximum value.
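
min and minBy behave symmetrically to max and maxBy; for example (a sketch reusing the keyed stream from the examples above):

// smallest age per gender; the non-aggregated fields come from the first record and are not reliable
dataStreamSource.keyBy(User::getGender).min("age").print();
// the complete record of the youngest user per gender
dataStreamSource.keyBy(User::getGender).minBy("age").print();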

6.5 reduce operator

public interface ReduceFunction<T> extends Function {
    T reduce(T value1, T value2) throws Exception;
}
  • value1: Indicates the result value that has been aggregated in the current group.
  • value2: Indicates the currently processed data element.

Within each group, the reduce operator applies the reduce method to the elements one by one: the current aggregated result is passed as value1, the element currently being processed is passed as value2, and the method returns the new aggregated result. Because the input is a stream, the updated aggregation result is emitted for every incoming element.
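
For instance, a small sketch that accumulates the total age per gender (assuming the same User POJO and its (name, age, gender) constructor used above); value1 carries the running aggregate and value2 is the newly arrived record. The full example below uses reduce differently, keeping the oldest user per gender:

SingleOutputStreamOperator<User> totalAgePerGender = dataStreamSource
		.keyBy(User::getGender)
		// value1 = aggregate so far, value2 = current element
		.reduce((value1, value2) -> new User(value2.getName(), value1.getAge() + value2.getAge(), value2.getGender()));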

public class _08_ReduceOperator {

	public static void main(String[] args) throws Exception {
		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(1);
		User user = new User("路人甲", 10, "男");
		User user1 = new User("路人A", 150, "男");
		User user2 = new User("路人D", 17, "男");
		User user5 = new User("路人F", 29, "女");
		User user3 = new User("路人乙", 13, "女");
		User user4 = new User("路人C", 29, "女");
		User user7 = new User("路人V", 29, "女");
		User user6 = new User("路人E", 18, "女");
		DataStreamSource<User> dataStreamSource = env.fromElements(user, user1, user2, user3, user4, user5, user6,
				user7);

		SingleOutputStreamOperator<User> age = dataStreamSource.keyBy(User::getGender)
				.reduce(new ReduceFunction<User>() {
					@Override
					public User reduce(User user, User t1) throws Exception {
						return t1.getAge() >= user.getAge() ? t1 : user;  // a later record with an equal age replaces the earlier one
					}
				});
		age.print();
		env.execute();
	}
}
//
User(name=路人甲, age=10, gender=男)
User(name=路人A, age=150, gender=男)
User(name=路人A, age=150, gender=男) ==>> final result for the 男 group
User(name=路人乙, age=13, gender=女)
User(name=路人C, age=29, gender=女)
User(name=路人F, age=29, gender=女)
User(name=路人F, age=29, gender=女)
User(name=路人V, age=29, gender=女) ==>> final result for the 女 group: the later record with an equal age overwrites the earlier one

7. Basic Sink operator

In Flink, a Sink operator outputs a DataStream or DataSet to an external system, such as Kafka, HDFS or Cassandra. A Sink operator is a terminal operator: it ends the pipeline by producing the final result of a DataStream or DataSet.

1. Print operator

The sink used most in the examples so far:

print();
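
print can also take an identifier that is prefixed to every output line, which helps when several sinks print to the same console (a small sketch, using the streamOperator from the WordCount example):

streamOperator.print("wordcount"); // output lines then look like: wordcount:1> (hello,3)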

2. Write file operator

// write to files
public class _01_SinkOperatior {
	public static void main(String[] args) throws Exception {
		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(1);
		User user = new User("路人甲", 10, "男");
		User user1 = new User("路人A", 150, "男");
		User user2 = new User("路人D", 17, "男");
		DataStreamSource<User> dataStreamSource = env.fromElements(user, user1, user2);

        // write as CSV
		dataStreamSource.map(x -> Tuple3.of(x.getName(), x.getAge(), x.getGender()))
				.returns(new TypeHint<Tuple3<String, Integer, String>>() {
				}).writeAsCsv("D:\\Resource\\FrameMiddleware\\FlinkNew\\sinkout1\\", FileSystem.WriteMode.OVERWRITE);
        // write as text
         dataStreamSource.writeAsText("D:\\Resource\\FrameMiddleware\\FlinkNew\\sinkout\\",
				FileSystem.WriteMode.OVERWRITE);
		env.execute();
	}
}

3. FileSink: production-level file writing

  • Writes data to files
  • Together with checkpointing it provides exactly-once guarantees
  • Writes files in buckets: data from different time periods goes into different folders
  • Supports row formats such as text and CSV as well as SequenceFile and bulk/columnar formats

Add the supporting dependencies:

        <!--stream file sink-->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-parquet_2.12</artifactId>
            <version>1.14.4</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-avro</artifactId>
            <version>1.14.4</version>
        </dependency>

        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-avro</artifactId>
            <version>1.11.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.7</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-files</artifactId>
            <version>1.14.4</version>
        </dependency>

Official documentation: https://nightlies.apache.org/flink/flink-docs-release-1.17/docs/connectors/datastream/filesystem/

FileSink writes incoming data into buckets. Since the incoming stream can be unbounded, the data in each bucket is organized into part files of finite size. Bucketing behavior is fully configurable; with the default time-based bucketing, a new bucket is started every hour, so each bucket contains the records received from the stream during a one-hour interval.

The data inside a bucket directory is split into multiple part files. Each bucket contains at least one part file for every subtask of the sink that received data for that bucket.

Additional part files are created according to a configurable rolling policy.

  • For row-encoded formats, the default policy rolls part files based on size, a maximum duration a file may stay open, and a maximum inactivity timeout after which a file is closed.
  • For bulk-encoded formats, a part file is rolled on every checkpoint; additional rolling criteria based on size or time can be specified.

Important: checkpointing must be enabled when using FileSink in STREAMING mode. Part files are finalized only on successful checkpoints. If checkpointing is disabled, part files stay in the in-progress or pending state forever and cannot be safely read by downstream systems.

(Figure: streamfilesink_bucketing.png, the lifecycle of part files within a bucket)

  1. finished: the file has been written successfully and closed. All data is in the file, and it is safe for downstream systems to read it or to move it elsewhere.
  2. pending: the part file has been closed (for example because the rolling policy triggered) but is waiting for a successful checkpoint before it is committed and renamed to the finished state.
  3. in-progress: the part file is currently being written and may still receive more data; once the rolling policy closes it, it becomes pending.

Row-encoded format

public class _03_StreamFileSinkRowOperator {

	public static void main(String[] args) throws Exception {

		// get the environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(2);

		// enable checkpointing
//		env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE);
//		env.getCheckpointConfig().setCheckpointStorage("file:///D:/Resource/FrameMiddleware/FlinkNew/sinkout3/");

		DataStreamSource<Person> dataStreamSource = env.addSource(new CustomSourceFunction());

		FileSink<String> flinkdemo = FileSink
				.forRowFormat(new Path("D:\\Resource\\FrameMiddleware\\FlinkNew\\sinkout3\\"),
						new SimpleStringEncoder<String>("utf-8"))
				.withRollingPolicy(DefaultRollingPolicy.builder().withRolloverInterval(10000L) // roll every 10 s
						.withMaxPartSize(1024 * 1024)// or when the file reaches 1 MB
						.build())
				.withBucketAssigner(new DateTimeBucketAssigner<String>()) // bucketing strategy, by default date + hour
				.withBucketCheckInterval(5)// interval of the async thread that creates and checks bucket directories
				.withOutputFileConfig(OutputFileConfig.builder()
                                      .withPartPrefix("flinkdemo") // file name prefix
						.withPartSuffix(".txt") // file name suffix
						.build())
				.build();

		dataStreamSource.map(JSON::toJSONString).sinkTo(flinkdemo);

		env.execute();
	}
}

Output files:

(Figure: output directory listing with in-progress part files)

Enabling checkpointing, by uncommenting the "enable checkpointing" lines in the code above, lets the sink finalize part files on each successful checkpoint:

(Figure: output directory listing with finished part files)

A part file with the in-progress suffix is still being written and is not yet safe to read; a file without that suffix is in the finished state.
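
For reference, these are the two commented-out lines from the example with checkpointing turned on:

// commit part files on every successful checkpoint (required for FileSink in STREAMING mode)
env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointStorage("file:///D:/Resource/FrameMiddleware/FlinkNew/sinkout3/");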

Bulk-encoded Formats

Bulk formats store data in blocks and can be thought of as columnar storage. Typical bulk file formats are Parquet, Avro and ORC; these files carry detailed schema information (comparable to a table structure), so a computation framework only needs to read the file and parse it according to the format to obtain the schema of the data.
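
A sketch of what a bulk-encoded sink could look like with the flink-parquet dependency added earlier, writing the Person records as Parquet through Avro reflection (the output path is a placeholder):

FileSink<Person> parquetSink = FileSink
		.forBulkFormat(new Path("D:\\Resource\\FrameMiddleware\\FlinkNew\\sinkout4\\"),
				ParquetAvroWriters.forReflectRecord(Person.class))
		// bulk formats are rolled on every checkpoint; only the bucket assigner is configured here
		.withBucketAssigner(new DateTimeBucketAssigner<Person>())
		.build();

dataStreamSource.sinkTo(parquetSink);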

4.KafkaSink

Required dependency:

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
public class _05_KafkaSinkOperator {

    public static void main(String[] args) throws Exception {

        // get the environment, with the web UI enabled
        Configuration configuration = new Configuration();
        configuration.setInteger("rest.port", 8822);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(configuration);

        // enable checkpointing
        env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointStorage("file:///D:/Resource/FrameMiddleware/FlinkNew/sinkout3/");

        DataStreamSource<Person> dataStreamSource = env.addSource(new CustomSourceFunction());

        KafkaSink<String> kafkaSink = KafkaSink.<String>builder().setBootstrapServers("CentOSA:9092,CentOSB:9092,CentOSC:9092")
                .setRecordSerializer(KafkaRecordSerializationSchema.<String>builder().setTopic("flinkdemo")
                        .setValueSerializationSchema(new SimpleStringSchema()).build())
                .setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE).setTransactionalIdPrefix("demoddemo").build();

        dataStreamSource.map(JSON::toJSONString)
                .disableChaining()  // optional: prevents this operator from being chained with the next one
                .sinkTo(kafkaSink);

        env.execute();
    }
}

5.JdbcSink

The following dependencies need to be added:

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-jdbc_2.12</artifactId>
            <version>1.14.4</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.23</version>
        </dependency>

/**
 * Writing data to a database
 */
public class _06_JdbcSinkOperator {

	public static void main(String[] args) throws Exception {

		// get the environment, with the web UI enabled
		Configuration configuration = new Configuration();
		configuration.setInteger("rest.port", 8822);
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(configuration);

		// enable checkpointing
		env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
		env.getCheckpointConfig().setCheckpointStorage("file:///D:/Resource/FrameMiddleware/FlinkNew/sinkout3/");

		DataStreamSource<Person> dataStreamSource = env.addSource(new CustomSourceFunction());

		String sql = "INSERT INTO person ( name, age) VALUES ( ?, ?) on duplicate key update age=?";

		SinkFunction<Person> sinkFunction = JdbcSink.sink(sql, new JdbcStatementBuilder<Person>() {

			@Override
			public void accept(PreparedStatement preparedStatement, Person person) throws SQLException {

				preparedStatement.setString(1, person.getName());
				preparedStatement.setInt(2, person.getAge());
				preparedStatement.setInt(3, person.getAge());
			}
		}, JdbcExecutionOptions.builder()
                        .withBatchSize(2)  // insert in batches of two records
                        .withMaxRetries(3)  // number of retries on a failed insert
                        .build(),
				new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                        .withPassword("root") // JDBC connection settings
                        .withUsername("root")
						.withUrl("jdbc:mysql://192.168.141.131:3306/flinkdemo").build());

		dataStreamSource
				.addSink(sinkFunction);

		env.execute();
	}
}

JdbcSink with end-to-end exactly-once consistency

SinkFunction<Person> sinkFunction = JdbcSink.exactlyOnceSink(sql, new JdbcStatementBuilder<Person>() {

                    @Override
                    public void accept(PreparedStatement preparedStatement, Person person) throws SQLException {

                        preparedStatement.setString(1, person.getName());
                        preparedStatement.setInt(2, person.getAge());
                        preparedStatement.setInt(3, person.getAge());
                    }
                }, JdbcExecutionOptions.builder()
                        .withBatchSize(2)  // insert in batches of two records
                        .withMaxRetries(3)  // number of retries on a failed insert
                        .build(),
                JdbcExactlyOnceOptions.builder()
                        // MySQL does not support multiple transactions on one connection, so this must be set to true
                        .withTransactionPerConnection(true)
                        .build(),
                // XADataSource: a connection that supports distributed (XA) transactions
                new SerializableSupplier<XADataSource>() {

                    @Override
                    public XADataSource get() {

                        MysqlXADataSource mysqlXADataSource = new MysqlXADataSource();
                        mysqlXADataSource.setURL("jdbc:mysql://192.168.141.131:3306/flinkdemo");
                        mysqlXADataSource.setPassword("root");
                        mysqlXADataSource.setUser("root");
                        return mysqlXADataSource;
                    }
                }
        );

6. RedisSink

The connector is not published for this version, so download the source from the repository below, build it, and install it into the local Maven repository:

https://github.com/apache/bahir-flink

        <dependency>
            <groupId>org.apache.bahir</groupId>
            <artifactId>flink-connector-redis_2.11</artifactId>
            <version>1.1-SNAPSHOT</version>
        </dependency>

The Redis operations exposed by this connector's API feel a bit awkward.


/**
 * Writing data to Redis
 */
public class _08_RedisSinkOperator {

	public static void main(String[] args) throws Exception {

		// get the environment, with the web UI enabled
		Configuration configuration = new Configuration();
		configuration.setInteger("rest.port", 8822);
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(configuration);

		// enable checkpointing
		env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
		env.getCheckpointConfig().setCheckpointStorage("file:///D:/Resource/FrameMiddleware/FlinkNew/sinkout3/");

		DataStreamSource<Person> dataStreamSource = env.addSource(new CustomSourceFunction());

        FlinkJedisPoolConfig jedisPoolConfig = new FlinkJedisPoolConfig.Builder()
                .setHost("192.168.141.141")
                .build();

        RedisSink<Person> personRedisSink = new RedisSink<>(jedisPoolConfig, new IRedisMapper());

        dataStreamSource
				.addSink(personRedisSink).setParallelism(2);
		env.execute();
	}

    static class IRedisMapper implements RedisMapper<Person> {

        @Override
        public RedisCommandDescription getCommandDescription() {

            return new RedisCommandDescription(RedisCommand.RPUSH, "finkdemoredis");
        }

        @Override
        public String getKeyFromData(Person data) {

            return data.getName().substring(0, 5);
        }

        @Override
        public String getValueFromData(Person data) {

            return JSON.toJSONString(data);
        }
    }
}
