import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumerBase;
import org.apache.flink.util.Collector;
import org.apache.kafka.clients.consumer.ConsumerRecord;

import java.util.ArrayList;
import java.util.Properties;

public class FlinkSourceFromKafka {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.enableCheckpointing(5000);

        final Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "192.168.8.173:9092");
        // Set the consumer group
        properties.setProperty("group.id", "group_test");
        // Dynamic partition discovery: fetch the topic metadata every 10 ms;
        // newly discovered partitions are consumed from the earliest offset automatically
        properties.setProperty(FlinkKafkaConsumerBase.KEY_PARTITION_DISCOVERY_INTERVAL_MILLIS, "10");

        final ArrayList<String> topics = new ArrayList<>();
        topics.add("test1");

        // Configure the consumer.
        // SimpleStringSchema only returns the record value, without partition or topic metadata,
        // so a custom deserialization schema is used instead
        final FlinkKafkaConsumer<ConsumerRecord<String, String>> consumer = new FlinkKafkaConsumer<ConsumerRecord<String, String>>(topics, new CustomDeSerializationSchema(), properties);
        // Use a regular expression to discover topics dynamically
        // final FlinkKafkaConsumer<ConsumerRecord<String, String>> consumer = new FlinkKafkaConsumer<ConsumerRecord<String, String>>(Pattern.compile("^test_([A-Za-z0-9]*)$"), new CustomDeSerializationSchema(), properties);

        // Start consuming from the earliest offset
        consumer.setStartFromEarliest();
        /**
         * Start consuming from manually specified offsets
         */
        //final HashMap<KafkaTopicPartition, Long> map = new HashMap<>();
        //map.put(new KafkaTopicPartition("test", 0), 10240L);
        //map.put(new KafkaTopicPartition("test", 1), 10240L);
        //map.put(new KafkaTopicPartition("test", 2), 10240L);
        //consumer.setStartFromSpecificOffsets(map);
        /**
         * Start consuming from a given timestamp
         */
        //consumer.setStartFromTimestamp(1559801580000L);
        /**
         * Start consuming from the offsets last committed by the configured consumer group,
         * so the group.id property must be set
         */
        //consumer.setStartFromGroupOffsets();

        final SingleOutputStreamOperator<ConsumerRecord<String, String>> source = env.addSource(consumer).flatMap(
                new FlatMapFunction<ConsumerRecord<String, String>, ConsumerRecord<String, String>>() {
                    @Override
                    public void flatMap(ConsumerRecord<String, String> value, Collector<ConsumerRecord<String, String>> collector) throws Exception {
                        // For demonstration only: print each record; nothing is emitted downstream
                        System.out.println(value);
                    }
                });
        env.execute("consumer...");
    }
}
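Because checkpointing is enabled with EXACTLY_ONCE above, the connector can also commit the consumer group's offsets back to Kafka whenever a checkpoint completes, which keeps the group's progress visible to external monitoring tools. A minimal sketch, assuming the consumer built in the listing above (the committed offsets are informational only; on recovery Flink restores from the checkpointed offsets, not from Kafka):

// Commit offsets to Kafka on checkpoint completion (setCommitOffsetsOnCheckpoints
// is inherited from FlinkKafkaConsumerBase); "consumer" is the instance from the listing above
consumer.setCommitOffsetsOnCheckpoints(true);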
In the code above, a custom deserialization schema is used so that the origin of each record (topic, partition, offset) is available alongside the data. The deserialization code is as follows:
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema;
import org.apache.kafka.clients.consumer.ConsumerRecord;

public class CustomDeSerializationSchema implements KafkaDeserializationSchema<ConsumerRecord<String, String>> {
    // Indicates whether this is the last element of the stream;
    // returning false means records will keep arriving continuously
    @Override
    public boolean isEndOfStream(ConsumerRecord<String, String> nextElement) {
        return false;
    }

    // Returns a ConsumerRecord<String, String> that carries not only the value
    // but also the topic, offset and partition information
    @Override
    public ConsumerRecord<String, String> deserialize(ConsumerRecord<byte[], byte[]> record) throws Exception {
        return new ConsumerRecord<String, String>(
                record.topic(),
                record.partition(),
                record.offset(),
                // the key may be null for records produced without a key
                record.key() == null ? null : new String(record.key()),
                new String(record.value()));
    }

    // Declares the type of the produced elements
    @Override
    public TypeInformation<ConsumerRecord<String, String>> getProducedType() {
        return TypeInformation.of(new TypeHint<ConsumerRecord<String, String>>() {
        });
    }
}
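To show why this metadata is useful downstream, here is a minimal sketch (not part of the original listing) of a map function that formats each record together with the topic, partition and offset exposed by CustomDeSerializationSchema; the class name is illustrative:

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.kafka.clients.consumer.ConsumerRecord;

// Illustrative operator: formats each record with its origin metadata
public class RecordWithOriginFormatter implements MapFunction<ConsumerRecord<String, String>, String> {
    @Override
    public String map(ConsumerRecord<String, String> record) throws Exception {
        return record.topic() + "-" + record.partition() + "@" + record.offset()
                + ": key=" + record.key() + ", value=" + record.value();
    }
}

It could be wired in place of the flatMap in the first listing, for example as env.addSource(consumer).map(new RecordWithOriginFormatter()).print();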