Flink study notes: integrating Kafka with Flink's DataStream API

In real-time processing, the data source in production work is usually Kafka, so let's look at how Flink integrates with it. Flink ships a dedicated Kafka connector for reading from and writing to Kafka topics. To guarantee exactly-once semantics, Flink does not rely on the offsets committed to the Kafka consumer group; instead it tracks the offsets internally and stores them in its checkpoints. As for Kafka partitions, Flink distributes them across the parallel instances of the source, so each partition's data is processed by one source subtask.
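
As an illustration of how the consumer's offsets interact with checkpoints, the snippet below is a minimal sketch (the object name is made up here, and it reuses the broker, topic, and group id placeholders from the examples later in this post) of the start-position options that FlinkKafkaConsumer011 exposes. Note that when a job is restored from a checkpoint or savepoint, the offsets stored in that checkpoint take precedence over these settings.

    import java.util.Properties
    import org.apache.flink.api.common.serialization.SimpleStringSchema
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011

    object KafkaStartPositionSketch {
      def buildConsumer(): FlinkKafkaConsumer011[String] = {
        val prop = new Properties()
        prop.setProperty("bootstrap.servers", "node01:9092")
        prop.setProperty("group.id", "con1")

        val consumer = new FlinkKafkaConsumer011[String]("test", new SimpleStringSchema(), prop)

        //where to start reading when there is no checkpoint or savepoint to restore from
        consumer.setStartFromGroupOffsets()   //default: resume from the offsets committed in Kafka
        //consumer.setStartFromEarliest()     //ignore committed offsets, read every partition from the beginning
        //consumer.setStartFromLatest()       //ignore committed offsets, read only newly arriving records

        //commit the checkpointed offsets back to Kafka on completed checkpoints
        //(useful for monitoring consumer lag; not required for exactly-once itself)
        consumer.setCommitOffsetsOnCheckpoints(true)
        consumer
      }
    }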

  • Official documentation for the Flink Kafka connector: https://ci.apache.org/projects/flink/flink-docs-release-1.6/dev/connectors/kafka.html

  • Step 1: add the Maven dependencies

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
        <version>1.8.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>1.1.0</version>
    </dependency>
    
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.25</version>
    </dependency>
    
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
    </dependency>
    
  • Step 2: use Kafka as a Flink source (a follow-up word-count sketch appears after this example)

    import java.util.Properties
    import org.apache.flink.api.common.serialization.SimpleStringSchema
    import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
    import org.apache.flink.streaming.api.CheckpointingMode
    import org.apache.flink.streaming.api.environment.CheckpointConfig
    import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
    
    object FlinkKafkaSource {
    
      def main(args: Array[String]): Unit = {
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        //implicit conversions for the Scala DataStream API
        import org.apache.flink.api.scala._
        //checkpoint configuration
        env.enableCheckpointing(100)
        env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
        env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500)
        env.getCheckpointConfig.setCheckpointTimeout(60000)
        env.getCheckpointConfig.setMaxConcurrentCheckpoints(1)
        env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
        //use RocksDB as the state backend and store checkpoints on HDFS
        env.setStateBackend(new RocksDBStateBackend("hdfs://node01:8020/flink/checkDir", true))
    
        val topic = "test"
        val prop = new Properties()
        prop.setProperty("bootstrap.servers","node01:9092")
        prop.setProperty("group.id","con1")
        prop.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    
        val kafkaSource: FlinkKafkaConsumer011[String] = new FlinkKafkaConsumer011[String](topic, new SimpleStringSchema(), prop)

        //commit the checkpointed offsets back to Kafka on completed checkpoints
        kafkaSource.setCommitOffsetsOnCheckpoints(true)
        val result: DataStream[String] = env.addSource(kafkaSource)
        result.print()
        env.execute()
      }
    }
    
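To show what downstream processing of the consumed stream can look like, here is a minimal follow-up sketch, assuming the same placeholder broker, topic, and group id as in the source example above: a running word count over the Kafka records.

    import java.util.Properties
    import org.apache.flink.api.common.serialization.SimpleStringSchema
    import org.apache.flink.api.scala._
    import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011

    object FlinkKafkaWordCount {
      def main(args: Array[String]): Unit = {
        val env = StreamExecutionEnvironment.getExecutionEnvironment

        val prop = new Properties()
        prop.setProperty("bootstrap.servers", "node01:9092")
        prop.setProperty("group.id", "con1")

        val kafkaSource = new FlinkKafkaConsumer011[String]("test", new SimpleStringSchema(), prop)

        //split every Kafka record into words and keep a running count per word
        env.addSource(kafkaSource)
          .flatMap(_.toLowerCase.split("\\s+"))
          .filter(_.nonEmpty)
          .map((_, 1))
          .keyBy(0)
          .sum(1)
          .print()

        env.execute("FlinkKafkaWordCount")
      }
    }
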
  • Step 3: use Kafka as a Flink sink (a note on the broker-side transaction timeout follows this example)

    import java.util.Properties
    import org.apache.flink.api.common.serialization.SimpleStringSchema
    import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
    import org.apache.flink.streaming.api.CheckpointingMode
    import org.apache.flink.streaming.api.environment.CheckpointConfig
    import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011
    import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper
    
    object FlinkKafkaSink {
      def main(args: Array[String]): Unit = {
          val env = StreamExecutionEnvironment.getExecutionEnvironment
          //implicit conversions for the Scala DataStream API
          import org.apache.flink.api.scala._
          //checkpoint configuration
          env.enableCheckpointing(5000)
          env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
          env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500)
          env.getCheckpointConfig.setCheckpointTimeout(60000)
          env.getCheckpointConfig.setMaxConcurrentCheckpoints(1)
          env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
          //use RocksDB as the state backend and store checkpoints on HDFS
          env.setStateBackend(new RocksDBStateBackend("hdfs://node01:8020/flink_kafka_sink/checkpoints", true))
    
    
          //read text lines from a socket; each line will be written to the Kafka topic
          val text = env.socketTextStream("node01",9000)
    
          val topic = "test"
          val prop = new Properties()
          prop.setProperty("bootstrap.servers","node01:9092")
          prop.setProperty("group.id","kafka_group1")
    
    
          //第一种解决方案,设置FlinkKafkaProducer011里面的事务超时时间
          //设置事务超时时间
          prop.setProperty("transaction.timeout.ms",60000*15+"");
          //第二种解决方案,设置kafka的最大事务超时时间
          //FlinkKafkaProducer011<String> myProducer = new FlinkKafkaProducer011<>(brokerList, topic, new SimpleStringSchema());
          //使用支持仅一次语义的形式
          val myProducer = new FlinkKafkaProducer011[String](topic,new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()), prop, FlinkKafkaProducer011.Semantic.EXACTLY_ONCE)
          text.addSink(myProducer)
          env.execute("StreamingFromCollectionScala")
      }
    }
    
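For the second solution mentioned in the comments above, the change is made on the Kafka brokers rather than in the Flink job: transaction.max.timeout.ms must be at least as large as the producer's transaction.timeout.ms. A minimal sketch of the broker-side setting (the config file path depends on your installation):

    # Kafka broker server.properties
    # allow transactions of up to one hour, which covers FlinkKafkaProducer011's
    # default transaction.timeout.ms of 1 hour used with Semantic.EXACTLY_ONCE
    transaction.max.timeout.ms=3600000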