Writing from Flink to HDFS (text, Parquet, Parquet + Snappy)

Flink version: 1.10.0

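The job below reads JSON strings from Kafka and parses each record into a LogSchema object, which is later written out via Avro reflection. The post never shows this class, so the fields below are purely hypothetical; any plain class whose fields match the JSON payload works with both Gson and Avro's ReflectData:

    // Hypothetical log record: replace the fields with whatever the JSON actually contains.
    // A plain class with var fields is readable both by Gson (JSON -> object)
    // and by Avro ReflectData (object -> Parquet schema).
    class LogSchema {
      var appId: String = _
      var userId: String = _
      var eventType: String = _
      var eventTime: Long = 0L
    }
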
The main job:

    // Imports used by the snippet. Note this is a fragment: checkpointPath, outputPath,
    // outputPathParquetSnappy, TEST_BROKERS and LAUNCHS_TOPIC are assumed to be defined elsewhere.
    import java.time.ZoneId
    import java.util.Properties
    import java.util.concurrent.TimeUnit

    import com.google.gson.Gson
    import org.apache.flink.api.common.functions.RichFlatMapFunction
    import org.apache.flink.api.common.restartstrategy.RestartStrategies
    import org.apache.flink.api.common.serialization.{BulkWriter, SimpleStringEncoder, SimpleStringSchema}
    import org.apache.flink.configuration.Configuration
    import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
    import org.apache.flink.core.fs.Path
    import org.apache.flink.formats.parquet.avro.ParquetAvroWriters
    import org.apache.flink.streaming.api.CheckpointingMode
    import org.apache.flink.streaming.api.environment.CheckpointConfig
    import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner
    import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy
    import org.apache.flink.streaming.api.functions.sink.filesystem.{OutputFileConfig, StreamingFileSink}
    import org.apache.flink.streaming.api.scala._
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
    import org.apache.flink.util.Collector
    import org.apache.parquet.hadoop.metadata.CompressionCodecName

    // Set up the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.enableCheckpointing(1000 * 60) // checkpoint every 60 s (the argument is in milliseconds)
    env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    env.setStateBackend(new RocksDBStateBackend(checkpointPath, true)) // incremental RocksDB checkpoints
    // Keep the externalized checkpoint around when the job is cancelled
    env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    

    // On failure the job is retried up to 5 times, with a 50 s delay between attempts
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(5, 50000))
    // Kafka consumer properties
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", TEST_BROKERS)
    properties.setProperty("group.id", "toHdfs-Parquet-group")
    val kafkaConsumer = new FlinkKafkaConsumer[String](LAUNCHS_TOPIC, new SimpleStringSchema(), properties)
    val source = env.addSource(kafkaConsumer)
    // Parse each JSON record into a LogSchema object
    val kafkaSource = source.flatMap(new RichFlatMapFunction[String, LogSchema] {

      // Gson is created in open() so it does not have to be serialized with the function
      var gson: Gson = _

      override def open(parameters: Configuration): Unit = {
        gson = new Gson
      }

      override def flatMap(value: String, out: Collector[LogSchema]): Unit = {
        val log = gson.fromJson(value, classOf[LogSchema])
        out.collect(log)
      }
    })

    // Part-file naming (prefix/suffix of the files written into each bucket)
    val config = OutputFileConfig.builder()
      .withPartPrefix("prefix")
      .withPartSuffix(".txt")
      .build()
    // Bucket assigner: one bucket (directory) per hour, using Asia/Shanghai time
    val assigner = new DateTimeBucketAssigner[LogSchema]("yyyy-MM-dd--HH", ZoneId.of("Asia/Shanghai"))

    // Text (row-format) sink. Note it encodes plain strings, so attach it to the raw
    // source stream rather than the parsed kafkaSource stream.
    val sinkRow = StreamingFileSink
      .forRowFormat(new Path(outputPath), new SimpleStringEncoder[String]("UTF-8"))
      //      .withBucketAssigner(new MyBucketAssigner())
      .withOutputFileConfig(config) // apply the part-file prefix/suffix defined above
      .withRollingPolicy(
        DefaultRollingPolicy.builder()
          .withRolloverInterval(TimeUnit.MINUTES.toMillis(10)) // roll a new part file every 10 minutes
          .withInactivityInterval(TimeUnit.MINUTES.toMillis(5)) // or after 5 minutes without new data
          .withMaxPartSize(1024 * 1024 * 1024) // or once the part file reaches 1 GB
          .build())
      .build()
    // Parquet (bulk-format) sink
    val sinkClo = StreamingFileSink
      .forBulkFormat(
        new org.apache.flink.core.fs.Path(outputPath),
        // the cast helps Scala's type inference with the Java generic factory type
        ParquetAvroWriters.forReflectRecord(classOf[LogSchema]).asInstanceOf[BulkWriter.Factory[LogSchema]]
      )
      .withBucketAssigner(assigner)
      .build()
    // Parquet + Snappy compression. PaulParquetAvroWriters is a custom helper, not part of
    // Flink; see the sketch after this code block.
    val sinkCloCom = StreamingFileSink
      .forBulkFormat(
        new org.apache.flink.core.fs.Path(outputPathParquetSnappy),
        PaulParquetAvroWriters.forReflectRecord(classOf[LogSchema], CompressionCodecName.SNAPPY)
      )
      .withBucketAssigner(assigner)
      .build()

    // Attach whichever of the three sinks (sinkRow / sinkClo / sinkCloCom) matches the desired format
    kafkaSource.addSink(sinkCloCom)
    env.execute("KafkaLogToHdfs")
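
PaulParquetAvroWriters above is the author's own helper and is not shown in the post. A minimal sketch of what it can look like, assuming it simply mirrors Flink's ParquetAvroWriters.forReflectRecord and passes the chosen codec through to the underlying AvroParquetWriter:

    import org.apache.avro.Schema
    import org.apache.avro.reflect.ReflectData
    import org.apache.flink.formats.parquet.{ParquetBuilder, ParquetWriterFactory}
    import org.apache.parquet.avro.AvroParquetWriter
    import org.apache.parquet.hadoop.ParquetWriter
    import org.apache.parquet.hadoop.metadata.CompressionCodecName
    import org.apache.parquet.io.OutputFile

    object PaulParquetAvroWriters {

      // Like ParquetAvroWriters.forReflectRecord, but lets the caller pick a compression codec
      def forReflectRecord[T](clazz: Class[T], codec: CompressionCodecName): ParquetWriterFactory[T] = {
        // Derive the Avro schema from the class via reflection; keep it as a String so the
        // factory stays serializable when it is shipped to the task managers
        val schemaString = ReflectData.get().getSchema(clazz).toString
        val builder = new ParquetBuilder[T] {
          override def createWriter(out: OutputFile): ParquetWriter[T] =
            AvroParquetWriter.builder[T](out)
              .withSchema(new Schema.Parser().parse(schemaString))
              .withDataModel(ReflectData.get())
              .withCompressionCodec(codec) // e.g. CompressionCodecName.SNAPPY
              .build()
        }
        new ParquetWriterFactory[T](builder)
      }
    }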

Of the three sinks above, the text (row-format) sink does not suffer from small files, because its rolling policy is driven by size and time. The Parquet (bulk-format) sinks do: a bulk format can only roll on checkpoints, so every checkpoint produces a new part file per bucket and per parallel subtask. You can reduce the problem by lengthening the checkpoint interval or lowering the sink parallelism, but with large data volumes both options hurt throughput and can cause backpressure. The recommended approach is therefore to write to HDFS as-is and run a separate Spark job afterwards that coalesces the small files, as sketched below.
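
A minimal sketch of such a compaction job (the paths and the target file count are placeholders; point it at a bucket directory that Flink has finished writing):

    import org.apache.spark.sql.SparkSession

    object CompactParquetBucket {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("CompactParquetBucket").getOrCreate()

        // Hypothetical paths: one finished hourly bucket in, a compacted copy out
        val inputDir  = "hdfs:///logs/parquet/2020-08-20--10"
        val outputDir = "hdfs:///logs/parquet_compacted/2020-08-20--10"

        spark.read.parquet(inputDir)
          .coalesce(8)                     // merge the many small part files into a few large ones
          .write
          .mode("overwrite")
          .option("compression", "snappy") // keep Snappy on the compacted output
          .parquet(outputDir)

        spark.stop()
      }
    }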

Reposted from blog.csdn.net/xiaozhaoshigedasb/article/details/108121866