Flink version: 1.10.0
code:
// build the execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.enableCheckpointing(1000 * 60) // one checkpoint every 60 s (the interval is in milliseconds)
env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
env.setStateBackend(new RocksDBStateBackend(checkpointPath, true))
env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) // keep checkpoints when the job is cancelled
// on failure, restart the job up to 5 times with a 50 s delay between attempts
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(5, 50000))
// Kafka consumer properties
val properties = new Properties()
properties.setProperty("bootstrap.servers", TEST_BROKERS)
properties.setProperty("group.id", "toHdfs-Parquet-group")
val kafkaConsumer = new FlinkKafkaConsumer[String](LAUNCHS_TOPIC, new SimpleStringSchema(), properties)
val source = env.addSource(kafkaConsumer)
// parse the JSON payload into LogSchema objects
val kafkaSource = source.flatMap(new RichFlatMapFunction[String, LogSchema] {
  var gson: Gson = _

  override def open(parameters: Configuration): Unit = {
    gson = new Gson
  }

  override def flatMap(value: String, out: Collector[LogSchema]): Unit = {
    val logSchema = gson.fromJson(value, classOf[LogSchema])
    out.collect(logSchema)
  }
})
// part file name prefix/suffix (built here but not attached to any of the sinks below)
val config = OutputFileConfig.builder()
.withPartPrefix("prefix")
.withPartSuffix(".txt")
.build()
// bucket assigner: one bucket per hour, using Asia/Shanghai time
val assigner = new DateTimeBucketAssigner[LogSchema]("yyyy-MM-dd--HH", ZoneId.of("Asia/Shanghai"))
// text (row-format) sink
val sinkRow = StreamingFileSink
  .forRowFormat(new Path(outputPath), new SimpleStringEncoder[LogSchema]("UTF-8"))
  // .withBucketAssigner(new MyBucketAssigner())
  .withRollingPolicy(
    DefaultRollingPolicy.builder()
      .withRolloverInterval(TimeUnit.MINUTES.toMillis(10)) // roll a new file every 10 min
      .withInactivityInterval(TimeUnit.MINUTES.toMillis(5)) // roll if no data arrives for 5 min
      .withMaxPartSize(1024 * 1024 * 1024) // roll when the part file reaches 1 GB
      .build())
  .build()
// Parquet (bulk-format) sink
val sinkClo = StreamingFileSink
  .forBulkFormat(
    new org.apache.flink.core.fs.Path(outputPath),
    ParquetAvroWriters.forReflectRecord(classOf[LogSchema]).asInstanceOf[BulkWriter.Factory[LogSchema]]
  )
  .withBucketAssigner(assigner)
  .build()
// Parquet + Snappy compression (PaulParquetAvroWriters is a custom writer factory; a sketch follows the code)
val sinkCloCom = StreamingFileSink
  .forBulkFormat(
    new org.apache.flink.core.fs.Path(outputPathParquetSnappy),
    PaulParquetAvroWriters.forReflectRecord(classOf[LogSchema], CompressionCodecName.SNAPPY)
  )
  .withBucketAssigner(assigner)
  .build()
// pick one of the three sinks above depending on the desired output format
kafkaSource.addSink(sinkCloCom)
env.execute("KafkaLogToHdfs")
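PaulParquetAvroWriters above is not part of Flink: it stands in for a custom writer factory that mirrors ParquetAvroWriters.forReflectRecord but additionally lets the caller choose a compression codec. A minimal sketch of such a factory, assuming the flink-parquet and parquet-avro dependencies are on the classpath (class and method names are illustrative, not the author's actual implementation):

import org.apache.avro.Schema
import org.apache.avro.reflect.ReflectData
import org.apache.flink.formats.parquet.{ParquetBuilder, ParquetWriterFactory}
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.ParquetWriter
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import org.apache.parquet.io.OutputFile

object PaulParquetAvroWriters {
  // same idea as ParquetAvroWriters.forReflectRecord, plus a codec parameter
  def forReflectRecord[T](clazz: Class[T], codec: CompressionCodecName): ParquetWriterFactory[T] = {
    // keep the schema as a string so the (serializable) builder can be shipped to the task managers
    val schemaString = ReflectData.get().getSchema(clazz).toString
    val builder = new ParquetBuilder[T] {
      override def createWriter(out: OutputFile): ParquetWriter[T] = {
        val schema = new Schema.Parser().parse(schemaString)
        AvroParquetWriter.builder[T](out)
          .withSchema(schema)
          .withDataModel(ReflectData.get())
          .withCompressionCodec(codec) // e.g. CompressionCodecName.SNAPPY
          .build()
      }
    }
    new ParquetWriterFactory[T](builder)
  }
}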
With the row-format (text) sink above there is no small-file problem, but the bulk-format (Parquet) sink does produce small files, because bulk writers roll a new part file on every checkpoint. You can mitigate this by increasing the checkpoint interval or lowering the parallelism, but under heavy load both of these hurt throughput and can lead to backpressure. The recommended approach is therefore to write to HDFS as-is and run a separate Spark job afterwards that coalesces the small files, as sketched below.
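A minimal sketch of such a compaction job, assuming a Spark cluster with HDFS access; the paths, coalesce factor, and application name are placeholders, not taken from the original job:

import org.apache.spark.sql.SparkSession

object CompactParquetFiles {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("CompactParquetFiles")
      .getOrCreate()

    // hypothetical paths: one hourly bucket written by the Flink job above
    val inputPath  = "hdfs:///logs/parquet/2020-01-01--00"
    val outputPath = "hdfs:///logs/parquet-compacted/2020-01-01--00"

    // read the many small part files and rewrite them as a few larger ones;
    // coalesce(8) is only an example, choose it based on the total bucket size
    spark.read.parquet(inputPath)
      .coalesce(8)
      .write
      .option("compression", "snappy")
      .parquet(outputPath)

    spark.stop()
  }
}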