Flink writes to HDFS in gz format

The official Flink documentation provides an API for reading gz files, but no API for writing to HDFS in gz format. So what do we do when we store a large amount of data and it starts eating up our storage space?

I share my approach here. Note: the generation of the HDFS files depends on Flink checkpointing. If checkpointing is not configured in the program, this approach will not work.

package com.push.redis;

import org.apache.flink.api.common.serialization.BulkWriter;
import org.apache.flink.core.fs.FSDataOutputStream;

import java.io.IOException;
import java.util.zip.GZIPOutputStream;

public class GzipBulkStringWriterFactory<T> implements BulkWriter.Factory<T> {
    @Override
    public BulkWriter<T> create(FSDataOutputStream fsDataOutputStream) throws IOException {
        // syncFlush = true so that flush() forces buffered compressed bytes through to HDFS
        final GZIPOutputStream gzipOutputStream = new GZIPOutputStream(fsDataOutputStream, true);
        return new GzipStringBulkWriter<>(gzipOutputStream);
    }
}

package com.push.redis;

import org.apache.flink.api.common.serialization.BulkWriter;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPOutputStream;

public class GzipStringBulkWriter<T> implements BulkWriter<T> {

    private final GZIPOutputStream gzipOutputStream;
    //private final ObjectOutputStream objectOutputStream;

    public GzipStringBulkWriter(GZIPOutputStream gzipOutputStream) {
        //this.objectOutputStream = objectOutputStream;
        this.gzipOutputStream = gzipOutputStream;
    }

    @Override
    public void addElement(T t) throws IOException {
        //objectOutputStream.writeUTF(String.valueOf(t));
        // write String only
        gzipOutputStream.write(String.valueOf(t).getBytes(StandardCharsets.UTF_8));
    }

    @Override
    public void flush() throws IOException {
        //objectOutputStream.flush();
        gzipOutputStream.flush();
    }

    @Override
    public void finish() throws IOException {
        //objectOutputStream.close();
        gzipOutputStream.close();
    }
}

Note that the Object route (the commented-out ObjectOutputStream lines above) cannot be used here: if it is, zcat shows garbled characters when reading the file back from HDFS, because ObjectOutputStream adds its own stream header and a length prefix around each record.
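
To see why, here is a minimal standalone sketch (plain local files and made-up file names stand in for HDFS): zcat plain.gz prints readable text, while object.gz also contains the stream-header and length-prefix bytes.

package com.push.redis;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPOutputStream;

public class GzipOutputComparison {
    public static void main(String[] args) throws IOException {
        // plain UTF-8 bytes: `zcat plain.gz` prints a readable line
        try (GZIPOutputStream out = new GZIPOutputStream(new FileOutputStream("plain.gz"))) {
            out.write("hello\n".getBytes(StandardCharsets.UTF_8));
        }
        // ObjectOutputStream framing: `zcat object.gz` shows the same text mixed
        // with binary header and length-prefix bytes, i.e. "garbled" output
        try (ObjectOutputStream out = new ObjectOutputStream(
                new GZIPOutputStream(new FileOutputStream("object.gz")))) {
            out.writeUTF("hello\n");
        }
    }
}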

package com.push.redis;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.io.SimpleVersionedSerializer;
import org.apache.flink.runtime.state.memory.MemoryStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.Properties;

public class testFlinkGZToHdfs {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        CheckpointConfig config = env.getCheckpointConfig();
        env.enableCheckpointing(10 * 6 * 1000);   // checkpoint every 60 s
        config.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);  // exactly-once processing
        config.setCheckpointTimeout(10 * 6 * 1000);  // a checkpoint must complete within 60 s, otherwise it is aborted
        config.setMinPauseBetweenCheckpoints(10 * 6 * 1000);  // minimum pause between two checkpoints
        config.setMaxConcurrentCheckpoints(1);  // at most one checkpoint in flight at a time
        config.setFailOnCheckpointingErrors(false);  // do not fail the task when a checkpoint fails
        config.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);   // retain the checkpoint when the job is cancelled
//        env.setRestartStrategy(RestartStrategies.failureRateRestart(3, Time.of(3, TimeUnit.MINUTES),Time.of(10, TimeUnit.SECONDS)));
        env.setStateBackend(new MemoryStateBackend());

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers","centos:9092");
        properties.setProperty("group.id", "aa");
        FlinkKafkaConsumer011<String> kafkaSource0 = new FlinkKafkaConsumer011<String>("hhhh", new SimpleStringSchema(), properties);
        kafkaSource0.setStartFromLatest();
        DataStreamSource<String> kafkaSource = env.addSource(kafkaSource0);

        SingleOutputStreamOperator<String> streamOperator = kafkaSource.map(t -> parse(t));
        // gzip
        final StreamingFileSink<String> streamingGzipFileSink = StreamingFileSink.forBulkFormat(new Path("hdfs://192.168.139.188:9000/testGZ06"), new GzipBulkStringWriterFactory<String>())
                // decide which bucket (directory) each element is written to
                .withBucketAssigner(new BucketAssigner<String, String>() {
                        private static final long serialVersionUID = 1L;
                        private transient DateTimeFormatter dateTimeFormatter;
                        @Override
                        public String getBucketId(String element, Context context) {
                            if (dateTimeFormatter == null) {
                                dateTimeFormatter = DateTimeFormatter.ofPattern("yyyyMMddHH").withZone(ZoneId.systemDefault());
                            }
                            // yyyyMMddHH bucket directory; adjust to your needs, e.g. derive the time from the element itself
                            return "ymd="+dateTimeFormatter.format(Instant.ofEpochMilli(System.currentTimeMillis()));
                        }
                        @Override
                        public SimpleVersionedSerializer<String> getSerializer() {
                            return SimpleVersionedStringSerializer.INSTANCE;
                        }
                    }).build();

        streamOperator.addSink(streamingGzipFileSink);


        env.execute("testHdfsGZ");
    }

    public static String parse(String a){
        return a + '\n';
    }
}

In my test, Flink consumes Kafka data and writes it to HDFS in compressed form. Note that the consumed records carry no newline character by default, so a newline must be appended to each record before it is stored (that is what the parse() method above does); a variant that appends the newline inside the writer itself is sketched below.
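
A minimal sketch of such a writer (not the code I ran, and the class name is made up), which terminates every record itself so no extra map() step is needed:

package com.push.redis;

import org.apache.flink.api.common.serialization.BulkWriter;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPOutputStream;

public class GzipLineBulkWriter<T> implements BulkWriter<T> {

    private final GZIPOutputStream gzipOutputStream;

    public GzipLineBulkWriter(GZIPOutputStream gzipOutputStream) {
        this.gzipOutputStream = gzipOutputStream;
    }

    @Override
    public void addElement(T t) throws IOException {
        // append '\n' so zcat/Hive/Spark can split records by line
        gzipOutputStream.write((String.valueOf(t) + "\n").getBytes(StandardCharsets.UTF_8));
    }

    @Override
    public void flush() throws IOException {
        gzipOutputStream.flush();
    }

    @Override
    public void finish() throws IOException {
        gzipOutputStream.close();
    }
}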

In my tests, this compression achieved roughly an 11:1 ratio compared with plain txt files, which saves a great deal of space. The generation of partition files also strictly follows the interval of Flink's checkpoints, because bulk formats roll a new part file on every checkpoint.
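
On Flink versions whose bulk-format builder exposes a rolling-policy setter (an assumption to verify against your version; the helper class below is made up), this "roll on every checkpoint" behaviour can be spelled out explicitly:

package com.push.redis;

import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.OnCheckpointRollingPolicy;

public class GzipSinkBuilder {
    // Sketch: make the "roll a new part file on every checkpoint" behaviour explicit.
    // With bulk formats this is also the default, so a shorter checkpoint interval
    // simply means more, smaller part files per bucket.
    public static StreamingFileSink<String> build(String hdfsPath) {
        return StreamingFileSink
                .forBulkFormat(new Path(hdfsPath), new GzipBulkStringWriterFactory<String>())
                .withRollingPolicy(OnCheckpointRollingPolicy.build())
                .build();
    }
}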

Origin blog.csdn.net/qq_44962429/article/details/106261485