Spark Streaming with Kafka: committing offsets manually (Java version)

The official documentation already covers all of this clearly; I'm just consolidating it here as a record for anyone who needs it.
Straight to the code.

@Slf4j
public class Sink2HiveTask {

    public static void main(String[] args) throws Exception {

        String topic = "test5";

        SparkConf sparkConf = new SparkConf().setMaster("local[2]").setAppName("test");
        sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        //sparkConf.set("spark.kryo.registrator", "com.ykc.task.MyRegistrator"); //序列化ConsumerRecord类
        //sparkConf.set("spark.rdd.compress", "true"); // rdd的压缩
        //sparkConf.set("spark.kryo.registrator", MyRegistrator.class.getName());

        SparkSession ss = SparkSession.builder()
                .config(sparkConf)
                .enableHiveSupport()
                .getOrCreate();
        JavaStreamingContext jsc = new JavaStreamingContext(new JavaSparkContext(ss.sparkContext()), Durations.seconds(20));

        // Consume from Kafka; this is the key part!
        JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaParamsConfig.buildKafkaSourceDStream(topic, jsc);

        ss.sql("set hive.exec.dynamic.partition = true");
        ss.sql("set hive.exec.dynamic.partition.mode = nonstrict");
        String hiveDatabase = PropConfig.getProperty("ykc.hive.database");
        ss.sql("use " + hiveDatabase);
        stream.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(JavaRDD<ConsumerRecord<String, String>> javaRDD) throws Exception {
                OffsetRange[] offsetRanges = ((HasOffsetRanges) javaRDD.rdd()).offsetRanges();

                // Process the data
                // Omitted here; depends on your specific business logic
                // .....

                // After the data has been written successfully, save each partition's offset to Redis.
                // Store untilOffset (the next offset to consume) so a restart resumes after this batch.
                Map<String, String> redisMap = Maps.newHashMap();
                for (OffsetRange offsetRange : offsetRanges) {
                    redisMap.put(String.valueOf(offsetRange.partition()), String.valueOf(offsetRange.untilOffset()));
                }
                }
                RedisUtil.hSetKV(topic, redisMap);

                // Commit the offsets back to Kafka asynchronously
                ((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges);
            }
        });

        jsc.start();
        jsc.awaitTermination();

    }

}
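The com.ykc.task.MyRegistrator class referenced in the commented-out Kryo settings above is not shown in the post. A minimal sketch of what it might look like, assuming Spark's standard KryoRegistrator interface (the class body here is my assumption, not the original code):

import com.esotericsoftware.kryo.Kryo;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.serializer.KryoRegistrator;

// Hypothetical sketch: register ConsumerRecord with Kryo so Spark can serialize
// the records pulled from Kafka when Kryo serialization is enabled.
public class MyRegistrator implements KryoRegistrator {
    @Override
    public void registerClasses(Kryo kryo) {
        kryo.register(ConsumerRecord.class);
    }
}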
public class KafkaParamsConfig {
    /**
     * Create the Kafka data source (single topic).
     * @param topic the topic to subscribe to
     * @param jsc   the Java streaming context
     * @return a direct stream of ConsumerRecords
     */
    public static JavaInputDStream<ConsumerRecord<String, String>> buildKafkaSourceDStream(String topic, JavaStreamingContext jsc){

        Map<String, Object> kafkaParams = Maps.newHashMap();
        kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, PropertiesConstants.KAFKA_BROKERS);
        kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, PropertiesConstants.KAFKA_GROUPID);

        kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");

        kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);

        JavaInputDStream<ConsumerRecord<String, String>> stream;
        Map<TopicPartition, Long> offsets = getOffsets(topic);
        if (MapUtils.isEmpty(offsets)) {
            // No saved offsets yet; start according to auto.offset.reset
            stream = KafkaUtils.createDirectStream(jsc,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.Subscribe(Lists.newArrayList(topic), kafkaParams));
        } else {
            // Resume from the offsets saved in Redis
            stream = KafkaUtils.createDirectStream(jsc,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.Subscribe(Lists.newArrayList(topic), kafkaParams, offsets));
        }

        return stream;
    }


    /**
     * Read the saved Kafka consumer offsets for a topic from Redis.
     * @param topic the Kafka topic
     * @return a map of TopicPartition to the offset to resume from (empty if nothing is saved)
     */
    private static Map<TopicPartition, Long> getOffsets(String topic) {
        Map<TopicPartition, Long> map = Maps.newHashMap();

        Map<String, String> stringMap = RedisUtil.hGetAll(topic);
        if (MapUtils.isEmpty(stringMap)) {
            return map;
        }

        for (Map.Entry<String, String> entry : stringMap.entrySet()) {
            String partition = entry.getKey();
            String offset = entry.getValue();
            TopicPartition topicPartition = new TopicPartition(topic, NumberUtils.toInt(partition));
            map.put(topicPartition, NumberUtils.toLong(offset));
        }

        return map;
    }
}

Next come two utility classes and one constants class. Some settings should not be hard-coded, so they live in property files.

/**
 * @author msy
 * @Type PropertiesConstants.java
 * @Desc Some of these constants may be unused here; I left them in anyway
 * @date
 */
public class PropertiesConstants {
    /**
     * Redis parameters
     */
    public static final String REDIS_HOST = PropConfig.getProperty("ykc.redis.host");
    public static final String REDIS_PORT = PropConfig.getProperty("ykc.redis.port");
    public static final String REDIS_PASSWORD = PropConfig.getProperty("ykc.redis.password");
    public static final String REDIS_TIMEOUT = PropConfig.getProperty("ykc.redis.timeout");
    public static final String REDIS_DATABASE = PropConfig.getProperty("ykc.redis.database");
    public static final String REDIS_MAXIDLE = PropConfig.getProperty("ykc.redis.maxidle");
    public static final String REDIS_MINIDLE = PropConfig.getProperty("ykc.redis.minidle");
    public static final String REDIS_MAXTOTAL = PropConfig.getProperty("ykc.redis.maxtotal");


    /**
     * HDFS parameters
     */
    public static final String HDFS_PATH = PropConfig.getProperty("ykc.hdfs.path");


    /**
     * Hive parameters
     */
    public static final String HIVE_DATABASE = PropConfig.getProperty("ykc.hive.database");

    /**
     * Kafka parameters
     */
    public static final String KAFKA_BROKERS = PropConfig.getProperty("ykc.kafka.brokers");
    public static final String KAFKA_GROUPID = PropConfig.getProperty("ykc.kafka.groupId");
}
/**
 * Utility class for reading property files
 */
@Slf4j
public class PropConfig {

    private static Properties properties;
    private static String PROPERTIES_FILE_NAME = "application-dev.properties";

    static {
        // Determine the active environment from application.properties
        InputStream in = PropConfig.class.getClassLoader().getResourceAsStream("application.properties");
        Properties prop = new Properties();
        try {
            prop.load(in);
        } catch (IOException e) {
            log.error("Failed to read config file: application.properties", e);
        } finally {
            IOUtils.closeQuietly(in);
        }
        String profileName = prop.getProperty("ykc.profile");
        if ("dev".equals(profileName)) {
            // development environment
            PROPERTIES_FILE_NAME = "application-dev.properties";
        }

        if ("test".equals(profileName)) {
            // test environment
            PROPERTIES_FILE_NAME = "application-test.properties";
        }

        if ("prod".equals(profileName)) {
            // production environment
            PROPERTIES_FILE_NAME = "application-prod.properties";
        }

        // Initialize the environment-specific properties
        properties = getProperties();
    }


    // Load the Properties for the selected environment
    private static final Properties getProperties(){
        if (properties == null) {
            properties = new Properties();
        }
        InputStream in = PropConfig.class.getClassLoader().getResourceAsStream(PROPERTIES_FILE_NAME);
        try {
            properties.load(in);
        } catch (IOException e) {
            log.error("读取配置文件出错");
        } finally {
            IOUtils.closeQuietly(in);
        }

        return properties;
    }


    public static final String getProperty(String key) {
        return properties.getProperty(key);
    }
}
/**
 * @Description: Redis utility class
 * @Author: minsiyi
 * @Date: 2019/12/24 19:45
 * @version: v1.0
 */
@Slf4j
public class RedisUtil {

    private static JedisPool pool = null;

    static {
        try {
            JedisPoolConfig config = new JedisPoolConfig();
            // max/min number of idle connections
            config.setMaxIdle(NumberUtils.toInt(PropertiesConstants.REDIS_MAXIDLE));
            config.setMinIdle(NumberUtils.toInt(PropertiesConstants.REDIS_MINIDLE));
            // max total connections
            config.setMaxTotal(NumberUtils.toInt(PropertiesConstants.REDIS_MAXTOTAL));

            pool = new JedisPool(config,
                    PropertiesConstants.REDIS_HOST,
                    NumberUtils.toInt(PropertiesConstants.REDIS_PORT),
                    NumberUtils.toInt(PropertiesConstants.REDIS_TIMEOUT),
                    PropertiesConstants.REDIS_PASSWORD,
                    NumberUtils.toInt(PropertiesConstants.REDIS_DATABASE));

        } catch (Exception e) {
            log.error("redis连接池异常", e);
        }
    }

    private static Jedis getJedis() {
        return pool.getResource();
    }

    private static void closeJedis(Jedis jedis) {
        if (jedis != null) {
            jedis.close();
        }
    }


    public static void setKV(String k, String v) {
        Jedis jedis = null;
        try {
            jedis = getJedis();
            jedis.set(k, v);
        } catch (Exception e) {
            log.error("redis存储k-v异常,key:{}", k, e);
        } finally {
            closeJedis(jedis);
        }
    }

    public static String getKV(String k) {
        Jedis jedis = null;
        try {
            jedis = getJedis();
            return jedis.get(k);
        } catch (Exception e) {
            log.error("redis获取值异常,key:{}", k, e);
        } finally {
            closeJedis(jedis);
        }
        return null;

    }

    public static String hGetKV(String k, String field) {
        Jedis jedis = null;
        try {
            jedis = getJedis();
            return jedis.hget(k, field);
        } catch (Exception e) {
            log.error("redis获取值异常,key:{}", k, e);
        } finally {
            closeJedis(jedis);
        }
        return null;
    }
    public static Map<String, String> hGetAll(String k) {
        Jedis jedis = null;
        try {
            jedis = getJedis();
            return jedis.hgetAll(k);
        } catch (Exception e) {
            log.error("redis获取值异常,key:{}", k, e);
        } finally {
            closeJedis(jedis);
        }
        return null;
    }


    public static void hSetKV(String k, Map<String, String> map) {
        Jedis jedis = null;
        try {
            jedis = getJedis();
            jedis.hset(k, map);
        } catch (Exception e) {
            log.error("redis获取值异常,key:{}", k, e);
        } finally {
            closeJedis(jedis);
        }
    }
    
}

The configuration files are as follows:
1. application.properties, which determines the environment

## environment selection
ykc.profile=dev

2. application-dev.properties, the development-environment configuration (some entries here may be unused; I left them in, take what you need)

## MySQL settings
ykc.mysql.driver=com.mysql.jdbc.Driver

# UAT (external network)
ykc.mysql.omp.url=jdbc:mysql://xxx.xxx.xxx.xxx:3306/%s?zeroDateTimeBehavior=convertToNull&useOldAliasMetadataBehavior=true&tinyInt1isBit=false
ykc.mysql.omp.username=xxxx
ykc.mysql.omp.password=xxxx


# test (external network); the %s placeholder marks the database name, which the code fills in dynamically
ykc.mysql.saas.url=jdbc:mysql://xxx.xxx.xxx.xxx:3306/%s?zeroDateTimeBehavior=convertToNull&useOldAliasMetadataBehavior=true&tinyInt1isBit=false
ykc.mysql.saas.username=xxxx
ykc.mysql.saas.password=xxx



## Redis settings (3.0 test instance)
ykc.redis.host=xxx.xxx.xxx.xxx
ykc.redis.port=6379
ykc.redis.password=xxx
ykc.redis.database=23
ykc.redis.timeout=2000
ykc.redis.maxidle=10
ykc.redis.minidle=30
ykc.redis.maxtotal=50


## HDFS settings
ykc.hdfs.path=hdfs://xxx.xxx.xxx.xxx:9000/root/hadoop/dfs/data/spark/dev


## Hive settings
ykc.hive.database=hive_1



## Kafka settings
ykc.kafka.brokers=xxx.xxx.xxx.xxx:9092
ykc.kafka.groupId=xxx

3. There are also application-prod.properties, application-test.properties, and so on; add them as needed.

Note: the overall idea is to store each Kafka partition's offset in Redis, and to read it back from Redis the next time the job restarts.
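To make that layout concrete: the offsets for a topic are kept in a Redis hash whose fields are partition numbers and whose values are offsets. A minimal sketch of reading that hash back with the utilities above (the printSavedOffsets helper and the offset values in the comment are illustrative assumptions, not part of the original code):

// Hypothetical Redis contents for topic "test5": HGETALL test5 -> {"0": "1523", "1": "1519"}
public static void printSavedOffsets(String topic) {
    // field = partition number, value = offset to resume from
    Map<String, String> saved = RedisUtil.hGetAll(topic);
    for (Map.Entry<String, String> entry : saved.entrySet()) {
        TopicPartition tp = new TopicPartition(topic, NumberUtils.toInt(entry.getKey()));
        System.out.println(tp + " -> resume from offset " + NumberUtils.toLong(entry.getValue()));
    }
}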

Reposted from blog.csdn.net/weixin_42155491/article/details/104767461