The official docs actually explain all of this quite clearly; this post just pulls it together as a record,
for anyone who needs it.
Straight to the code:
@Slf4j
public class Sink2HiveTask {
public static void main(String[] args) throws Exception {
String topic = "test5";
SparkConf sparkConf = new SparkConf().setMaster("local[2]").setAppName("test");
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
//sparkConf.set("spark.kryo.registrator", "com.ykc.task.MyRegistrator"); //序列化ConsumerRecord类
//sparkConf.set("spark.rdd.compress", "true"); // rdd的压缩
//sparkConf.set("spark.kryo.registrator", MyRegistrator.class.getName());
SparkSession ss = SparkSession.builder()
.config(sparkConf)
.enableHiveSupport()
.getOrCreate();
JavaStreamingContext jsc = new JavaStreamingContext(new JavaSparkContext(ss.sparkContext()), Durations.seconds(20));
// Consume from Kafka -- this is the key part!
JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaParamsConfig.buildKafkaSourceDStream(topic, jsc);
ss.sql("set hive.exec.dynamic.partition = true");
ss.sql("set hive.exec.dynamic.partition.mode = nonstrict");
String hiveDatabase = PropConfig.getProperty("ykc.hive.database");
ss.sql("use " + hiveDatabase);
stream.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() {
private static final long serialVersionUID = 1L;
@Override
public void call(JavaRDD<ConsumerRecord<String, String>> javaRDD) throws Exception {
OffsetRange[] offsetRanges = ((HasOffsetRanges) javaRDD.rdd()).offsetRanges();
// Process the data here.
// Elided -- this depends on your specific business logic; an illustrative sketch follows.
// .....
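// A minimal illustrative sketch of the write step (an assumption, not the
// original business logic): treat each message value as a JSON object whose
// fields match a hypothetical table ods_test(id STRING, val STRING)
// PARTITIONED BY (dt STRING). Needs the org.apache.spark.sql imports
// (Dataset, Row, Encoders, SaveMode).
if (!javaRDD.isEmpty()) {
    SparkSession spark = SparkSession.builder().getOrCreate(); // reuses the driver-side session
    Dataset<Row> df = spark.read().json(
            spark.createDataset(javaRDD.map(ConsumerRecord::value).rdd(), Encoders.STRING()));
    // Dynamic partitioning is enabled above, so insertInto resolves the dt
    // column into partitions; column order must match the table DDL.
    df.write().mode(SaveMode.Append).insertInto("ods_test");
}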
// Once the data has been written successfully, store each partition's offset in Redis
Map<String, String> redisMap = Maps.newHashMap();
for (OffsetRange offsetRange : offsetRanges) {
// Note: storing fromOffset here would re-read the batch just written after a
// restart and duplicate it in Hive; untilOffset resumes at the first unprocessed record.
redisMap.put(String.valueOf(offsetRange.partition()), String.valueOf(offsetRange.untilOffset()));
}
RedisUtil.hSetKV(topic, redisMap);
// Commit the offsets back to Kafka as well
((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges);
}
});
jsc.start();
jsc.awaitTermination();
}
}
public class KafkaParamsConfig {
/**
 * Build the Kafka source DStream (single topic)
 * @param topic
 * @param jsc
 * @return
 */
public static JavaInputDStream<ConsumerRecord<String, String>> buildKafkaSourceDStream(String topic, JavaStreamingContext jsc){
Map<String, Object> kafkaParams = Maps.newHashMap();
kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, PropertiesConstants.KAFKA_BROKERS);
kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, PropertiesConstants.KAFKA_GROUPID);
kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");
kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
Map<TopicPartition, Long> storedOffsets = getOffsets(topic);
JavaInputDStream<ConsumerRecord<String, String>> stream;
if (MapUtils.isEmpty(storedOffsets)) {
    stream = KafkaUtils.createDirectStream(jsc,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.Subscribe(Lists.newArrayList(topic), kafkaParams));
} else {
    // Resume from the offsets stored in Redis
    stream = KafkaUtils.createDirectStream(jsc,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.Subscribe(Lists.newArrayList(topic), kafkaParams, storedOffsets));
}
return stream;
}
/**
 * Read the Kafka consumer offsets from Redis
 * @param topic
 * @return
 */
private static Map<TopicPartition, Long> getOffsets(String topic) {
Map<TopicPartition, Long> map = Maps.newHashMap();
Map<String, String> stringMap = RedisUtil.hGetAll(topic);
if (MapUtils.isEmpty(stringMap)) {
return map;
}
for (Map.Entry<String, String> entry : stringMap.entrySet()) {
String partition = entry.getKey();
String offset = entry.getValue();
TopicPartition topicPartition = new TopicPartition(topic, NumberUtils.toInt(partition));
map.put(topicPartition, NumberUtils.toLong(offset));
}
return map;
}
}
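One caveat worth adding here (my note, not from the original post): if the job stays down longer than the topic's retention period, the offsets restored from Redis may no longer exist on the brokers, and the stream will fail with an OffsetOutOfRangeException at startup. Below is a hedged sketch of a guard that could be added to KafkaParamsConfig, using the plain KafkaConsumer API (org.apache.kafka.clients.consumer.KafkaConsumer) to clamp stored offsets into the currently available range:
/**
 * Clamp stored offsets into the range currently available in Kafka.
 * Illustrative only -- assumes the same kafkaParams map built above.
 */
private static Map<TopicPartition, Long> clampOffsets(Map<TopicPartition, Long> stored,
                                                      Map<String, Object> kafkaParams) {
    Map<TopicPartition, Long> clamped = Maps.newHashMap();
    try (KafkaConsumer<String, String> probe = new KafkaConsumer<>(kafkaParams)) {
        // Earliest/latest offsets still present on the brokers
        Map<TopicPartition, Long> earliest = probe.beginningOffsets(stored.keySet());
        Map<TopicPartition, Long> latest = probe.endOffsets(stored.keySet());
        for (Map.Entry<TopicPartition, Long> e : stored.entrySet()) {
            long offset = Math.max(e.getValue(), earliest.get(e.getKey()));
            offset = Math.min(offset, latest.get(e.getKey()));
            clamped.put(e.getKey(), offset);
        }
    }
    return clamped;
}
The result of getOffsets(topic) would then be passed through clampOffsets(...) before building the Subscribe strategy.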
Two utility classes and one constants class. Some settings should not be hard-coded, so they live in the config files:
/**
 * @author msy
 * @Type PropertiesConstants.java
 * @Desc Some of these constants may not be used here; too lazy to delete them
 * @date
 */
public class PropertiesConstants {
/**
* Redis parameters
*/
public static final String REDIS_HOST = PropConfig.getProperty("ykc.redis.host");
public static final String REDIS_PORT = PropConfig.getProperty("ykc.redis.port");
public static final String REDIS_PASSWORD = PropConfig.getProperty("ykc.redis.password");
public static final String REDIS_TIMEOUT = PropConfig.getProperty("ykc.redis.timeout");
public static final String REDIS_DATABASE = PropConfig.getProperty("ykc.redis.database");
public static final String REDIS_MAXIDLE = PropConfig.getProperty("ykc.redis.maxidle");
public static final String REDIS_MINIDLE = PropConfig.getProperty("ykc.redis.minidle");
public static final String REDIS_MAXTOTAL = PropConfig.getProperty("ykc.redis.maxtotal");
/**
* HDFS parameters
*/
public static final String HDFS_PATH = PropConfig.getProperty("ykc.hdfs.path");
/**
* Hive parameters
*/
public static final String HIVE_DATABASE = PropConfig.getProperty("ykc.hive.database");
/**
* Kafka parameters
*/
public static final String KAFKA_BROKERS = PropConfig.getProperty("ykc.kafka.brokers");
public static final String KAFKA_GROUPID = PropConfig.getProperty("ykc.kafka.groupId");
}
/**
 * Utility class for reading the properties files
 */
@Slf4j
public class PropConfig {
private static Properties properties;
private static String PROPERTIES_FILE_NAME = "application-dev.properties";
static {
    // Determine the active profile
    InputStream in = PropConfig.class.getClassLoader().getResourceAsStream("application.properties");
    Properties prop = new Properties();
    try {
        prop.load(in);
    } catch (IOException e) {
        log.error("Failed to read config file: application.properties", e);
    } finally {
        IOUtils.closeQuietly(in);
    }
    String profileName = prop.getProperty("ykc.profile");
    if ("dev".equals(profileName)) {
        // development
        PROPERTIES_FILE_NAME = "application-dev.properties";
    }
    if ("test".equals(profileName)) {
        // test
        PROPERTIES_FILE_NAME = "application-test.properties";
    }
    if ("prod".equals(profileName)) {
        // production
        PROPERTIES_FILE_NAME = "application-prod.properties";
    }
    // initialize
    properties = getProperties();
}
}
// Load the properties for the active profile
private static Properties getProperties() {
if (properties == null) {
properties = new Properties();
}
InputStream in = PropConfig.class.getClassLoader().getResourceAsStream(PROPERTIES_FILE_NAME);
try {
properties.load(in);
} catch (IOException e) {
log.error("读取配置文件出错");
} finally {
IOUtils.closeQuietly(in);
}
return properties;
}
public static String getProperty(String key) {
return properties.getProperty(key);
}
}
/***
 * @Description: Redis utility class
 * @Author: minsiyi
 * @Date: 2019/12/24 19:45
 * @version: v1.0
 */
@Slf4j
public class RedisUtil {
private static JedisPool pool = null;
static {
try {
JedisPoolConfig config = new JedisPoolConfig();
// max / min idle connections
config.setMaxIdle(NumberUtils.toInt(PropertiesConstants.REDIS_MAXIDLE));
config.setMinIdle(NumberUtils.toInt(PropertiesConstants.REDIS_MINIDLE));
// max total connections
config.setMaxTotal(NumberUtils.toInt(PropertiesConstants.REDIS_MAXTOTAL));
pool = new JedisPool(config,
PropertiesConstants.REDIS_HOST,
NumberUtils.toInt(PropertiesConstants.REDIS_PORT),
NumberUtils.toInt(PropertiesConstants.REDIS_TIMEOUT),
PropertiesConstants.REDIS_PASSWORD,
NumberUtils.toInt(PropertiesConstants.REDIS_DATABASE));
} catch (Exception e) {
log.error("redis连接池异常", e);
}
}
private static Jedis getJedis() {
return pool.getResource();
}
private static void closeJedis(Jedis jedis) {
if (jedis != null) {
jedis.close();
}
}
public static void setKV(String k, String v) {
Jedis jedis = null;
try {
jedis = getJedis();
jedis.set(k, v);
} catch (Exception e) {
log.error("redis存储k-v异常,key:{}", k, e);
} finally {
closeJedis(jedis);
}
}
public static String getKV(String k) {
Jedis jedis = null;
try {
jedis = getJedis();
return jedis.get(k);
} catch (Exception e) {
log.error("redis获取值异常,key:{}", k, e);
} finally {
closeJedis(jedis);
}
return null;
}
public static String hGetKV(String k, String field) {
Jedis jedis = null;
try {
jedis = getJedis();
return jedis.hget(k, field);
} catch (Exception e) {
log.error("redis获取值异常,key:{}", k, e);
} finally {
closeJedis(jedis);
}
return null;
}
public static Map<String, String> hGetAll(String k) {
Jedis jedis = null;
try {
jedis = getJedis();
return jedis.hgetAll(k);
} catch (Exception e) {
log.error("redis获取值异常,key:{}", k, e);
} finally {
closeJedis(jedis);
}
return null;
}
public static void hSetKV(String k, Map<String, String> map) {
Jedis jedis = null;
try {
jedis = getJedis();
jedis.hset(k, map);
} catch (Exception e) {
log.error("redis获取值异常,key:{}", k, e);
} finally {
closeJedis(jedis);
}
}
}
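For reference, the hash layout this produces is simply topic -> {partition -> offset}. A quick round trip using the utility above, with hypothetical values:
// Hypothetical offsets for topic "test5" with two partitions
Map<String, String> offsets = Maps.newHashMap();
offsets.put("0", "120"); // partition 0 -> next offset to consume
offsets.put("1", "98");  // partition 1
RedisUtil.hSetKV("test5", offsets);
System.out.println(RedisUtil.hGetAll("test5")); // {0=120, 1=98}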
The config files are as follows:
1. application.properties -- this one just selects the environment
##profile selection
ykc.profile=dev
2. application-dev.properties -- the config for the dev environment (some entries may not be used here; take what you need)
##MySQL settings
ykc.mysql.driver=com.mysql.jdbc.Driver
#uat, public network
ykc.mysql.omp.url=jdbc:mysql://xxx.xxx.xxx.xxx:3306/%s?zeroDateTimeBehavior=convertToNull&useOldAliasMetadataBehavior=true&tinyInt1isBit=false
ykc.mysql.omp.username=xxxx
ykc.mysql.omp.password=xxxx
#test, public network (the %s placeholder marks the database name, which the code fills in dynamically)
ykc.mysql.saas.url=jdbc:mysql://xxx.xxx.xxx.xxx:3306/%s?zeroDateTimeBehavior=convertToNull&useOldAliasMetadataBehavior=true&tinyInt1isBit=false
ykc.mysql.saas.username=xxxx
ykc.mysql.saas.password=xxx
##Redis settings (3.0 test instance)
ykc.redis.host=xxx.xxx.xxx.xxx
ykc.redis.port=6379
ykc.redis.password=xxx
ykc.redis.database=23
ykc.redis.timeout=2000
ykc.redis.maxidle=30
ykc.redis.minidle=10
ykc.redis.maxtotal=50
##HDFS settings
ykc.hdfs.path=hdfs://xxx.xxx.xxx.xxx:9000/root/hadoop/dfs/data/spark/dev
##Hive settings
ykc.hive.database=hive_1
##Kafka settings
ykc.kafka.brokers=xxx.xxx.xxx.xxx:9092
ykc.kafka.groupId=xxx
3. There are also application-prod.properties, application-test.properties, and so on -- add them as needed.
Note: the overall idea is simply to store each Kafka partition's offset in Redis after a successful write, and read it back from Redis the next time the job restarts.