直接上代码
object KafkaToHive {

  /** Streams Kafka topic `test1`, writes each non-empty batch into the Hive
    * table `database1.task_exec_time`, then manually commits Kafka offsets.
    */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("KafkaToHive")
    val sc = new SparkContext(sparkConf)
    // BUG FIX: original used `new StringContext(...)` (Scala's string
    // interpolation helper); a streaming job needs a StreamingContext
    // with a batch interval (60s here).
    val ssc = new StreamingContext(sc, Seconds(60))

    // Kafka consumer configuration.
    // ip/port values are placeholders for the real broker addresses.
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "ip1:port1,ip2:port2,ip:port3",
      "group.id" -> "KafkaToHive_group1", // custom consumer group name
      // BUG FIX: deserializers were missing — a [String,String] direct stream
      // fails at runtime without them. Class-name strings avoid new imports.
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "auto.offset.reset" -> "earliest",
      // Offsets are committed manually (commitAsync below) only after the
      // batch has been written to Hive, so auto-commit stays off.
      "enable.auto.commit" -> "false")

    val topics = Array("test1")
    // BUG FIX: method is createDirectStream (not createDirectStreaming), the
    // params val is kafkaParams (not kafkaParms), and the call was missing
    // its closing parenthesis.
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams))

    stream.foreachRDD { rdd =>
      // isEmpty short-circuits; rdd.count > 0 scanned the whole batch.
      if (!rdd.isEmpty) {
        // Capture offset ranges BEFORE any transformation, while the RDD is
        // still the original KafkaRDD carrying offset metadata.
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

        // TODO: actual processing logic. `value` stands in for the JSON
        // result set that logic should produce from this batch.
        val value: Seq[String] = Seq.empty // TODO replace with real results
        val subRdd = rdd.sparkContext.parallelize(value)

        // NOTE(review): building a HiveContext per batch is wasteful — hoist
        // a single instance outside foreachRDD once the TODO is filled in.
        val sqlContext: SQLContext = new HiveContext(rdd.sparkContext)
        sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
        sqlContext.setConf("hive.exec.dynamic.partition", "true")
        sqlContext.sql("use database1")

        // Parse the JSON payloads, keep the configured columns (`cols` is
        // assumed to be defined elsewhere in this file — confirm), and append
        // one coalesced partition into the Hive table.
        sqlContext
          .read
          .json(subRdd) // `.format("json")` before `.json(...)` was redundant
          .select(cols.map(new Column(_)): _*)
          .coalesce(1)
          .write
          .mode(SaveMode.Append)
          .insertInto("task_exec_time")

        // Commit Kafka offsets only after the Hive write has succeeded.
        // BUG FIX: method is commitAsync, not commotAsync.
        stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
      }
    }

    // BUG FIX: the original never started the streaming context, so the
    // foreachRDD handler registered above would never execute.
    ssc.start()
    ssc.awaitTermination()
  }
}