The current project requires streaming data from a Kafka queue into a Hive table in real time. In my previous work I was on CDH 5.11, Spark and the rest of the stack were close to the latest versions, and the language was always Scala. This time the requirement is Java plus an older version of Spark, and while writing the program I ran into one headache; reading more of the Spark source code would probably help. First, here is a version of the program that actually runs; optimization comes later.
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import kafka.serializer.StringDecoder;

public class WaitSave2Hive2 {
    // Kryo serialization
    public static void main(String[] args) {
        // final String hiveTableName = "";
        final String topicName = "waitingsave_message";

        SparkConf conf = new SparkConf().setAppName("pending-save queue to Hive");
        // conf.set("spark.streaming.backpressure.enabled", "true");
        conf.set("spark.streaming.receiver.maxRate", "3");
        conf.set("spark.streaming.kafka.maxRatePerPartition", "1");

        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaStreamingContext jssc = new JavaStreamingContext(jsc, Durations.seconds(3));

        // Initializing the HiveContext here and using it inside the operator below
        // throws a NullPointerException -- apparently the HiveContext has not finished
        // initializing by the time it is used (if anyone knows the real cause, please share).
        // HiveContext hiveContext = new HiveContext(jsc);

        HashMap<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("metadata.broker.list", "slave43:9092,slave44:9092,slave45:9092");
        kafkaParams.put("group.id", "feature_group");
        kafkaParams.put("auto.offset.reset", "smallest");

        Set<String> topicSet = new HashSet<>();
        topicSet.add(topicName);

        KafkaUtils.createDirectStream(jssc, String.class, String.class,
                StringDecoder.class, StringDecoder.class, kafkaParams, topicSet)
                .foreachRDD(rdd -> {
                    HiveContext hiveContext = new HiveContext(jsc);
                    if (!rdd.isEmpty()) {
                        try {
                            // Keep only the Kafka message value (the JSON payload).
                            JavaRDD<String> vRdd = rdd.map(x -> x._2);
                            hiveContext.read().json(vRdd)
                                    .select("rowkey", "indate", "feature")
                                    .registerTempTable("temp_table");
                            String s = "insert into table testhive partition(yue='5', ri='14')"
                                    + " select * from temp_table";
                            hiveContext.sql(s);
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                    }
                });

        jssc.start();
        jssc.awaitTermination();
    }
}
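One obvious optimization: the code above constructs a new HiveContext on every micro-batch, which is wasteful, and it also sidesteps the NullPointerException noted in the comment. A common workaround on Spark 1.x is a lazily initialized singleton, the same pattern the Spark Streaming programming guide uses for SQLContext: the foreachRDD body runs on the driver, so the context is created once, on first use, after the streaming job is already up. A minimal sketch under that assumption; the JavaHiveContextSingleton helper name is illustrative, not from the original code:

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.hive.HiveContext;

/** Lazily created, driver-side HiveContext singleton (illustrative helper). */
class JavaHiveContextSingleton {
    private static transient HiveContext instance = null;

    static HiveContext getInstance(JavaSparkContext jsc) {
        // foreachRDD executes on the driver, so plain lazy initialization is
        // enough for a single streaming application: the context is built
        // exactly once, after the StreamingContext has started.
        if (instance == null) {
            instance = new HiveContext(jsc);
        }
        return instance;
    }
}

Inside foreachRDD, replacing new HiveContext(jsc) with JavaHiveContextSingleton.getInstance(jsc) then reuses one context across all batches instead of rebuilding it every 3 seconds.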