Kafka 0.8 + Spark Streaming: committing offsets to MySQL

Kafka version: <kafka.version>0.8.2.1</kafka.version>

Spark dependency: <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
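The job below keeps its offsets in a MySQL table that the post never defines. Here is a minimal sketch of that table, created through scalikejdbc like the rest of the code; the table and column names are taken from the SELECT and REPLACE INTO statements further down, while the column types and the primary key are assumptions:

import scalikejdbc._
import scalikejdbc.config.DBs

object CreateOffsetTable {
  def main(args: Array[String]): Unit = {
    // Reads db.default.* from application.conf, the same settings the streaming job uses
    DBs.setup()
    DB.autoCommit { implicit session =>
      // Table/column names come from the job's SQL; the types and the key are assumptions
      SQL("""
        create table if not exists stream_offset (
          topic      varchar(128) not null,
          partitions int          not null,
          groupid    varchar(64)  not null,
          brokerlist varchar(255) not null,
          offset     bigint       not null,
          primary key (topic, partitions, groupid, brokerlist)
        )
      """).execute().apply()
    }
    DBs.close()
  }
}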

 

import com.typesafe.config.ConfigFactory
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scalikejdbc._
import scalikejdbc.config.DBs

import scala.collection.mutable

object DmRealStat {

  def main(args: Array[String]): Unit = {
    /**
     * 1. Read the data from Kafka.
     * On startup, fetch the saved offsets from the database first and resume reading from there.
     */
    val sparkConf = new SparkConf().setMaster("local[4]").setAppName("real time monitoring")
    // Enable back-pressure: Spark picks the optimal consumption rate based on the current load
    sparkConf.set("spark.streaming.backpressure.enabled", "true")
    // Initial rate (records/second) before back-pressure kicks in; by default everything is read at once
    sparkConf.set("spark.streaming.backpressure.initialRate", "1000")
    // Maximum number of records read per second from each Kafka partition; unlimited by default
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "500")
    sparkConf.set("spark.streaming.stopGracefullyOnShutdown", "true")
    // sparkConf.set("spark.driver.memory", "2G")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val sc = ssc.sparkContext


    // Spark SQL
    val spark = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()

    // On the very first start of the program there are no saved offsets yet.
    /*
    def createDirectStream[
        K: ClassTag,                 // key type
        V: ClassTag,                 // value type
        KD <: Decoder[K]: ClassTag,
        VD <: Decoder[V]: ClassTag](
        ssc: StreamingContext,
        kafkaParams: Map[String, String],
        topics: Set[String]
    ): InputDStream[(K, V)] = {
      val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)
      val kc = new KafkaCluster(kafkaParams)
      val fromOffsets = getFromOffsets(kc, kafkaParams, topics)
      new DirectKafkaInputDStream[K, V, KD, VD, (K, V)](
        ssc, kafkaParams, fromOffsets, messageHandler)
    }
    */
    val conf = ConfigFactory.load()
    val brokers = conf.getString("kafka.broker.list")
    val topic = conf.getString("kafka.topic")
    val groupid = "11"
    val kafkaParams = Map(
      "metadata.broker.list" -> brokers,
      "auto.offset.reset" -> "smallest",
      "group.id" -> groupid
    )



    // Load the JDBC settings (the db.default block in application.conf; use a dedicated profile in production)
    DBs.setup()
    val fromOffsets: Map[TopicAndPartition, Long] = DB.readOnly { implicit session =>
      SQL("select topic, partitions, offset from stream_offset where groupid = ? and topic = ? and brokerlist = ?")
        .bind(groupid, topic, brokers)
        .map(rs => (TopicAndPartition(rs.get[String]("topic"), rs.get[Int]("partitions")), rs.long("offset")))
        .list()
        .apply()
    }.toMap

    val topics = Set(topic)

    val stream = if (fromOffsets.size == 0) {
      // First start: nothing is saved yet, so rely on auto.offset.reset
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    } else {
      // Not the first start: offsets already exist in the database.
      // Caveat: Kafka only keeps data for 7 days by default, so if the consumer was down for long
      // enough, the saved offset may point at data that has already been deleted, i.e. it has expired.
      // The saved offsets therefore have to be compared with the earliest offsets currently available
      // in the cluster: if a saved offset is smaller than the current earliest one, it has expired.
      var checkOffset = Map[TopicAndPartition, Long]()
      val kafkaCluster = new KafkaCluster(kafkaParams)
      // Pass in the set of TopicAndPartition to query
      val earliestLeaderOffsets = kafkaCluster.getEarliestLeaderOffsets(fromOffsets.keySet)
      if (earliestLeaderOffsets.isRight) {
        // The partitions and the earliest offsets the cluster currently holds for them
        val topicAndOffset: Map[TopicAndPartition, KafkaCluster.LeaderOffset] = earliestLeaderOffsets.right.get
        checkOffset = fromOffsets.map(selectOffset => {
          // Earliest offset the cluster still holds for this partition
          val currentOffset = topicAndOffset.get(selectOffset._1).get.offset
          if (selectOffset._2 >= currentOffset) {
            // The offset saved in the database is still valid, so keep using it
            selectOffset
          } else {
            // The saved offset has expired; fall back to the earliest available offset
            // val a = new KafkaConsumer(Map[String, Object]("" -> ""))
            (selectOffset._1, currentOffset)
          }
        })
        checkOffset
      }
      // The offsets here come from the database, so the program resumes consuming from where it stopped
      val messageHandler = (mm: MessageAndMetadata[String, String]) => (mm.key(), mm.message())
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
        ssc, kafkaParams, checkOffset, messageHandler)
    }

    // 2. Process the data
    stream.foreachRDD(kafkardd => {
      // val a: RDD[(String, String)] = kafkardd
      // LogUtils is the author's own log-parsing helper (not shown in the post)
      val mapdata = LogUtils.logParse(kafkardd.map(_._2))
        .filter(log => log.contains("en") && log("en") == "e_dm")
      mapdata.foreach(println(_))
      var minute = ""
      // 2. Real-time statistics on the review information
      // Inspect the offsets
      // 3. Manage the offsets independently, storing them in Redis or MySQL
      val offsetRanges = kafkardd.asInstanceOf[HasOffsetRanges].offsetRanges
      offsetRanges.foreach(offsetRange => {
        DB.autoCommit { implicit session =>
          SQL("replace into stream_offset(topic, partitions, groupid, brokerlist, offset) values (?, ?, ?, ?, ?)")
            .bind(
              offsetRange.topic,
              offsetRange.partition,
              groupid,
              brokers,
              offsetRange.untilOffset
            ).update().apply()
        }

        println("topic: " + offsetRange.topic + " partition: " + offsetRange.partition +
          " consumed from " + offsetRange.fromOffset + " to " + offsetRange.untilOffset +
          ", " + offsetRange.count() + " records in total")
      })
    })

    ssc.start()
    ssc.awaitTermination()
  }
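  // A minimal sketch (not from the original post) of the expired-offset check done in main() above:
  // clamp every offset loaded from MySQL to the earliest offset the cluster still holds, so that a
  // stale offset never causes an OffsetOutOfRange error. The helper name and the Left fallback are
  // assumptions for illustration, not the author's code.
  def clampToEarliest(kafkaParams: Map[String, String],
                      saved: Map[TopicAndPartition, Long]): Map[TopicAndPartition, Long] = {
    val kc = new KafkaCluster(kafkaParams)
    kc.getEarliestLeaderOffsets(saved.keySet) match {
      case Right(earliest) =>
        // Keep the saved offset if it is still at or ahead of the earliest retained one
        saved.map { case (tp, off) => tp -> math.max(off, earliest(tp).offset) }
      case Left(_) =>
        // The leaders could not be queried; fall back to the saved offsets unchanged
        saved
    }
  }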


  def dongmanStat(mapdata: RDD[mutable.Map[String, String]]): Unit = {
    val baseData = mapdata.filter(map => map.contains("c_type_name") && map.contains("status")).map(_map => {
      // String contId = _map.get("c_id");
      // String cpId = _map.get("cp_id");
      // String contTypeName = _map.get("c_type_name");
      // String status = _map.get("status");
      // String duration = _map.get("dura");
      // String operator = _map.get("operator");
      // String bcTime = _map.get("bc_time");
      val minute = _map("s_time").substring(0, 12)
      val day = _map("s_time").substring(0, 8)
      val c_type = _map("c_type_name")
      val progId = _map("cp_id")
      val bcTotal = if (_map("status").toInt >= 8) 1 else 0
      val receive = if (_map("status").toInt == 8) 1 else 0
      val waitingBc = if (_map("status").toInt == 8) 1 else 0
      val bcPerson = _map.getOrElse("operator", " ")
      val syncTime = _map.getOrElse("sync_time", "")
      // val srcLog = _map.getOrElse("src_log", "")
      // val isDel = _map.getOrElse("is_delete", 0)
      // val isBcReview = _map.getOrElse("is_bc_review", "")
      (day, c_type, progId, bcPerson, syncTime, List[Int](bcTotal, receive, waitingBc))
    })


    //  Per-content statistics
    //  val contBcStat = baseData.map {
    //    case (day, contId, progId, bcPerson, syncTime, list) => {
    //      ((day, contId), list)
    //    }
    //  }.distinct().reduceByKey((list1, list2) => {
    //    list1.zip(list2).map(i => {
    //      i._1 + i._2
    //    })
    //  }).foreachPartition(rdd => {
    //    val jedis = JedisUtil.getJedisClient()
    //    rdd.foreach(data => {
    //      val key: String = "cidStat" + "_" + data._1._1
    //      val a = jedis.hincrBy(key, "bcTotal", data._2(0))
    //      if (a > 0) println("increment succeeded") else println("increment failed")
    //      jedis.hincrBy(key, "receive", data._2(1))
    //      jedis.hincrBy(key, "waitingBc", data._2(2) - data._2(0))
    //    })
    //    jedis.close()
    //  })

    // Per-operator statistics: if the same operator broadcasts the same content several times, deduplicate first
    val bcPersonStat = baseData
      .distinct()
      // .updateStateByKey[Long]((seq: Seq[Int], state: Option[Long]) => {
      //   // seq: the values of the same key within the current batch
      //   val currentValue = seq.sum
      //   // state: the accumulated result of all previous batches; for a word count this would be
      //   // the total number of times the word appeared in all previous batches
      //   val preValue = state.getOrElse(0L)
      //   Some(currentValue + preValue)
      // })
      .map(t => ((t._1, t._2), 1))
      .reduceByKey(_ + _)
      .foreachPartition(rdd => {
        // JedisUtil is the author's own Redis connection helper (not shown in the post)
        val jedis = JedisUtil.getJedisClient()
        rdd.foreach(data => {
          val key: String = data._1._1 + "_" + data._1._2
          jedis.hincrBy(key, "bcPersonStat", data._2.toLong)
        })
        // Release the connection, otherwise the pool runs dry and the inserts block
        jedis.close()
      })
  }
}
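ConfigFactory.load() and DBs.setup() in the job above both read application.conf from the classpath, but the post never shows that file. The sketch below only checks that the expected settings resolve: the kafka.* key names come straight from the code, the db.default.* keys are what scalikejdbc-config looks up by default, and every concrete value is a placeholder rather than the author's real configuration.

import com.typesafe.config.ConfigFactory
import scalikejdbc.config.DBs

object ConfigSmokeTest {
  def main(args: Array[String]): Unit = {
    // application.conf is assumed to contain something like (all values are placeholders):
    //   kafka.broker.list   = "host1:9092,host2:9092"
    //   kafka.topic         = "some_topic"
    //   db.default.driver   = "com.mysql.jdbc.Driver"
    //   db.default.url      = "jdbc:mysql://localhost:3306/stream?characterEncoding=utf8"
    //   db.default.user     = "root"
    //   db.default.password = "secret"
    val conf = ConfigFactory.load()
    println("brokers = " + conf.getString("kafka.broker.list"))
    println("topic   = " + conf.getString("kafka.topic"))
    DBs.setup() // fails fast here if the JDBC settings are missing or wrong
    DBs.close()
  }
}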


Origin www.cnblogs.com/hejunhong/p/12081042.html