Persisting data to the database with Spark's stateful updateStateByKey operator
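Because updateStateByKey keeps per-key state across batches, Spark Streaming requires a checkpoint directory to be set on the StreamingContext before the job runs. The original post does not show the driver setup, so here is a minimal sketch; the app name, local master, 5-second batch interval, and checkpoint path are all assumptions, and the input DStream is only referenced, not built:

    import org.apache.spark.SparkConf
    import org.apache.spark.sql.SQLContext
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    object ProvinceClickTopApp {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("ProvinceClickTop").setMaster("local[2]")
        val ssc = new StreamingContext(conf, Seconds(5)) // assumed batch interval
        // Mandatory for updateStateByKey: state is checkpointed and recovered from here
        ssc.checkpoint("hdfs://namenode:8020/checkpoint/province-click-top") // assumed path
        val sqlContext = new SQLContext(ssc.sparkContext)
        // dateProvinceCityAdCountsDS: DStream[(String, Int)] keyed as date_province_city_adid,
        // produced upstream (e.g. from Kafka); its construction is out of scope here.
        // calcProvinceClickTop(dateProvinceCityAdCountsDS, sqlContext)
        ssc.start()
        ssc.awaitTermination()
      }
    }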

import java.util

import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.streaming.dstream.DStream

def calcProvinceClickTop(dateProvinceCityAdCountsDS: DStream[(String, Int)], sqlContext: SQLContext): Unit = {
  // Records of the current batch: re-key from date_province_city_adid to date_province_adid
  val dateProvinceAdCounts: DStream[(String, Int)] = dateProvinceCityAdCountsDS.map { case (dateProvinceCity, count) =>
    val fields = dateProvinceCity.split("_")
    val date = fields(0)
    val province = fields(1)
    val adid = fields(3).toInt // fields(2) is the city, dropped for the province-level rollup
    val key = date + "_" + province + "_" + adid
    (key, count)
  } /*.reduceByKey(_ + _)*/
  // Stateful aggregation: totals up to and including the current batch.
  // seq holds this batch's new values for the key; option holds the key's previous state.
  val usbDStream: DStream[(String, Int)] = dateProvinceAdCounts.updateStateByKey {
    case (seq: Seq[Int], option: Option[Int]) =>
      Option(seq.sum + option.getOrElse(0))
  }

  /*
   * Compute the Top 3 per province with Spark Streaming + Spark SQL
   */
  usbDStream.foreachRDD(rdd => {
    if (!rdd.isEmpty()) {
      val rowRDD = rdd.map { case (dateProvinceAd, count) =>
        val fields = dateProvinceAd.split("_")
        val date = fields(0)
        val province = fields(1)
        val ad_id = fields(2).toInt
        Row(date, province, ad_id, count)
      }
      val schema = StructType(List(
        StructField("date", DataTypes.StringType, false),
        StructField("province", DataTypes.StringType, false),
        StructField("ad_id", DataTypes.IntegerType, false),
        StructField("click_count", DataTypes.IntegerType, false)
      ))
      val df = sqlContext.createDataFrame(rowRDD, schema)
      df.registerTempTable("date_province_ad_tmp")
      // A window function cannot be filtered in its own SELECT, so rank in a
      // subquery and filter in the outer query. Note that in Spark 1.x window
      // functions require a HiveContext (which extends SQLContext).
      val ret = sqlContext.sql(
        "select date, province, ad_id, click_count, rank from (" +
          "select date, province, ad_id, click_count, " +
          "row_number() over(partition by province order by click_count desc) rank " +
          "from date_province_ad_tmp) t " +
        "where rank < 4")
      ret.show()
      // Persist the Top 3 rows, creating one DAO per partition rather than per record
      ret.rdd.foreachPartition(partition => {
        if (!partition.isEmpty) {
          val adProvinceTopDao = new AdProvinceTopDaoImpl
          val list = new util.ArrayList[AdProvinceTop]()
          partition.foreach(row => {
            val apt = new AdProvinceTop
            apt.setDate(row.getAs[String]("date"))
            apt.setProvince(row.getAs[String]("province"))
            apt.setClick_count(row.getAs[Int]("click_count"))
            apt.setAd_id(row.getAs[Int]("ad_id"))
            list.add(apt)
          })
          adProvinceTopDao.insertBatch(list)
        }
      })
    }
  })
}
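The AdProvinceTop bean and AdProvinceTopDaoImpl are referenced but not shown in the post. Below is a minimal sketch of what they could look like, assuming a plain JDBC MySQL backend; the table name, JDBC URL, and credentials are placeholders, not part of the original code:

    import java.sql.DriverManager
    import java.util

    // Plain bean matching the four columns written above
    class AdProvinceTop {
      var date: String = _
      var province: String = _
      var ad_id: Int = _
      var click_count: Int = _
      def setDate(v: String): Unit = date = v
      def setProvince(v: String): Unit = province = v
      def setAd_id(v: Int): Unit = ad_id = v
      def setClick_count(v: Int): Unit = click_count = v
    }

    class AdProvinceTopDaoImpl {
      // Batch insert through a single PreparedStatement, committed once per partition
      def insertBatch(list: util.ArrayList[AdProvinceTop]): Unit = {
        val conn = DriverManager.getConnection(
          "jdbc:mysql://localhost:3306/ad_stat", "root", "root") // assumed connection
        try {
          conn.setAutoCommit(false)
          val ps = conn.prepareStatement(
            "insert into ad_province_top(date, province, ad_id, click_count) values(?, ?, ?, ?)")
          val it = list.iterator()
          while (it.hasNext) {
            val apt = it.next()
            ps.setString(1, apt.date)
            ps.setString(2, apt.province)
            ps.setInt(3, apt.ad_id)
            ps.setInt(4, apt.click_count)
            ps.addBatch()
          }
          ps.executeBatch()
          conn.commit()
        } finally {
          conn.close()
        }
      }
    }

One caveat on this sketch: since updateStateByKey emits cumulative totals, every batch rewrites the same date/province/ad keys, so a production version would want to delete the previous Top 3 for those keys first or use an upsert (e.g. MySQL's ON DUPLICATE KEY UPDATE) instead of a plain insert.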

Reposted from blog.csdn.net/zwmonk/article/details/82935643