数据源:
数据结构:时间戳,省份,城市,用户,广告,中间字段使用空格分割
数据源下载链接:https://download.csdn.net/download/weixin_46122692/12921817
https://download.csdn.net/download/weixin_46122692/12921819
项目需求
需求1:仅使用大表(点击日志),统计出每一个省份广告被点击次数的 TOP3
import org.apache.log4j.{
Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{
SparkConf, SparkContext}
object Demo01AD {
  /**
   * Requirement 1: using only the big table (click log), compute each
   * province's Top-3 ads by click count.
   *
   * Input: space-separated lines of "timestamp province city user ad".
   */
  def main(args: Array[String]): Unit = {
    // Suppress Spark's verbose INFO logging.
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Read the source data from the local file system.
    val lineRdd = sc.textFile("E://file/spark/agent.log")
    // Map each line to ((province, ad), 1).
    // Malformed lines (fewer than 5 fields) are skipped instead of
    // crashing the job with ArrayIndexOutOfBoundsException.
    val pairRdd: RDD[((String, String), Int)] = lineRdd.flatMap { line =>
      // Expected format: timestamp province city user ad
      // e.g. "1516609143867 6 7 64 16"
      val fields = line.split(" ")
      if (fields.length >= 5) Some(((fields(1), fields(4)), 1)) else None
    }
    // Sum the clicks per (province, ad) pair.
    val countRdd: RDD[((String, String), Int)] = pairRdd.reduceByKey(_ + _)
    // Re-key by province: ((province, ad), count) -> (province, (ad, count)).
    val byProvince: RDD[(String, (String, Int))] = countRdd.map {
      case ((province, ad), count) => (province, (ad, count))
    }
    // Group by province, then sort each province's ads by click count
    // descending and keep the top 3. Inside the closure we use the plain
    // Scala collection API — RDD operations must never be nested.
    val top3: RDD[(String, List[(String, Int)])] =
      byProvince.groupByKey().mapValues { ads =>
        ads.toList.sortBy(-_._2).take(3)
      }
    top3.foreach(println(_))
    /* Release Spark resources. */
    sc.stop()
  }
}
需求2:将大表(点击日志)与小表(省份ID与省份名称的映射)联合,统计出每一个省份广告被点击次数的 TOP3,并输出省份名称
import org.apache.log4j.{
Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{
SparkConf, SparkContext}
object Demo02AD {
  /**
   * Requirement 2: join the big table (click log) with the small table
   * (province-id -> province-name lookup) and print each province's Top-3
   * ads by click count, keyed by province name.
   *
   * Click-log format:  space-separated "timestamp province city user ad".
   * Lookup format:     space-separated "provinceId provinceName".
   */
  def main(args: Array[String]): Unit = {
    // Suppress Spark's verbose INFO logging.
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Read the click log from the local file system.
    val lineRdd = sc.textFile("E://file/spark/agent.log")
    // Map each line to ((province, ad), 1).
    // Malformed lines (fewer than 5 fields) are skipped instead of
    // crashing the job with ArrayIndexOutOfBoundsException.
    val pairRdd: RDD[((String, String), Int)] = lineRdd.flatMap { line =>
      // e.g. "1516609143867 6 7 64 16"
      val fields = line.split(" ")
      if (fields.length >= 5) Some(((fields(1), fields(4)), 1)) else None
    }
    // Sum the clicks per (province, ad) pair.
    val countRdd: RDD[((String, String), Int)] = pairRdd.reduceByKey(_ + _)
    // Flatten to (province, ad, count) triples.
    val tripleRdd: RDD[(String, String, Int)] = countRdd.map {
      case ((province, ad), count) => (province, ad, count)
    }
    // Group by province, sort each group's triples by count descending and
    // keep the top 3. Inside the closure we use the plain Scala collection
    // API — RDD operations must never be nested.
    val top3: RDD[(String, String, Int)] = tripleRdd
      .groupBy(_._1)
      .flatMap { case (_, triples) => triples.toList.sortBy(-_._3).take(3) }
    // Re-key by province id so the result can be joined with the lookup
    // table: (provinceId, (provinceId, adId, count)).
    val keyedTop3: RDD[(String, (String, String, Int))] = top3.map(t => (t._1, t))
    keyedTop3.foreach(println(_))
    println("********")
    // Read the small lookup table and map it to (provinceId, provinceName).
    val pidRdd = sc.textFile("E://file/spark/agent1.log")
    val provinceNameRdd: RDD[(String, String)] = pidRdd.map { line =>
      val fields = line.split(" ")
      (fields(0), fields(1)) // (province id, province name)
    }
    // Inner join: (provinceId, ((provinceId, adId, count), provinceName)).
    // NOTE(review): provinces absent from the lookup table are silently
    // dropped by the inner join — use leftOuterJoin if they must be kept.
    val joinedRdd: RDD[(String, ((String, String, Int), String))] =
      keyedTop3.join(provinceNameRdd)
    // Print (provinceName, adId, count).
    joinedRdd.foreach { case (_, ((_, ad, count), name)) =>
      println((name, ad, count))
    }
    /* Release Spark resources. */
    sc.stop()
  }
}