输入文件:impclick.txt
字段(以逗号分隔):id, keywords(多个关键词以 | 分隔), imp, click
1010,华语剧场|剧情|当代|类型,1,0
1010,剧情|剧情|家庭剧|类型|热血,1,1
1011,华语剧场|剧情|当代|类型,1,0
1011,剧情|爱情|家庭剧|类型|热血,1,1
1012,华语剧场|剧情|当代|类型,1,0
1012,剧情|爱情|剧情|剧情|家庭剧|类型|热血,1,1
1011,华语剧场|剧情|当代|类型,1,0
1012,剧情|爱情|剧情|剧情|家庭剧|类型,1,1
1013,华语剧场|剧情|当代|类型,1,0
1013,剧情|爱情|剧情|剧情|家庭剧|热血,1,1
1014,华语剧场|剧情|当代|类型,1,0
1014,剧情|爱情|剧情|家庭剧|类型|热血,1,1
1014,华语剧场|剧情|当代|类型,1,0
1015,剧情|爱情|剧情|家庭剧|类型|热血,1,1
1016,华语剧场|剧情|当代|类型,1,1
1017,剧情|爱情|剧情|剧情,1,1
1018,华语剧场|剧情|当代|类型,1,1
1017,剧情|爱情|剧情|剧情|家庭剧,1,1
1017,华语剧场|剧情|当代|类型,1,0
1017,剧情|爱情|剧情|剧情|家庭剧|类型,1,0
1017,华语剧场|剧情|当代|类型,1,0
1016,剧情|爱情|剧情|剧情|类型|热血,1,1
1015,华语剧场|剧情|当代|类型,1,0
1014,剧情|爱情|家庭剧|类型|热血,1,0
1013,华语剧场|剧情|当代|类型,1,0
1012,剧情|剧情|家庭剧|类型|热血,1,1
1011,华语剧场|当代|类型,1,0
1011,剧情|家庭剧|类型|热血,1,0
1012,华语剧场|剧情|当代|类型,1,0
1013,剧情|爱情|剧情|剧情|家庭剧|类型|热血,1,0
package com.wedoctor.sparkcore
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
/**
 * Sums impression and click counts per (id, keyword) pair.
 *
 * Each input line has the form `id,kw1|kw2|...,imp,click`. A line is expanded
 * into one record per keyword occurrence, the records are summed by
 * (id, keyword), and the totals are printed as `(id, keyword, imp, click)`.
 */
object ImpAndClick {
  import scala.util.Try

  Logger.getLogger("org").setLevel(Level.ERROR)

  /** Input path used when no command-line argument is supplied. */
  private val DefaultInputPath = "impclick.txt"

  /**
   * Parses one input line into `((id, keyword), (imp, click))` records.
   *
   * A keyword that appears several times on the same line yields several
   * records, so it is counted once per occurrence. Lines with fewer than four
   * comma-separated fields, or with a non-integer imp/click field, yield no
   * records instead of failing the whole job.
   *
   * @param line raw text line
   * @return one record per keyword occurrence, or an empty sequence for a
   *         malformed line
   */
  private def parseLine(line: String): Seq[((String, String), (Int, Int))] =
    line.split(",") match {
      // Fields beyond the fourth are ignored.
      case Array(id, keyWords, impStr, clickStr, _*) =>
        val parsed = for {
          imp   <- Try(impStr.trim.toInt).toOption
          click <- Try(clickStr.trim.toInt).toOption
        } yield keyWords.split("[|]").toSeq.map(word => ((id, word), (imp, click)))
        parsed.getOrElse(Seq.empty)
      case _ =>
        Seq.empty
    }

  /**
   * Runs the aggregation on a local two-thread Spark master and prints the result.
   *
   * @param args optional; `args(0)` is the input path, defaulting to `impclick.txt`
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Stop the context even when the job fails.
    try {
      val inputPath: String = args.headOption.getOrElse(DefaultInputPath)

      // Sample line: 1011,华语剧场|剧情|当代|类型,1,0
      val impDat: RDD[String] = sc.textFile(inputPath)

      val proDat: RDD[((String, String), (Int, Int))] =
        impDat.flatMap(line => parseLine(line))

      // Both reducer arguments are (imp, click) value tuples for the same key.
      val result: RDD[((String, String), (Int, Int))] = proDat.reduceByKey {
        case ((imp1, click1), (imp2, click2)) => (imp1 + imp2, click1 + click2)
      }

      // Collect first so the rows are printed by the driver, not by executor tasks.
      result
        .map { case ((id, words), (imp, click)) => (id, words, imp, click) }
        .collect()
        .foreach(println)
    } finally {
      sc.stop()
    }
  }
}
结果(每行为 (id, 关键词, imp 总数, click 总数);行的输出顺序不固定):
(1018,当代,1,1)
(1012,热血,2,2)
(1014,爱情,2,1)
(1013,爱情,2,1)
(1010,剧情,3,2)
(1017,家庭剧,2,1)
(1015,家庭剧,1,1)
(1013,热血,2,1)
(1014,热血,2,1)
(1011,剧情,4,1)
(1014,家庭剧,2,1)
(1012,剧情,10,8)
(1018,类型,1,1)
(1012,爱情,2,2)
(1016,爱情,1,1)
(1017,爱情,3,2)
(1016,热血,1,1)
(1017,当代,2,0)
(1017,类型,3,0)
(1011,类型,5,1)
(1010,热血,1,1)
(1017,华语剧场,2,0)
(1018,剧情,1,1)
(1013,剧情,8,3)
(1011,家庭剧,2,1)
(1016,华语剧场,1,1)
(1011,热血,2,1)
(1010,华语剧场,1,0)
(1016,当代,1,1)
(1015,华语剧场,1,0)
(1011,当代,3,0)
(1010,家庭剧,1,1)
(1013,华语剧场,2,0)
(1015,类型,2,1)
(1012,家庭剧,3,3)
(1011,爱情,1,1)
(1014,华语剧场,2,0)
(1012,类型,5,3)
(1011,华语剧场,3,0)
(1015,剧情,3,2)
(1015,爱情,1,1)
(1017,剧情,11,6)
(1018,华语剧场,1,1)
(1016,类型,2,2)
(1013,当代,2,0)
(1012,当代,2,0)
(1014,剧情,5,2)
(1014,当代,2,0)
(1013,家庭剧,2,1)
(1015,当代,1,0)
(1015,热血,1,1)
(1014,类型,4,1)
(1016,剧情,4,4)
(1012,华语剧场,2,0)
(1010,类型,2,1)
(1010,当代,1,0)