Spark笔试题之pv&uv

impclick.txt

字段依次为：id、keywords（多个关键词以 | 分隔）、imp、click，字段之间以英文逗号分隔

1010,华语剧场|剧情|当代|类型,1,0
1010,剧情|剧情|家庭剧|类型|热血,1,1
1011,华语剧场|剧情|当代|类型,1,0
1011,剧情|爱情|家庭剧|类型|热血,1,1
1012,华语剧场|剧情|当代|类型,1,0
1012,剧情|爱情|剧情|剧情|家庭剧|类型|热血,1,1
1011,华语剧场|剧情|当代|类型,1,0
1012,剧情|爱情|剧情|剧情|家庭剧|类型,1,1
1013,华语剧场|剧情|当代|类型,1,0
1013,剧情|爱情|剧情|剧情|家庭剧|热血,1,1
1014,华语剧场|剧情|当代|类型,1,0
1014,剧情|爱情|剧情|家庭剧|类型|热血,1,1
1014,华语剧场|剧情|当代|类型,1,0
1015,剧情|爱情|剧情|家庭剧|类型|热血,1,1
1016,华语剧场|剧情|当代|类型,1,1
1017,剧情|爱情|剧情|剧情,1,1
1018,华语剧场|剧情|当代|类型,1,1
1017,剧情|爱情|剧情|剧情|家庭剧,1,1
1017,华语剧场|剧情|当代|类型,1,0
1017,剧情|爱情|剧情|剧情|家庭剧|类型,1,0
1017,华语剧场|剧情|当代|类型,1,0
1016,剧情|爱情|剧情|剧情|类型|热血,1,1
1015,华语剧场|剧情|当代|类型,1,0
1014,剧情|爱情|家庭剧|类型|热血,1,0
1013,华语剧场|剧情|当代|类型,1,0
1012,剧情|剧情|家庭剧|类型|热血,1,1
1011,华语剧场|当代|类型,1,0
1011,剧情|家庭剧|类型|热血,1,0
1012,华语剧场|剧情|当代|类型,1,0
1013,剧情|爱情|剧情|剧情|家庭剧|类型|热血,1,0

package com.wedoctor.sparkcore

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/**
 * Sums the `imp` and `click` columns of `impclick.txt` per (id, keyword) pair
 * and prints one `(id, keyword, impSum, clickSum)` tuple per pair.
 *
 * Each input line is expected to look like `id,kw1|kw2|...,imp,click`.
 * A keyword that appears several times on the same line contributes that
 * line's `imp`/`click` once per occurrence (the keyword list is not
 * de-duplicated).
 *
 * A line with fewer than four comma-separated fields, or with a non-integer
 * third/fourth field, makes the job fail; no lines are skipped.
 */
object ImpAndClick {
  // Only ERROR and above is logged for loggers under the "org" namespace,
  // which keeps the printed result readable.
  Logger.getLogger("org").setLevel(Level.ERROR)

  /**
   * Entry point: runs the aggregation with a local two-thread master.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      // Sample input line: 1011,华语剧场|剧情|当代|类型,1,0
      val impDat: RDD[String] = sc.textFile("impclick.txt")

      // Explode every line into one ((id, keyword), (imp, click)) record per keyword.
      val proDat: RDD[((String, String), (Int, Int))] = impDat.flatMap { line =>
        val fields: Array[String] = line.split(",")
        val id: String = fields(0)
        val keyWords: String = fields(1)
        val imp: Int = fields(2).toInt
        val click: Int = fields(3).toInt
        // "[|]" is a character class so the pipe is matched literally, not as regex alternation.
        keyWords.split("[|]").map(word => ((id, word), (imp, click)))
      }

      // Both reducer arguments are (imp, click) partial sums for the same key.
      val result: RDD[((String, String), (Int, Int))] = proDat.reduceByKey {
        case ((imp1, click1), (imp2, click2)) => (imp1 + imp2, click1 + click2)
      }

      result
        .map { case ((id, word), (imp, click)) => (id, word, imp, click) }
        .foreach(println)
    } finally {
      // Release the context even when the job fails (e.g. on a malformed line).
      sc.stop()
    }
  }
}

结果:

(1018,当代,1,1)
(1012,热血,2,2)
(1014,爱情,2,1)
(1013,爱情,2,1)
(1010,剧情,3,2)
(1017,家庭剧,2,1)
(1015,家庭剧,1,1)
(1013,热血,2,1)
(1014,热血,2,1)
(1011,剧情,4,1)
(1014,家庭剧,2,1)
(1012,剧情,10,8)
(1018,类型,1,1)
(1012,爱情,2,2)
(1016,爱情,1,1)
(1017,爱情,3,2)
(1016,热血,1,1)
(1017,当代,2,0)
(1017,类型,3,0)
(1011,类型,5,1)
(1010,热血,1,1)
(1017,华语剧场,2,0)
(1018,剧情,1,1)
(1013,剧情,8,3)
(1011,家庭剧,2,1)
(1016,华语剧场,1,1)
(1011,热血,2,1)
(1010,华语剧场,1,0)
(1016,当代,1,1)
(1015,华语剧场,1,0)
(1011,当代,3,0)
(1010,家庭剧,1,1)
(1013,华语剧场,2,0)
(1015,类型,2,1)
(1012,家庭剧,3,3)
(1011,爱情,1,1)
(1014,华语剧场,2,0)
(1012,类型,5,3)
(1011,华语剧场,3,0)
(1015,剧情,3,2)
(1015,爱情,1,1)
(1017,剧情,11,6)
(1018,华语剧场,1,1)
(1016,类型,2,2)
(1013,当代,2,0)
(1012,当代,2,0)
(1014,剧情,5,2)
(1014,当代,2,0)
(1013,家庭剧,2,1)
(1015,当代,1,0)
(1015,热血,1,1)
(1014,类型,4,1)
(1016,剧情,4,4)
(1012,华语剧场,2,0)
(1010,类型,2,1)
(1010,当代,1,0)

发布了79 篇原创文章 · 获赞 107 · 访问量 8万+

猜你喜欢

转载自blog.csdn.net/zuochang_liu/article/details/97046045
今日推荐