如有以下的数据:
20161123101523 http://java.learn.com/java/javaee.shtml 20161123101523 http://java.learn.com/java/javaee.shtml 20161123101523 http://ui.learn.com/ui/video.shtml 20161123101523 http://bigdata.learn.com/bigdata/teacher.shtml 20161123101523 http://android.learn.com/android/video.shtml 20161123101523 http://h5.learn.com/h5/teacher.shtml 20161123101523 http://h5.learn.com/h5/course.shtml 20161123101523 http://bigdata.learn.com/bigdata/teacher.shtml 20161123101523 http://java.learn.com/java/video.shtml 20161123101523 http://bigdata.learn.com/bigdata/teacher.shtml 20161123101523 http://ui.learn.com/ui/course.shtml 20161123101523 http://bigdata.learn.com/bigdata/teacher.shtml 20161123101523 http://h5.learn.com/h5/course.shtml 20161123101523 http://java.learn.com/java/video.shtml |
要求:统计所有用户对每个学科的各个模块的访问次数,再取Top3
(1)使用常规的算子
import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
//D:\数据\subject
//统计所有的用户对每个学科的各个模块的访问次数,再取top3
// Approach 1: plain RDD operators.
// Count the hits of every module URL, group the modules by their subject
// (the URL's host) and keep the three most-visited modules per subject.
object SubjectCount1 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SubjectCount1").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Each input line is "<timestamp>\t<url>"; keep the URL and pair it with 1,
    // e.g. (http://java.learn.com/java/javaee.shtml, 1).
    val lines = sc.textFile("D:\\数据\\subject\\access.txt")
    val urlOnes: RDD[(String, Int)] = lines.map { line =>
      val fields = line.split("\t")
      (fields(1), 1)
    }

    // Sum per URL: the total number of visits of each module of each subject.
    val hitsPerUrl: RDD[(String, Int)] = urlOnes.reduceByKey(_ + _)

    // Prepend the subject (URL host) so the records can be grouped by it.
    val subjectUrlHits: RDD[(String, String, Int)] = hitsPerUrl.map {
      case (url, hits) => (new URL(url).getHost, url, hits)
    }

    // Group the modules of the same subject together, sort each group by hit
    // count descending (ascending sort, then reverse) and keep the top 3.
    val grouped: RDD[(String, Iterable[(String, String, Int)])] = subjectUrlHits.groupBy(_._1)
    val rankedDesc: RDD[(String, List[(String, String, Int)])] =
      grouped.mapValues(_.toList.sortBy(_._3).reverse)
    val top3 = rankedDesc.mapValues(_.take(3))

    top3.foreach(println)
    sc.stop()
  }
}
(2)使用cache的方式:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
//方式2使用cache(缓存)
// Approach 2: cache the aggregated counts.
// The shuffled (url, totalHits) RDD is reused once per subject below, so it is
// cached to avoid recomputing the whole lineage on every iteration.
// NOTE: if the cached data is very large it can cause OOM; only cache
// shuffle results that are genuinely reused.
object SubjectCount2 {
  def main(args: Array[String]): Unit = {
    // Fix: the app name used to be copy-pasted as "SubjectCount1".
    val conf = new SparkConf().setAppName("SubjectCount2").setMaster("local[*]")
    val context = new SparkContext(conf)

    // Each input line is "<timestamp>\t<url>"; keep the URL and pair it with 1,
    // e.g. (http://java.learn.com/java/javaee.shtml, 1).
    val file = context.textFile("D:\\数据\\subject\\access.txt")
    val urlRdd: RDD[(String, Int)] = file.map(x => {
      val word = x.split("\t")
      (word(1), 1)
    })

    // Sum per URL: the total number of visits of each module of each subject.
    val urlCount: RDD[(String, Int)] = urlRdd.reduceByKey(_ + _)

    // Cache the post-shuffle result: it is read once per subject in the loop
    // below, so caching saves one full recomputation per subject.
    val cached: RDD[(String, Int)] = urlCount.cache()
    println(cached.collect().toBuffer)
    /*
    ArrayBuffer((http://ui.learn.com/ui/course.shtml,26),
    (http://bigdata.learn.com/bigdata/teacher.shtml,46),
    (http://bigdata.learn.com/bigdata/course.shtml,25), (http://h5.learn.com/h5/course.shtml,47),
    (http://ui.learn.com/ui/teacher.shtml,23), (http://java.learn.com/java/teacher.shtml,25),
    (http://ui.learn.com/ui/video.shtml,37), (http://h5.learn.com/h5/video.shtml,11),
    (http://java.learn.com/java/javaee.shtml,13), (http://bigdata.learn.com/bigdata/video.shtml,47),
    (http://android.learn.com/android/video.shtml,5), (http://h5.learn.com/h5/teacher.shtml,17),
    (http://java.learn.com/java/video.shtml,23))
    */

    // Fix: the ui entry used to carry a "/ui" path while every other entry was
    // host-only; normalized to the host (matches the same URLs via startsWith).
    val subjects = Array("http://bigdata.learn.com", "http://ui.learn.com",
      "http://java.learn.com", "http://h5.learn.com", "http://android.learn.com")

    // For every subject: filter its URLs out of the cached RDD, sort by hit
    // count descending, and print the top 3 modules.
    for (subject <- subjects) {
      val filteredSubject: RDD[(String, Int)] = cached.filter(_._1.startsWith(subject))
      val sorted: RDD[(String, Int)] = filteredSubject.sortBy(_._2, false)
      val res: Array[(String, Int)] = sorted.take(3)
      println(res.toBuffer)
    }
    context.stop()
  }
}
(3)使用了自定义分区的形式
import java.net.URL
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import scala.collection.mutable
// Approach 3: custom partitioner.
// Repartition the aggregated counts so every subject lands in its own
// partition, then take the top 3 per partition and write one file per subject.
object SubjectCount3 {
  def main(args: Array[String]): Unit = {
    // Fix: the app name used to be copy-pasted as "SubjectCount1".
    val conf = new SparkConf().setAppName("SubjectCount3").setMaster("local[*]")
    val context = new SparkContext(conf)

    // Each input line is "<timestamp>\t<url>"; keep the URL and pair it with 1,
    // e.g. (http://java.learn.com/java/javaee.shtml, 1).
    val file = context.textFile("D:\\数据\\subject\\access.txt")
    val urlRdd: RDD[(String, Int)] = file.map(x => {
      val word = x.split("\t")
      (word(1), 1)
    })

    // Sum per URL: the total number of visits of each module of each subject.
    val urlCount: RDD[(String, Int)] = urlRdd.reduceByKey(_ + _)

    // Re-key by subject (URL host) and cache: the RDD is used twice below
    // (once to collect the distinct subjects, once to repartition).
    val cachedUrl: RDD[(String, (String, Int))] = urlCount.map(x => {
      val url = x._1
      val subject = new URL(url).getHost
      val sumed = x._2
      (subject, (url, sumed))
    }).cache()

    // Spark's default HashPartitioner can put several subjects into the same
    // partition (hash collisions / skew), hence the custom partitioner below.
    // val res: RDD[(String, (String, Int))] = cachedUrl.partitionBy(new HashPartitioner(3))
    // res.saveAsTextFile("c://out20180301-1")

    // Collect the distinct subjects; they define the partition layout.
    val subjects: Array[String] = cachedUrl.keys.distinct().collect()

    // One partition per subject, assigned by the custom partitioner.
    val partitioner = new SubjectPartitioner(subjects)

    // partitionBy produces a new ShuffledRDD laid out by the partitioner.
    val partitioned: RDD[(String, (String, Int))] = cachedUrl.partitionBy(partitioner)

    // Every partition now holds exactly one subject, so sorting a partition
    // locally and taking 3 yields that subject's top 3 modules.
    // NOTE: it.toList materializes the whole partition in memory — fine here,
    // a partition only holds one subject's modules.
    val res: RDD[(String, (String, Int))] = partitioned.mapPartitions(it => {
      it.toList.sortBy(_._2._2).reverse.take(3).iterator
    })

    res.saveAsTextFile("D:\\数据\\subject\\q1")
    context.stop()
  }
}
// Routes every subject (URL host) to its own partition so that the job writes
// one output file per subject. Keys not present in `subjects` fall back to
// partition 0.
class SubjectPartitioner(subjects: Array[String]) extends Partitioner {

  // Immutable lookup table: subject -> partition id, built once from the
  // subject's position in the input array.
  private val partitionIdBySubject: Map[String, Int] = subjects.zipWithIndex.toMap

  // Exactly one partition per subject.
  override def numPartitions: Int = subjects.length

  // Look the key up in the table; unknown keys go to partition 0.
  override def getPartition(key: Any): Int =
    partitionIdBySubject.getOrElse(key.toString, 0)
}
/*class sub() extends Partitioner{
override def numPartitions: Int = ???
override def getPartition(key: Any): Int = ???
}*/
会形成5个分区:生成5个文件