实现功能:使用自定义分区器,将数据按照 URL 中的域名(host)分开存储,每个域名对应一个输出分区
样例数据(每行格式为"时间戳<TAB>URL",两个字段之间用制表符分隔):
20170721101954 http://sport.sina.cn/sport/race/nba.shtml
20170721101954 http://sport.sina.cn/sport/watch.shtml
20170721101954 http://car.sina.cn/car/fps.shtml
20170721101954 http://sport.sina.cn/sport/watch.shtml
20170721101956 http://sport.sina.cn/sport/race/nba.shtml
20170721101957 http://sport.sina.cn/sport/watch.shtml
20170721101958 http://car.sina.cn/car/fps.shtml
20170721101959 http://sport.sina.cn/sport/watch.shtml
20170721101951 http://sport.sina.cn/sport/race/nba.shtml
20170721101951 http://sport.sina.cn/sport/watch.shtml
20170721101952 http://car.sina.cn/car/fps.shtml
20170721101953 http://sport.sina.cn/sport/watch.shtml
20170721101953 http://sport.sina.cn/sport/race/nba.shtml
20170721101950 http://sport.sina.cn/sport/watch.shtml
20170721101950 http://car.sina.cn/car/fps.shtml
20170721101950 http://sport.sina.cn/sport/watch.shtml
package 自定义分区
import java.net.URL
import org.apache.spark.{HashPartitioner, Partitioner, SparkConf, SparkContext}
import scala.collection.mutable
/**
 * Counts how often each URL occurs in a tab-separated access log and saves the
 * counts so that all records of one host land in the same output partition.
 *
 * Every input line is expected to look like `<timestamp>\t<url>`; the saved
 * records have the shape `(host, (url, count))`.
 */
object Partition {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UrlCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Stop the context even when the job fails, so the local Spark runtime is always released.
    try {
      val lines = sc.textFile("E:\\Users\\11046\\IdeaProjects\\SparkFly\\url1.log")
      // Sample line: 20170721101954<TAB>http://sport.sina.cn/sport/race/nba.shtml
      // Lines without a second tab-separated field (e.g. blank lines) are skipped
      // instead of failing the whole job with an ArrayIndexOutOfBoundsException.
      val urlOnes = lines
        .map(_.split("\t"))
        .filter(_.length > 1)
        .map(fields => (fields(1), 1))
      // Number of occurrences of each distinct URL.
      val urlCounts = urlOnes.reduceByKey(_ + _)
      // Re-key every (url, count) pair by the host part of its URL.
      val byHost = urlCounts.map { case (url, hits) =>
        (new URL(url).getHost, (url, hits))
      }
      // The distinct hosts are collected into a local array; the partitioner
      // needs the complete list to give each host its own partition id.
      val hosts = byHost.map(_._1).distinct().collect()
      val partitioner = new HostParitioner(hosts)
      byHost
        .partitionBy(partitioner)
        .saveAsTextFile("E:\\Users\\11046\\IdeaProjects\\SparkFly\\output2")
    } finally {
      sc.stop()
    }
  }
}
自定义分区内容:
package 自定义分区
import org.apache.spark.Partitioner
import scala.collection.mutable
//程序代码中的自定义分区
/**
 * Partitioner that gives every known host its own partition.
 *
 * @param ins hosts to partition by; the host at index `i` is mapped to partition `i`
 *            (if a host occurs more than once, its last index wins)
 */
class HostParitioner(ins: Array[String]) extends Partitioner {
  // Host -> partition id lookup, derived once from the position of each host in `ins`.
  val parMap: Map[String, Int] = ins.zipWithIndex.toMap

  // Number of hosts handed to this partitioner.
  val count: Int = ins.length

  /** One partition per entry of `ins`. */
  override def numPartitions: Int = ins.length

  /**
   * Returns the partition id for `key`.
   *
   * The key is looked up by its string form; a `null` key or a key that is not
   * one of the known hosts falls back to partition 0.
   */
  override def getPartition(key: Any): Int = key match {
    case null  => 0
    case other => parMap.getOrElse(other.toString, 0)
  }

  // Value equality: two instances built from the same hosts describe the same
  // partitioning, so they should compare equal rather than by reference.
  override def equals(other: Any): Boolean = other match {
    case that: HostParitioner =>
      numPartitions == that.numPartitions && parMap == that.parMap
    case _ => false
  }

  override def hashCode(): Int = 31 * numPartitions + parMap.hashCode()
}
结果展示:
第一部分:
(sport.sina.cn,(http://sport.sina.cn/sport/watch.shtml,8))
(sport.sina.cn,(http://sport.sina.cn/sport/race/nba.shtml,4))
第二部分:
(car.sina.cn,(http://car.sina.cn/car/fps.shtml,4))