// Uses IpFromUtils to look up the geographic region ("IP origin") of an IP address
package com.ws.spark
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Counts how many times each IP geolocation (province) appears in a log file.
 */
object IpFromCount {

  /**
   * Entry point: loads IP-range rules, resolves each log line's IP to a
   * province via a broadcast lookup table, and prints per-province counts
   * sorted descending.
   *
   * @param args args(0) = path to the IP-range rules file (start, end, province);
   *             args(1) = path to the log file to analyse
   */
  def main(args: Array[String]): Unit = {
    // Fail fast with a clear message instead of an ArrayIndexOutOfBoundsException
    require(args.length >= 2, "usage: IpFromCount <rulesPath> <logPath>")
    // NOTE(review): master is hard-coded for local testing; in production the
    // master should come from spark-submit rather than the code.
    val conf = new SparkConf().setAppName("IpFromCount").setMaster("local[4]")
    val sc = new SparkContext(conf)
    try {
      // Load the (startIp, endIp, province) rules from the driver's file system
      val rules: Array[(Long, Long, String)] = IpFromUtils.rules(args(0))
      // Broadcast once so every executor shares a single read-only copy
      val broadCast: Broadcast[Array[(Long, Long, String)]] = sc.broadcast(rules)
      // Read the raw log lines
      val data: RDD[String] = sc.textFile(args(1))
      // Extract the IP from each line and map it to (province, 1)
      val provinceData: RDD[(String, Int)] = data.map { line =>
        val fields: Array[String] = line.split("[|]")
        // presumably field 1 is the client IP — matches IpFromUtils expectations
        val ipNum: Long = IpFromUtils.ipToLong(fields(1))
        val ranges: Array[(Long, Long, String)] = broadCast.value
        val index: Int = IpFromUtils.binarySearch(ranges, ipNum)
        // -1 means the IP fell outside every known range ("未知地区" = unknown region)
        val province = if (index != -1) ranges(index)._3 else "未知地区"
        (province, 1)
      }
      // Sum the counts per province, then sort descending by count
      val sorted: RDD[(String, Int)] =
        provinceData.reduceByKey(_ + _).sortBy(_._2, ascending = false)
      println(sorted.collect().toBuffer)
    } finally {
      // Always release the SparkContext, even if an earlier stage fails
      sc.stop()
    }
  }
}