package sparkProgram
import org.apache.spark.sql.{Dataset, SparkSession}
/**
 * @Description Look up the province for each request IP in an access log via a Spark SQL range join
 * @ClassName Spark_Sql_DataSet_IPSearch
 * @Author 黄仁议 <[email protected]>
 * @Version V1.0
 * @Since 1.0
 * @Date 2019/6/4 17:51
 */
object Spark_Sql_DataSet_IPSearch {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder()
      .appName("Spark_Sql_DataSet_IPSearch")
      .master("local[2]")
      .getOrCreate()
    import sparkSession.implicits._

    // Parse the IP dictionary: each line is |-delimited, with the start IP and
    // end IP (already encoded as Longs) at fields 2 and 3 and the province at field 6
    val lines: Dataset[String] = sparkSession.read.textFile("D:\\data\\ip.txt")
    val dictFrame = lines.map(line => {
      val arr = line.split("[|]")
      val startIp = arr(2).toLong
      val endIp = arr(3).toLong
      val province = arr(6)
      (startIp, endIp, province)
    }).toDF("startIp", "endIp", "province")
    // Parse the access log: field 1 holds the client IP, which is converted
    // to a Long so it can be compared against the dictionary ranges
    val logData: Dataset[String] = sparkSession.read.textFile("D:\\data\\http.log")
    val logFrame = logData.map(log => {
      val arr = log.split("[|]")
      val ip = arr(1)
      ipToLong(ip)
    }).toDF("ip")

    // Register both DataFrames as temporary views
    dictFrame.createOrReplaceTempView("t_dict")
    logFrame.createOrReplaceTempView("t_log")
    // Non-equi join: a log IP matches a dictionary row when it falls inside
    // the [startIp, endIp] range; then count requests per province
    sparkSession.sql("SELECT province, count(*) counts FROM t_dict JOIN t_log ON (ip >= startIp" +
      " AND ip <= endIp) GROUP BY province ORDER BY counts DESC").show()

    sparkSession.stop()
  }
  // Convert a dotted-quad IP string to a Long by shifting in one octet at a time
  def ipToLong(ip: String): Long = {
    val fragments: Array[String] = ip.split("[.]")
    var ipNum = 0L
    for (i <- 0 until fragments.length) {
      ipNum = fragments(i).toLong | ipNum << 8L
    }
    ipNum
  }
}
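For a quick sanity check, ipToLong can be exercised outside Spark. The loop is equivalent to folding the four octets into an accumulator with shift-and-or; the expected values below are just the base-256 expansions of the addresses. A minimal sketch, runnable in any Scala REPL:

// Standalone check of the shift-and-or conversion.
// "1.2.3.4" accumulates as ((1*256 + 2)*256 + 3)*256 + 4 = 16909060.
def ipToLong(ip: String): Long =
  ip.split("[.]").foldLeft(0L)((num, octet) => (num << 8) | octet.toLong)

assert(ipToLong("1.2.3.4") == 16909060L)
assert(ipToLong("0.0.0.0") == 0L)
assert(ipToLong("255.255.255.255") == 4294967295L)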
Result:
+--------+------+
|province|counts|
+--------+------+
| 陕西| 1824|
| 北京| 1535|
| 重庆| 868|
| 河北| 383|
| 云南| 126|
+--------+------+
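For reference, the same range join can be written with the DataFrame API instead of SQL. This is a sketch assuming the dictFrame and logFrame built above; note that a join with no equality predicate like this one is executed by Spark as a nested-loop join (typically broadcasting the smaller dictionary side), which is fine for a small IP dictionary but expensive at scale:

import org.apache.spark.sql.functions.desc

// Match a log IP when it falls inside a dictionary row's [startIp, endIp] range
val joined = logFrame.join(dictFrame,
  logFrame("ip") >= dictFrame("startIp") && logFrame("ip") <= dictFrame("endIp"))

// Count matches per province, highest first (mirrors the SQL above)
joined.groupBy("province")
  .count()
  .withColumnRenamed("count", "counts")
  .orderBy(desc("counts"))
  .show()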
Spark case 3: implemented with Spark_Sql_DataSet
Reposted from blog.csdn.net/weixin_43562705/article/details/90804378