Spark Case 3: Implemented with Spark SQL Dataset
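This case joins HTTP access logs against an IP-range dictionary using Spark SQL: both files are read as Datasets, converted to DataFrames, registered as temporary views, and combined with a non-equi join to count hits per province.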

package sparkProgram
import org.apache.spark.sql.{Dataset, SparkSession}
/**
  * @Author 黄仁议 <[email protected]>
  * @ClassName Spark_Sql_DataSet_IPSearch
  * @Version V1.0
  * @Since 1.0
  * @Date 2019/6/4 17:51
  */
object Spark_Sql_DataSet_IPSearch {
  def main(args: Array[String]): Unit = {

    // local[2]: run Spark locally with two worker threads
    val sparkSession = SparkSession.builder().appName("Spark_Sql_DataSet_IPSearch").master("local[2]").getOrCreate()
    // required for the .toDF conversions below
    import sparkSession.implicits._
    // Parse the IP dictionary data
    val lines: Dataset[String] = sparkSession.read.textFile("D:\\data\\ip.txt")
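    // Assumed layout (inferred from the parsing below): each dictionary line is
    // pipe-delimited, with fields 2 and 3 (0-based) holding the numeric start/end
    // of an IP range and field 6 holding the province name.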
    val dictFrame = lines.map(line => {
      val arr = line.split("[|]")
      val startIp = arr(2).toLong   // numeric start of the IP range
      val endIp = arr(3).toLong     // numeric end of the IP range
      val province = arr(6)
      (startIp, endIp, province)
    }).toDF("startIp", "endIp", "province")

    // Process the log data
    val logData: Dataset[String] = sparkSession.read.textFile("D:\\data\\http.log")
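    // The log is also assumed to be pipe-delimited, with field 1 (0-based)
    // holding the client IP in dotted-quad form, e.g. "125.213.100.123"
    // (a hypothetical sample, not taken from the original post).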

    val logFrame = logData.map(log => {
      val arr = log.split("[|]")
      val ip = arr(1)
      val ipNum = ipToLong(ip)   // numeric form so it can be range-compared
      ipNum
    }).toDF("ip")

    // Register both DataFrames as temporary views
    dictFrame.createOrReplaceTempView("t_dict")
    logFrame.createOrReplaceTempView("t_log")
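    // Note: a join with only range predicates cannot use a hash join; Spark
    // typically falls back to a broadcast nested-loop join here, so the
    // dictionary side should stay small enough to broadcast.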
    // Non-equi join: match each log IP against the dictionary ranges
    sparkSession.sql("SELECT province, count(*) counts FROM t_dict JOIN t_log ON (ip >= startIp" +
      " AND ip <= endIp) GROUP BY province ORDER BY counts DESC").show()
    sparkSession.stop()

  }
  // Convert a dotted-quad IP string to a Long
  def ipToLong(ip: String): Long = {
    val fragments: Array[String] = ip.split("[.]")
    var ipNum = 0L
    for (i <- 0 until fragments.length) {
      // shift the accumulator left one byte, then OR in the next octet
      ipNum = fragments(i).toLong | ipNum << 8L
    }
    ipNum
  }
}
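For intuition, ipToLong folds the four octets into a single Long, treating them as the digits of a base-256 number. A quick trace (the address is a made-up example, not taken from the dataset):

ipToLong("1.2.3.4")
// 0     << 8 | 1 = 1
// 1     << 8 | 2 = 258
// 258   << 8 | 3 = 66051
// 66051 << 8 | 4 = 16909060   (= 1*256^3 + 2*256^2 + 3*256 + 4)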
Result:
+--------+------+
|province|counts|
+--------+------+
|      陕西|  1824|
|      北京|  1535|
|      重庆|   868|
|      河北|   383|
|      云南|   126|
+--------+------+

Reposted from blog.csdn.net/weixin_43562705/article/details/90804378