geoip ip2region2 with spark

In the previous article I used MaxMind's free database through a Waterdrop plugin I developed. Testing showed that some domestic city-level results were missing or inaccurate, and Hong Kong was not shown as part of China, which is unfriendly.

After searching around for a while, I found ip2region, which works very well: https://github.com/lionsoul2014/ip2region

I switched to this library, reused my previous code with slight modifications, and tested it — the results are good. There are basically no empty values anymore.

Regarding query efficiency: as the author states, memorySearch is the fastest mode, and my tests confirm it. However, at the start of a Spark streaming job the first few batches are somewhat slower; throughput gradually rises as the job warms up.

package com.student

import com.typesafe.config.{Config, ConfigFactory}
import io.github.interestinglab.waterdrop.apis.BaseFilter
import org.apache.spark.SparkFiles
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.functions.{col, udf}
import org.lionsoul.ip2region.{DbConfig, DbSearcher}

// NOTE(review): JavaConversions is deprecated since Scala 2.12; it is kept because
// ConfigFactory.parseMap relies on the implicit Scala->Java map conversion.
// Consider migrating to scala.collection.JavaConverters (.asJava) when convenient.
import scala.collection.JavaConversions._
import scala.util.control.NonFatal


/**
  * Holds a single per-JVM ip2region searcher.
  *
  * `@transient lazy val` ensures the searcher is never serialized with a Spark
  * closure; instead each executor builds its own instance on first use, reading
  * the database file distributed via `SparkFiles` (i.e. `--files ip2region.db`).
  */
object SearcherWrapper extends Serializable {
  @transient lazy val searcher: DbSearcher =
    new DbSearcher(new DbConfig, SparkFiles.get("ip2region.db"))
}



/**
  * Waterdrop filter plugin that enriches rows with geo information resolved
  * from an IP column using the ip2region database.
  *
  * Adds the columns `__region__`, `__country__`, `__province__`, `__city__`
  * and `__isp__` to the incoming Dataset.
  */
class IP2Region2 extends BaseFilter {

  var config: Config = ConfigFactory.empty()

  /**
    * Set Config.
    **/
  override def setConfig(config: Config): Unit = {
    this.config = config
  }

  /**
    * Get Config.
    **/
  override def getConfig(): Config = {
    this.config
  }

  /**
    * Validate the plugin configuration.
    *
    * @return (true, "") when every required option is present, otherwise
    *         (false, message) naming the missing option(s).
    */
  override def checkConfig(): (Boolean, String) = {
    val requiredOptions = List("source_field")
    // Collect the names of required options that are absent from the config.
    val missingOptions = requiredOptions.filterNot(config.hasPath)

    if (missingOptions.isEmpty) {
      (true, "")
    } else {
      (false, s"please specify [${missingOptions.mkString(", ")}] as non-empty string")
    }
  }

  /**
    * Merge user-supplied config with defaults before processing starts.
    * Defaults: read IPs from "raw_message"; "target_field" is accepted for
    * interface compatibility (the output columns are fixed — see process).
    */
  override def prepare(spark: SparkSession): Unit = {
    val defaultConfig = ConfigFactory.parseMap(
      Map(
        "source_field" -> "raw_message",
        "target_field" -> "__ROOT__"
      )
    )

    // User settings win; defaults only fill the gaps.
    config = config.withFallback(defaultConfig)
  }

  /**
    * Resolve each row's IP (taken from `source_field`) and append the region
    * columns.
    *
    * ip2region returns "country|area|province|city|isp", so after splitting on
    * '|' the indices used are 0 (country), 2 (province), 3 (city), 4 (isp);
    * index 1 (area) is intentionally skipped.
    */
  override def process(spark: SparkSession, df: Dataset[Row]): Dataset[Row] = {
    import org.apache.spark.sql.functions.split

    val srcField = config.getString("source_field")

    val ip2region = udf { ip: String => ip2Location2(ip) }

    // Hoist the split expression instead of rebuilding it for every column.
    val regionParts = split(col("__region__"), "\\|")

    df.withColumn("__region__", ip2region(col(srcField)))
      .withColumn("__country__", regionParts(0))
      .withColumn("__province__", regionParts(2))
      .withColumn("__city__", regionParts(3))
      .withColumn("__isp__", regionParts(4))
  }

  /**
    * Look up an IP with the shared in-memory searcher.
    *
    * @param ip IPv4 address string to resolve
    * @return the raw region string "country|area|province|city|isp", or ""
    *         when the lookup fails (malformed IP, unreadable db, ...) so a bad
    *         row never crashes the whole Spark job. Fatal JVM errors (OOM,
    *         thread interruption) are deliberately NOT swallowed.
    */
  def ip2Location2(ip: String): String = {
    try {
      SearcherWrapper.searcher.memorySearch(ip).getRegion
    } catch {
      case NonFatal(_) => ""
    }
  }

}

 

You may also like

Origin www.cnblogs.com/huaxiaoyao/p/12104587.html