In my last article I used MaxMind's free database to build a waterdrop plug-in. Testing revealed that some Chinese city-level data was missing or inaccurate, and Hong Kong was not displayed as part of China, which is unfriendly.
After searching around for a bit, I found ip2region, which is very good: https://github.com/lionsoul2014/ip2region
I switched to this library, reused my previous code with slight modifications, and tested it — the results are good. There are essentially no empty values anymore.
Regarding query efficiency: as the author states, memsearch is the fastest mode, and my tests confirm this. However, when a Spark stream first starts, the efficiency of the first few batches is somewhat lower; it gradually climbs afterwards.
package com.student

import com.typesafe.config.{Config, ConfigFactory}
import io.github.interestinglab.waterdrop.apis.BaseFilter
import org.apache.spark.SparkFiles
import org.apache.spark.sql.functions.{col, split, udf}
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.lionsoul.ip2region.{DbConfig, DbSearcher}

import scala.collection.JavaConverters._
import scala.util.control.NonFatal

/**
 * Holds one lazily initialized ip2region `DbSearcher` per executor JVM.
 *
 * `@transient lazy val` is the standard Spark pattern for a non-serializable
 * resource: the searcher itself is never shipped inside a task closure; each
 * executor constructs its own instance on first use, reading the database
 * file that was distributed via `SparkFiles` (the file "ip2region.db" must
 * have been registered with `SparkContext.addFile` upstream).
 */
object SearcherWrapper extends Serializable {
  @transient lazy val searcher: DbSearcher = {
    val config = new DbConfig
    val dbFile = SparkFiles.get("ip2region.db")
    new DbSearcher(config, dbFile)
  }
}

/**
 * Waterdrop filter that enriches each row with geo columns resolved from an
 * IP-address column using the ip2region in-memory database.
 *
 * Configuration options:
 *   - source_field (required; defaults to "raw_message"): column holding the IP.
 *   - target_field (defaulted to "__ROOT__"): kept for interface compatibility.
 *
 * Output columns added by [[process]]:
 *   __region__, __country__, __province__, __city__, __isp__
 */
class IP2Region2 extends BaseFilter {

  var config: Config = ConfigFactory.empty()

  /** Set Config. */
  override def setConfig(config: Config): Unit = {
    this.config = config
  }

  /** Get Config. */
  override def getConfig(): Config = this.config

  /**
   * Checks that every required option is present.
   *
   * @return (true, "") when the config is valid, otherwise (false, message).
   */
  override def checkConfig(): (Boolean, String) = {
    val requiredOptions = List("source_field")
    // Direct filter on the option names; no need to build (name, Boolean) pairs.
    val missingOptions = requiredOptions.filterNot(config.hasPath)
    if (missingOptions.isEmpty) {
      (true, "")
    } else {
      (false, "please specify setting as non-empty string")
    }
  }

  /** Fills in defaults for options the user did not set. */
  override def prepare(spark: SparkSession): Unit = {
    // Explicit .asJava (JavaConverters) replaces the deprecated implicit
    // JavaConversions that the original relied on.
    val defaultConfig = ConfigFactory.parseMap(
      Map(
        "source_field" -> "raw_message",
        "target_field" -> "__ROOT__"
      ).asJava
    )
    config = config.withFallback(defaultConfig)
  }

  /**
   * Adds the region columns to the dataset.
   *
   * ip2region's region string has the layout
   * "country|area|province|city|isp", which is why the indices below are
   * 0 (country), 2 (province), 3 (city) and 4 (isp).
   */
  override def process(spark: SparkSession, df: Dataset[Row]): Dataset[Row] = {
    val srcField = config.getString("source_field")
    val ip2region = udf { ip: String => ip2Location2(ip) }
    // Hoist the split expression instead of re-spelling it for every column.
    val regionParts = split(col("__region__"), "\\|")
    df.withColumn("__region__", ip2region(col(srcField)))
      .withColumn("__country__", regionParts(0))
      .withColumn("__province__", regionParts(2))
      .withColumn("__city__", regionParts(3))
      .withColumn("__isp__", regionParts(4))
  }

  /**
   * Looks up the region string for one IP via the shared in-memory searcher.
   *
   * Best-effort by design: any lookup failure (malformed IP, db problem)
   * yields "" instead of failing the whole batch. `NonFatal` is used so that
   * truly fatal errors (OOM, interrupts) still propagate.
   */
  def ip2Location2(ip: String): String = {
    try {
      SearcherWrapper.searcher.memorySearch(ip).getRegion
    } catch {
      case NonFatal(_) => ""
    }
  }
}