Hive custom UDF: converting latitude and longitude into province and city addresses

Table of contents

1. Requirement background

2. Solution

3. Code: pom reference

4. Code: the UDF class

5. Compile the jar package

6. Instructions for use

7. Put the jar package on HDFS

8. Create persistent functions

9. Test

10. Epilogue

11. Problems encountered


1. Requirement background

The user-behavior tracking (buried-point) data contains latitude and longitude fields, and the corresponding address information needs to be derived when the data is analyzed in Hive.

2. Solution

Write a custom Hive UDF that converts latitude and longitude into address information by calling the Baidu Maps API.

3. Code: pom reference

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.dd.xinwen.hive.udf</groupId>
    <artifactId>dd-hiveudf</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>dd-hiveudf</name>
    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>


    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <hive.version>2.3.3</hive.version>
        <fastjson.version>1.2.28</fastjson.version>
    </properties>
    <packaging>jar</packaging>
    <dependencies>
        <!-- add the Hive dependency -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.2</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

4. Code: the UDF class

package com.dd.xinwen.hive.udf;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;

public class MapBaiduAddressUDF extends UDF {
    public Text evaluate(String lat, String lng) {
        String s = "";
        if (null == lat || null == lng) {
            return null;
        }
        try {
            s = getLocationByBaiduMap(lng, lat);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return new Text(s);
    }

    // Simple local test
    public static void main(String[] args) {
        String lat = "31.931";
        String lng = "120.961";
        Text evaluate = new MapBaiduAddressUDF().evaluate(lat, lng);
        System.out.println(evaluate.toString());
    }


    public static String getLocationByBaiduMap(String longitude, String latitude) throws Exception {
        String ak = "your Baidu Maps AK -- apply for one";

        // Convert the raw GPS coordinates (from=1) to Baidu coordinates (to=5)
        String locJson = geturl("http://api.map.baidu.com/geoconv/v1/?coords=" + longitude + "," + latitude + "&from=1&to=5&ak=" + ak);
        System.out.println(locJson);

        JSONObject jobject = JSON.parseObject(locJson);
        JSONArray jsonArray = jobject.getJSONArray("result");
        String lat = jsonArray.getJSONObject(0).getString("y");
        String lng = jsonArray.getJSONObject(0).getString("x");

        // Reverse-geocode the converted coordinates into an address
        String addrJson = geturl("http://api.map.baidu.com/reverse_geocoding/v3/?ak=" + ak + "&location=" + lat + "," + lng + "&output=json&pois=1");
        System.out.println(addrJson);

        JSONObject jobjectaddr = JSON.parseObject(addrJson);
        JSONObject rJsonObject = jobjectaddr.getJSONObject("result");
        System.out.println(rJsonObject.getJSONObject("addressComponent").getString("city"));
        System.out.println(rJsonObject.getJSONObject("addressComponent").getString("province"));

        String addr = rJsonObject.getString("formatted_address");
        return addr;
    }

    private static String geturl(String geturl) throws Exception {
        // URL of the web service to request
        URL url = new URL(geturl);
        // Open the HTTP connection
        HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
        // Set the request method
        httpURLConnection.setRequestMethod("POST");
        // Set the request content type
        httpURLConnection.setRequestProperty("Content-type", "application/x-www-form-urlencoded");
        // Allow writing a request body
        httpURLConnection.setDoOutput(true);
        // Allow reading the response
        httpURLConnection.setDoInput(true);
        // Send the request body through the output stream
        OutputStream outputStream = httpURLConnection.getOutputStream();
        String content = "user_id=" + URLEncoder.encode("userId", "utf-8");
        outputStream.write(content.getBytes());
        // Read the response line by line
        InputStream inputStream = httpURLConnection.getInputStream();
        BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        StringBuilder buffer = new StringBuilder();
        String line;
        while ((line = in.readLine()) != null) {
            buffer.append(line);
        }
        return buffer.toString();
    }
}

The other two classes, MapBaiduCityUDF and MapBaiduProvinceUDF, can be implemented the same way; only the JSON field pulled from addressComponent changes. A sketch of MapBaiduCityUDF follows.
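As an illustration, a minimal sketch of what MapBaiduCityUDF could look like. This is an assumption, not the original code: getCityByBaiduMap is a name invented here, and it presumes the geturl helper above is made package-private (or copied into this class) so it can be reused.

package com.dd.xinwen.hive.udf;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class MapBaiduCityUDF extends UDF {
    public Text evaluate(String lat, String lng) {
        if (lat == null || lng == null) {
            return null;
        }
        try {
            return new Text(getCityByBaiduMap(lng, lat));
        } catch (Exception e) {
            e.printStackTrace();
            return new Text("");
        }
    }

    // Hypothetical helper: the same two Baidu calls as getLocationByBaiduMap,
    // but it returns the "city" field of addressComponent instead of
    // formatted_address. Assumes MapBaiduAddressUDF.geturl is package-private.
    private static String getCityByBaiduMap(String longitude, String latitude) throws Exception {
        String ak = "your Baidu Maps AK";
        // Coordinate conversion (from=1 to=5), as in MapBaiduAddressUDF
        String locJson = MapBaiduAddressUDF.geturl("http://api.map.baidu.com/geoconv/v1/?coords="
                + longitude + "," + latitude + "&from=1&to=5&ak=" + ak);
        JSONObject loc = JSON.parseObject(locJson).getJSONArray("result").getJSONObject(0);
        // Reverse geocoding, then pick only the city field
        String addrJson = MapBaiduAddressUDF.geturl("http://api.map.baidu.com/reverse_geocoding/v3/?ak=" + ak
                + "&location=" + loc.getString("y") + "," + loc.getString("x") + "&output=json&pois=1");
        return JSON.parseObject(addrJson).getJSONObject("result")
                .getJSONObject("addressComponent").getString("city");
    }
}

MapBaiduProvinceUDF would be identical except that it reads the "province" field.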

5. Compile the jar package

The build produces dd-hiveudf-1.0-SNAPSHOT.jar, about 85.5 MB, since the shade plugin bundles all the dependencies into one fat jar.
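
The jar is built with the standard Maven packaging command; the shade plugin configured above runs in the package phase and produces the shaded jar:

mvn clean package

The result ends up under target/dd-hiveudf-1.0-SNAPSHOT.jar.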

6. Instructions for use

We will create permanent functions backed by the jar stored on HDFS.

This approach is recommended for production environments because the functions persist across sessions and are easy to manage and use.

7. Put the jar package on HDFS

Copy the jar to /warehouse/dd/auxlib/dd-hiveudf-1.0-SNAPSHOT.jar on HDFS.
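
For example, assuming an HDFS client is configured on the machine that holds the jar:

hdfs dfs -mkdir -p /warehouse/dd/auxlib
hdfs dfs -put target/dd-hiveudf-1.0-SNAPSHOT.jar /warehouse/dd/auxlib/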

8. Create persistent functions

Execute in Hive:

create function gotBaiduAddr as 'com.dd.xinwen.hive.udf.MapBaiduAddressUDF' USING JAR 'hdfs:///warehouse/dd/auxlib/dd-hiveudf-1.0-SNAPSHOT.jar';
create function gotBaiduCity as 'com.dd.xinwen.hive.udf.MapBaiduCityUDF' USING JAR 'hdfs:///warehouse/dd/auxlib/dd-hiveudf-1.0-SNAPSHOT.jar';
create function gotBaiduProvince as 'com.dd.xinwen.hive.udf.MapBaiduProvinceUDF' USING JAR 'hdfs:///warehouse/dd/auxlib/dd-hiveudf-1.0-SNAPSHOT.jar';
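
Permanent functions are recorded in the metastore and belong to the database they were created in (from another database they must be referenced as db.gotbaiduaddr). To confirm the registration:

describe function gotbaiduaddr;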

9. Test

Execute in Hive:

select gotbaiduaddr('31.931','120.961');
select gotbaiducity('31.931','120.961');
select gotbaiduprovince('31.931','120.961');
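
In a real job the functions are applied to the coordinate columns of the tracking table. As an illustration only (user_events, lat, and lng are hypothetical names, not from this project):

select gotbaiduaddr(lat, lng) from user_events limit 10;

Note that every row triggers two HTTP requests to the Baidu API, so scanning a large table will be slow and may run into the API's quota limits.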

10. Epilogue

An alternative is a temporary function (Temporary Function).

Official reference: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateFunction

Disadvantage: a temporary function is only valid for the current session (window).

Example, executed in Hive's shell:

    ADD JAR /home/hadoop/lib/g6-hadoop-udf.jar;

    CREATE TEMPORARY FUNCTION sayHello AS 'com.ruozedata.hadoop.udf.HelloUDF';
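
For completeness, either kind of function can be dropped again with the matching standard DDL:

    DROP TEMPORARY FUNCTION sayHello;

    DROP FUNCTION gotBaiduAddr;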

11. Problems encountered

Problem 1: Hive failed to create a permanent function.

This was resolved by rebuilding the jar package following a CSDN tutorial:

"Hive failed to create a permanent function: Failed to register youmeng.finderrorcount using class com.jinghang.hive.MyCoustom" (lkm0522's blog, CSDN)
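
Independent of that post, two general things are worth checking when CREATE FUNCTION ... USING JAR fails: that HiveServer2 can actually read the jar path, from a shell:

hdfs dfs -ls /warehouse/dd/auxlib/dd-hiveudf-1.0-SNAPSHOT.jar

and that long-lived sessions are not holding a stale function registry, in Hive:

reload function;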


Source: https://blog.csdn.net/xieedeni/article/details/121206971