Table of contents
1. Demand background
2. Solutions
3. Code pom reference
4. Code integration UDF
5. Compile the jar package
6. Instructions for use
7. Put the jar package in hdfs
8. Create a persistent function
9. Test
10. Epilogue
11. Problems encountered
1. Demand background
There are longitude and latitude in the user behavior buried point data, and the address information of the data needs to be analyzed through hive.
2. Solutions
Through the custom UDF function of HIVE, the longitude and latitude can be converted into address information and integrated with Baidu interface query.
3. Code pom reference
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.dd.xinwen.hive.udf</groupId>
    <artifactId>dd-hiveudf</artifactId>
    <version>1.0-SNAPSHOT</version>
    <name>dd-hiveudf</name>
    <packaging>jar</packaging>
    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <hive.version>2.3.3</hive.version>
        <fastjson.version>1.2.28</fastjson.version>
    </properties>
    <dependencies>
        <!-- Hive dependency; 'provided' because the Hive runtime supplies it,
             which keeps hive-exec (and its transitive tree) out of the shaded
             jar and shrinks the artifact dramatically. -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.2</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <!-- Strip signature files from bundled jars; a stale
                                         signature in a shaded jar causes
                                         "Invalid signature file digest" at load time.
                                         Fixed: was META-INF/*/RSA, which never matches
                                         the actual META-INF/*.RSA entries. -->
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
4. Code integration UDF
package com.dd.xinwen.hive.udf;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
public class MapBaiduAddressUDF extends UDF {

    /**
     * Hive UDF entry point: converts a latitude/longitude pair to a formatted
     * address string by calling the Baidu Map web API.
     *
     * @param lat latitude as a string, e.g. "31.931"
     * @param lng longitude as a string, e.g. "120.961"
     * @return the resolved address wrapped in a {@code Text}; an empty
     *         {@code Text} when the lookup fails; {@code null} (SQL NULL)
     *         when either coordinate is {@code null}
     */
    public Text evaluate(String lat, String lng) {
        // Null coordinates cannot be resolved; Hive maps a null return to NULL.
        if (lat == null || lng == null) {
            return null;
        }
        String address = "";
        try {
            address = getLocationByBaiduMap(lng, lat);
        } catch (Exception e) {
            // Best effort: a single failed HTTP lookup must not abort the
            // whole Hive query, so log it and fall back to an empty string.
            e.printStackTrace();
        }
        return new Text(address);
    }

    /** Manual smoke test against the live Baidu API (requires a valid AK). */
    public static void main(String[] args) {
        String lat = "31.931";
        String lng = "120.961";
        Text evaluate = new MapBaiduAddressUDF().evaluate(lat, lng);
        System.out.println(evaluate.toString());
    }

    /**
     * Resolves a coordinate to an address via two Baidu API calls: first a
     * coordinate-system conversion (from=1 to=5, i.e. into Baidu's bd09ll
     * system), then a reverse-geocoding lookup on the converted point.
     *
     * @param longitude longitude as a string
     * @param latitude  latitude as a string
     * @return the {@code formatted_address} field of the reverse-geocoding result
     * @throws Exception on network failure or unexpected response JSON
     */
    public static String getLocationByBaiduMap(String longitude, String latitude) throws Exception {
        String ak = "你的百度地图AK,申请一个吧"; // TODO: replace with your own Baidu Map AK
        // Step 1: convert the raw coordinate into Baidu's coordinate system.
        String locJson = geturl("http://api.map.baidu.com/geoconv/v1/?coords=" + longitude + "," + latitude + "&from=1&to=5&ak=" + ak);
        System.out.println(locJson);
        JSONObject jobject = JSON.parseObject(locJson);
        JSONArray jsonArray = jobject.getJSONArray("result");
        String lat = jsonArray.getJSONObject(0).getString("y");
        String lng = jsonArray.getJSONObject(0).getString("x");
        // Step 2: reverse-geocode the converted coordinate (note: lat comes first).
        String addrJson = geturl("http://api.map.baidu.com/reverse_geocoding/v3/?ak=" + ak + "&location=" + lat + "," + lng + "&output=json&pois=1");
        System.out.println(addrJson);
        JSONObject jobjectaddr = JSON.parseObject(addrJson);
        JSONObject rJsonObject = jobjectaddr.getJSONObject("result");
        System.out.println(rJsonObject.getJSONObject("addressComponent").getString("city"));
        System.out.println(rJsonObject.getJSONObject("addressComponent").getString("province"));
        return rJsonObject.getString("formatted_address");
    }

    /**
     * Performs the HTTP request and returns the response body as a string.
     *
     * Fix over the original: every stream is closed via try-with-resources and
     * the connection is released in a finally block. The original leaked the
     * output stream, input stream, reader, and connection on every call —
     * fatal for a UDF that is invoked once per row.
     *
     * @param geturl the fully built request URL
     * @return the response body, decoded as UTF-8
     * @throws Exception on any connection or I/O error
     */
    private static String geturl(String geturl) throws Exception {
        URL url = new URL(geturl);
        HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
        try {
            // Kept as POST with a form body to preserve the original request shape.
            httpURLConnection.setRequestMethod("POST");
            httpURLConnection.setRequestProperty("Content-type", "application/x-www-form-urlencoded");
            httpURLConnection.setDoOutput(true);
            httpURLConnection.setDoInput(true);
            String content = "user_id=" + URLEncoder.encode("用户Id", "utf-8");
            try (OutputStream outputStream = httpURLConnection.getOutputStream()) {
                // Explicit charset; the original getBytes() used the platform default.
                outputStream.write(content.getBytes("utf-8"));
            }
            StringBuilder buffer = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(httpURLConnection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    buffer.append(line);
                }
            }
            return buffer.toString();
        } finally {
            httpURLConnection.disconnect();
        }
    }
}
The other two Java classes, MapBaiduCityUDF and MapBaiduProvinceUDF, can be implemented in the same way as MapBaiduAddressUDF (returning the "city" and "province" fields of addressComponent instead of formatted_address).
5. Compile the jar package
dd-hiveudf-1.0-SNAPSHOT.jar
About 85.5MB
6. Instructions for use
We need to create persistent functions (Permanent Functions) and use the jar on hdfs.
This method is recommended for the production environment, which is easy to manage and use.
7. Put the jar package in hdfs
Copy the jar to hdfs:///warehouse/dd/auxlib/dd-hiveudf-1.0-SNAPSHOT.jar
8. Create a persistent function
execute in hive
create function gotBaiduAddr as 'com.dd.xinwen.hive.udf.MapBaiduAddressUDF' USING JAR 'hdfs:///warehouse/dd/auxlib/dd-hiveudf-1.0-SNAPSHOT.jar';
create function gotBaiduCity as 'com.dd.xinwen.hive.udf.MapBaiduCityUDF' USING JAR 'hdfs:///warehouse/dd/auxlib/dd-hiveudf-1.0-SNAPSHOT.jar';
create function gotBaiduProvince as 'com.dd.xinwen.hive.udf.MapBaiduProvinceUDF' USING JAR 'hdfs:///warehouse/dd/auxlib/dd-hiveudf-1.0-SNAPSHOT.jar';
9. Test
execute in hive
select gotbaiduaddr('31.931','120.961');
select gotbaiducity('31.931','120.961');
select gotbaiduprovince('31.931','120.961');
10. Epilogue
Create Temporary Functions
Official reference: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateFunction
Disadvantage: Temporary Functions are only valid for the current session (window)
Example: Execute in Hive's Shell
ADD JAR /home/hadoop/lib/g6-hadoop-udf.jar;
CREATE TEMPORARY FUNCTION sayHello AS 'com.ruozedata.hadoop.udf.HelloUDF';
11. Problems encountered
Problem 1: Hive fails to create the permanent function.
Solution: rebuild the jar package following the packaging steps above (see also the referenced CSDN tutorial), then re-run the CREATE FUNCTION statement.