hive:自定义函数UDF

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/weixin_38750084/article/details/86028352

1.创建maven项目

2.jdk用的1.8

3.代码:

java:

package com.huayong;

import org.apache.hadoop.hive.ql.exec.UDF;

import java.util.HashSet;
import java.util.Set;

/**
 * Created by tang on 2019/01/07
 */
/**
 * Simple Hive UDF that appends the literal suffix "____udf" to a string column.
 * Registered in Hive with:
 *   create temporary function doubleMinSalary as 'com.huayong.Udf_doubleMinSalary';
 *
 * NOTE(review): org.apache.hadoop.hive.ql.exec.UDF is deprecated in newer Hive
 * releases in favor of GenericUDF; kept because this targets Hive 1.1.0.
 */
public class Udf_doubleMinSalary extends UDF {

    /**
     * Appends "____udf" to the input value.
     *
     * @param a the input column value; Hive passes null for SQL NULL
     * @return the input with "____udf" appended, or null when the input is null
     *         (NULL-in/NULL-out, the usual Hive UDF convention; the original
     *         code produced the misleading string "null____udf" for NULL input)
     */
    public String evaluate(String a) {
        if (a == null) {
            return null;
        }
        return a + "____udf";
    }

    /** Local smoke test only; never invoked by Hive. */
    public static void main(String[] args) {
        // evaluate is an instance method, so an instance is required here.
        System.out.println(new Udf_doubleMinSalary().evaluate("6"));
    }
}

pom:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>huayong</groupId>
  <artifactId>udf_doubleMinSalary</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <dependencies>
    <!-- hive-exec and hadoop-common are already on the cluster classpath when
         the UDF jar is loaded with ADD JAR, so mark them "provided" to keep
         them (and their very large transitive trees) out of the shaded jar. -->
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>1.1.0</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.6.0</version>
      <scope>provided</scope>
    </dependency>
    <!-- junit is only needed at test time; never ship it in the UDF jar. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
    <!-- Workaround so IDE/JDK-8 builds can resolve tools.jar. -->
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.8</version>
      <scope>system</scope>
      <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.2</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <!-- Strip signature files from shaded dependencies; otherwise the
                   jar can be rejected with a SecurityException at load time. -->
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

</project>

4.打包:项目上右键---run as----maven install

5.上传到linux一个目录如:/var/lib/hadoop-hdfs/spride_sqoop_beijing/udf_jar

6.创建udf函数:

-- Load the UDF jar into the current Hive session (session-scoped; must be
-- re-run in every new session, unlike a permanent function).
add jar /var/lib/hadoop-hdfs/spride_sqoop_beijing/udf_jar/udf_doubleMinSalary-0.0.1-SNAPSHOT.jar;

 

创建一个临时函数，函数名为：doubleMinSalary

-- Temporary function: visible only in this session, bound to the UDF class
-- from the jar added above.
create temporary function doubleMinSalary as 'com.huayong.Udf_doubleMinSalary';

 

 

udf 创建永久函数:

先把包传到hdfs:
# Copy the jar to HDFS first — a permanent function's USING JAR path must be
# reachable by every HiveServer2 / execution node, not just this machine.
hadoop fs -put /var/lib/hadoop-hdfs/spride_sqoop_beijing/udf_jar/udf_hive2kafka-0.0.1-SNAPSHOT.jar /user/hive/warehouse/ods.db/udf_jar/udf_hive2kafka-0.0.1-SNAPSHOT.jar

然后创建永久函数
-- Permanent function: registered in the metastore and available to all
-- sessions; Hive fetches the jar from HDFS on demand.
-- NOTE(review): the class name is spelled 'Kakfa', not 'Kafka' — confirm it
-- matches the actual class in the jar before running.
CREATE FUNCTION udf_hive2kafka  AS 'com.huayong.Hive2KakfaUDF'
 USING JAR 'hdfs:///user/hive/warehouse/ods.db/udf_jar/udf_hive2kafka-0.0.1-SNAPSHOT.jar';

show functions;

执行sql如：

-- Push up to 10000 contact rows to Kafka via the UDF, fanned out across 100
-- hash buckets (g) so each collect_list stays bounded.
-- NOTE(review): LIMIT without ORDER BY makes the sampled 10000 rows
-- nondeterministic — presumably acceptable here; confirm with the owner.
SELECT
    g,
    default.udf_hive2kafka(
        'lienidata001:9092',
        'bobizlist_tangzhanbo',
        collect_list(map(
            'bo_id', bo_id,
            'full_name', full_name,
            'simple_name', simple_name,
            'source', source,
            'company_id', company_id,
            'contact', contact,
            'position', position,
            'mobile_phone', mobile_phone,
            'phone', phone,
            'email', email,
            'contact_source', contact_source,
            'request_host', request_host,
            'request_url', request_url,
            'insert_time', insert_time
        ))
    ) AS result
FROM (
    -- Sample the source table and assign each row a bucket 0..99 by hashing r1.
    SELECT
        r1,
        pmod(ABS(hash(r1)), 100) AS g,
        bo_id,
        full_name,
        simple_name,
        source,
        company_id,
        contact,
        position,
        mobile_phone,
        phone,
        email,
        contact_source,
        request_host,
        request_url,
        insert_time
    FROM dws_bo_final_spider_contact
    LIMIT 10000
) bucketed
GROUP BY g;

猜你喜欢

转载自blog.csdn.net/weixin_38750084/article/details/86028352