A Hive UDF to get the latest partition of a table

In an offline data warehouse, a fact table often needs to be joined against only the latest partition of a dimension table. A UDF that returns a table's latest partition lets you express this directly in SQL. The code below finds the latest partition by listing the table's directory on HDFS.
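For example, once the function is registered (as shown at the end of this post), the join can filter the dimension table to its latest partition inline. The table and column names here are hypothetical, and dt is assumed to be a string partition column; this is a minimal sketch of the usage:

select f.*, d.product_name
from dw.fact_orders f
join dw.dim_product d
  on f.product_id = d.product_id
where d.dt = getnewest_partition('dw.dim_product');

First, set up a Maven project. The relevant parts of the pom.xml: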

<repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>mvnrepository</id>
            <url>https://mvnrepository.com/artifact</url>
        </repository>
        <repository>
            <id>aliyun</id>
            <url>http://maven.aliyun.com/mvn/view</url>
        </repository>
        <repository>
            <id>jboss</id>
            <url>http://repository.jboss.com/nexus/content/groups/public</url>
        </repository>
    </repositories>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <hadoop.version>2.7.6</hadoop.version>
        <hive.version>2.1.1</hive.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>log4j</groupId>
                    <artifactId>log4j</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
    </dependencies>
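
With the dependencies in place, the UDF class extends org.apache.hadoop.hive.ql.exec.UDF and implements an evaluate method: it maps a "db.table" name to the table's warehouse directory and returns the largest partition value found there. Note that the warehouse root and the namenode address below are environment-specific; adjust them to your cluster.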
package com.fuyun.udf; // must match the class reference in CREATE FUNCTION below

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class UDFLatestPartition extends UDF {

    public Text evaluate(Text tableName) {
        // Input is expected as "db.table"; map it to the table's warehouse
        // directory, e.g. /user/hive/warehouse-3.1.1/db.db/table.
        String[] parts = tableName.toString().split("\\.");
        if (parts.length != 2) {
            return null;
        }
        String tablePath = "/user/hive/warehouse-3.1.1/" + parts[0] + ".db/" + parts[1];

        String newestPartition = null;
        try {
            newestPartition = getFileList(tablePath);
        } catch (Exception e) {
            System.out.println("Failed to get the latest partition: " + e.getMessage());
        }
        // Return SQL NULL if no partition was found.
        return newestPartition == null ? null : new Text(newestPartition);
    }

    public static String getFileList(String path) throws Exception {
        Configuration conf = new Configuration(false);
        // Namenode address is environment-specific.
        conf.set("fs.defaultFS", "hdfs://192.168.235.66:8020/");
        FileSystem hdfs = FileSystem.get(URI.create(path), conf);
        FileStatus[] fs = hdfs.listStatus(new Path(path));
        Path[] listPath = FileUtil.stat2Paths(fs);

        // Partition directories are named like "dt=20200801"; keep the value
        // part and skip anything that is not a partition directory.
        List<String> partitions = new ArrayList<>();
        for (Path p : listPath) {
            String name = p.getName();
            if (name.contains("=")) {
                partitions.add(name.split("=")[1]);
            }
        }
        // Lexicographic max is the latest partition as long as the values
        // share a fixed, sortable format such as yyyyMMdd.
        return partitions.isEmpty() ? null : Collections.max(partitions);
    }
}

Then package the project with Maven (mvn clean package) and upload the resulting jar to HDFS:

hdfs dfs -put hive-UDF-1.0.0-1.0-SNAPSHOT.jar /user/hive/udf/

Then, from the Hive client, register the function:

create function getnewest_partition as 'com.fuyun.udf.UDFLatestPartition' using jar 'hdfs:///user/hive/udf/hive-UDF-1.0.0-1.0-SNAPSHOT.jar';
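
This registers a permanent function backed by the jar on HDFS. For a quick session-scoped test, a temporary function should also work with the same jar and class:

add jar hdfs:///user/hive/udf/hive-UDF-1.0.0-1.0-SNAPSHOT.jar;
create temporary function getnewest_partition as 'com.fuyun.udf.UDFLatestPartition';

Then call it: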

select getnewest_partition('temp.temp_partition1_tb');
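
To sanity-check the result, compare it against Hive's own partition metadata; the UDF should return the largest partition value in this list:

show partitions temp.temp_partition1_tb;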

Origin: blog.csdn.net/lz6363/article/details/108178317