In an offline data warehouse, when a fact table is joined with a dimension table you usually only need the data from the dimension table's latest partition. You can write a UDF that returns a table's latest partition. The following code finds the latest partition by listing the table's HDFS directory.
<repositories>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
<repository>
<id>mvnrepository</id>
<url>https://mvnrepository.com/artifact</url>
</repository>
<repository>
<id>aliyun</id>
<url>https://maven.aliyun.com/repository/public</url>
</repository>
<repository>
<id>jboss</id>
<url>http://repository.jboss.com/nexus/content/groups/public</url>
</repository>
</repositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
<hadoop.version>2.7.6</hadoop.version>
<hive.version>2.1.1</hive.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
</dependency>
</dependencies>
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Hive UDF that returns the newest (lexicographically greatest) partition value
 * of a table, by listing the table's directory in the HDFS warehouse.
 *
 * Usage from HiveQL: {@code select getnewest_partition('db.table');}
 *
 * NOTE(review): partition values are compared as strings, so this is only
 * correct for zero-padded, sortable formats such as {@code dt=2021-01-01}.
 */
public class GetNewestPatition extends UDF {

    /** Root of the Hive warehouse on HDFS; table dir is WAREHOUSE_DIR/<db>.db/<table>. */
    private static final String WAREHOUSE_DIR = "/user/hive/warehouse-3.1.1/";

    /**
     * Returns the newest partition value of the given table.
     *
     * @param tableName table reference in {@code db.table} form
     * @return the greatest partition value, or {@code null} (SQL NULL) when the
     *         input is null/malformed, the table has no partitions, or HDFS
     *         cannot be reached. The original code wrapped {@code null} in
     *         {@code new Text(null)}, which throws NullPointerException;
     *         returning plain {@code null} is the correct Hive convention.
     */
    public Text evaluate(Text tableName) {
        if (tableName == null) {
            return null;
        }
        String[] parts = tableName.toString().split("\\.");
        if (parts.length < 2) {
            // Not in "db.table" form; return NULL instead of throwing AIOOBE.
            return null;
        }
        String tablePath = WAREHOUSE_DIR + parts[0] + ".db/" + parts[1];
        try {
            String newestPartition = getFileList(tablePath);
            return newestPartition == null ? null : new Text(newestPartition);
        } catch (Exception e) {
            // A UDF should not kill the whole query on a lookup failure; log
            // the full path and exception (not just getMessage, which may be null).
            System.out.println("Failed to get newest partition for " + tablePath + ": " + e);
            return null;
        }
    }

    /**
     * Lists the children of {@code path} on HDFS and returns the greatest
     * partition value, i.e. the text after '=' in names like {@code dt=2021-01-01}.
     *
     * @param path absolute HDFS directory of the table
     * @return the lexicographically greatest partition value, or {@code null}
     *         if the directory contains no partition-style children
     * @throws Exception on any HDFS access failure
     */
    public static String getFileList(String path) throws Exception {
        // Configuration(false): skip loading *-site.xml defaults; the NameNode
        // address is hard-coded below. "fs.default.name" kept for compatibility
        // with the cluster's Hadoop version (fs.defaultFS is the modern key).
        Configuration conf = new Configuration(false);
        conf.set("fs.default.name", "hdfs://192.168.235.66:8020/");
        FileSystem hdfs = FileSystem.get(URI.create(path), conf);
        FileStatus[] statuses = hdfs.listStatus(new Path(path));
        Path[] children = FileUtil.stat2Paths(statuses);
        List<String> partitions = new ArrayList<String>();
        for (Path child : children) {
            String name = child.toString();
            int eq = name.indexOf('=');
            // Skip non-partition entries (e.g. _SUCCESS files) instead of
            // throwing ArrayIndexOutOfBoundsException on split("=")[1].
            if (eq >= 0) {
                partitions.add(name.substring(eq + 1));
            }
        }
        return partitions.isEmpty() ? null : Collections.max(partitions);
    }
}
Then package it with maven and upload it to HDFS
hdfs dfs -put hive-UDF-1.0.0-1.0-SNAPSHOT.jar /user/hive/udf/
Log in to the hive client
create function getnewest_partition as 'com.fuyun.udf.GetNewestPatition' using jar 'hdfs:/user/hive/udf/hive-UDF-1.0.0-1.0-SNAPSHOT.jar';
select getnewest_partition('temp.temp_partition1_tb');