Hive permite desarrollar UDF mediante dos interfaces: una es org.apache.hadoop.hive.ql.exec.UDF, pensada principalmente para tipos de datos simples (como String, Integer, etc.); la otra es org.apache.hadoop.hive.ql.udf.generic.GenericUDF, que admite tipos de datos complejos (como map) y permite además realizar operaciones de inicialización y de cierre.
1. Código
package com.scb.dss.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import java.io.Serializable;
import java.util.Map;
@Description(name="merge_map",
value="_FUNC_(map1, map2) - Merge map1 and map2, and the last value will be taken for the identical key.",
extended = "SELECT merge_map(map('1', 'a', '2', 'c'),map('1', 'b'));")
public class GenericUDFMergeMap extends GenericUDF implements Serializable {
private transient MapObjectInspector inputMap1;
private transient MapObjectInspector inputMap2;
@Override
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
if(objectInspectors.length != 2){
throw new UDFArgumentLengthException("The function merge_map requires 2 arguments, got " + objectInspectors.length);
}
if (objectInspectors[0].getCategory()!= ObjectInspector.Category.MAP) {
throw new UDFArgumentTypeException(0, "\"map\" expected at function merge_map, but \"" + objectInspectors[0].getCategory().name() + "\" " + "is found.");
}
if (objectInspectors[1].getCategory()!= ObjectInspector.Category.MAP) {
throw new UDFArgumentTypeException(1, "\"map\" expected at function merge_map, but \"" + objectInspectors[1].getCategory().name() + "\" " + "is found.");
}
inputMap1 = (MapObjectInspector) objectInspectors[0];
inputMap2 = (MapObjectInspector) objectInspectors[1];
if (inputMap1.getMapKeyObjectInspector().getCategory() != inputMap2.getMapKeyObjectInspector().getCategory()) {
throw new UDFArgumentTypeException(0, "The key type between map1 and map2 should be consistent. " +
"But key1 is " + inputMap1.getMapKeyObjectInspector().getCategory().name() +
", key2 is " + inputMap2.getMapKeyObjectInspector().getCategory().name());
}
if (inputMap1.getMapValueObjectInspector().getCategory() != inputMap2.getMapValueObjectInspector().getCategory()) {
throw new UDFArgumentTypeException(0, "The value type between map1 and map2 should be consistent. " +
"But value1 is " + inputMap1.getMapValueObjectInspector().getCategory().name() +
", value2 is " + inputMap2.getMapValueObjectInspector().getCategory().name());
}
// 定义返回类型
return ObjectInspectorFactory.getStandardMapObjectInspector(inputMap1.getMapKeyObjectInspector(), inputMap1.getMapValueObjectInspector());
}
@Override
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
Map map1 = inputMap1.getMap(deferredObjects[0].get());
Map map2 = inputMap2.getMap(deferredObjects[1].get());
map1.putAll(map2);
return map1;
}
@Override
public String getDisplayString(String[] strings) {
return null;
}
}
2. Prueba unitaria
package com.scb.dss.udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import java.util.HashMap;
import java.util.Map;
/**
 * JUnit 4 tests for the two-argument {@link GenericUDFMergeMap}.
 */
public class GenericUDFMergeMapTest {
    private final GenericUDFMergeMap genericUDFMergeMap = new GenericUDFMergeMap();
    private final ObjectInspector mapStringStringOI1 = newStringMapInspector();
    private final ObjectInspector mapStringStringOI2 = newStringMapInspector();
    private final ObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(
            PrimitiveObjectInspectorFactory.javaStringObjectInspector);

    @Rule
    public ExpectedException exception = ExpectedException.none();

    /** Builds an inspector for a map with Java String keys and values. */
    private static ObjectInspector newStringMapInspector() {
        return ObjectInspectorFactory.getStandardMapObjectInspector(
                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    }

    /** Builds a HashMap from alternating key/value strings. */
    private static Map<String, String> stringMap(String... keysAndValues) {
        Map<String, String> result = new HashMap<>();
        for (int idx = 0; idx < keysAndValues.length; idx += 2) {
            result.put(keysAndValues[idx], keysAndValues[idx + 1]);
        }
        return result;
    }

    /** Merging two maps keeps the value of the second map for the shared key "1". */
    @Test
    public void testEvaluate() throws Exception {
        genericUDFMergeMap.initialize(new ObjectInspector[] {mapStringStringOI1, mapStringStringOI2});

        GenericUDF.DeferredObject first = new GenericUDF.DeferredJavaObject(stringMap("1", "a", "2", "c"));
        GenericUDF.DeferredObject second = new GenericUDF.DeferredJavaObject(stringMap("1", "b", "3", "d"));

        Map<String, String> actual = (Map<String, String>) genericUDFMergeMap.evaluate(
                new GenericUDF.DeferredObject[] {first, second});
        System.out.println(actual);

        Map<String, String> expected = stringMap("1", "b", "2", "c", "3", "d");
        Assert.assertEquals("Result should be identical", expected, actual);
    }

    /** A list passed as the second argument is rejected with a type exception. */
    @Test
    public void testDifferentType() throws HiveException {
        exception.expect(UDFArgumentTypeException.class);
        exception.expectMessage("\"map\" expected at function merge_map, but \"LIST\" is found.");
        genericUDFMergeMap.initialize(new ObjectInspector[] {mapStringStringOI1, stringListOI});
    }

    /** Three arguments are rejected with a length exception. */
    @Test(expected = UDFArgumentLengthException.class)
    public void testNullTypeFirst() throws HiveException {
        ObjectInspector[] threeArguments = new ObjectInspector[] {mapStringStringOI1, mapStringStringOI2, stringListOI};
        genericUDFMergeMap.initialize(threeArguments);
    }
}
3. Prueba de implementación
-- Show the description registered for merge_map.
desc function extended merge_map;
-- Key '1' appears in both maps; per the function description the last value is taken.
SELECT merge_map(map('1', 'a', '2', 'c'),map('1', 'b'));
-- The second argument is a string, not a map, so this call is rejected by the argument type check.
SELECT merge_map(map('1', 'a', '2', 'c'), '1');
4. Mejora
El código anterior solo puede fusionar dos mapas. Ahora lo mejoramos para que pueda fusionar un número ilimitado de mapas.
package com.scb.dss.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import java.io.Serializable;
import java.util.Map;
@Description(name="merge_map",
value="_FUNC_(map1, map2, ...) - Merge maps, and the last value will be taken for the identical key.",
extended = "SELECT merge_map(map('1', 'a', '2', 'c'),map('1', 'b'));")
public class GenericUDFMergeMap extends GenericUDF implements Serializable {
private transient MapObjectInspector[] inputMaps;
@Override
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
if(objectInspectors.length < 2 || objectInspectors[0] == null){
throw new UDFArgumentLengthException("The function merge_map takes at least 2 arguments, got " + objectInspectors.length + ". And the first map cannot be null");
}
inputMaps = new MapObjectInspector[objectInspectors.length];
inputMaps[0] = (MapObjectInspector) objectInspectors[0];
ObjectInspector baseMapKeyOI = inputMaps[0].getMapKeyObjectInspector();
ObjectInspector baseMapValueOI = inputMaps[0].getMapValueObjectInspector();
for(int i = 1; i < objectInspectors.length; i++) {
if (objectInspectors[i] == null) {
continue;
}
// Check whether the type of the parameter is map
if (objectInspectors[i].getCategory() != ObjectInspector.Category.MAP) {
throw new UDFArgumentTypeException(i, "\"map\" expected at function merge_map, but \"" + objectInspectors[i].getCategory().name() + "\" " + "is found.");
}
MapObjectInspector mapOI = (MapObjectInspector) objectInspectors[i];
// Check the key and value type
if (baseMapKeyOI.getCategory() != mapOI.getMapKeyObjectInspector().getCategory()) {
throw new UDFArgumentTypeException(i, "The key type between map1 and map2 should be consistent. " +
"But key1 is " + baseMapKeyOI.getCategory().name() +
", key2 is " + mapOI.getMapKeyObjectInspector().getCategory().name());
}
if (baseMapValueOI.getCategory() != mapOI.getMapValueObjectInspector().getCategory()) {
throw new UDFArgumentTypeException(i, "The value type between map1 and map2 should be consistent. " +
"But value1 is " + baseMapValueOI.getCategory().name() +
", value2 is " + mapOI.getMapValueObjectInspector().getCategory().name());
}
inputMaps[i] = mapOI;
}
// 定义返回类型
return ObjectInspectorFactory.getStandardMapObjectInspector(baseMapKeyOI, baseMapValueOI);
}
@Override
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
Map baseMap = inputMaps[0].getMap(deferredObjects[0].get());
for(int i = 1; i < deferredObjects.length; i++) {
Map map = inputMaps[i].getMap(deferredObjects[i].get());
baseMap.putAll(map);
}
return baseMap;
}
@Override
public String getDisplayString(String[] strings) {
return "merge_map";
}
}
La prueba es la siguiente:
-- Three maps merged in a single call.
SELECT merge_map(map('1', 'a', '2', 'c'),map('1', 'b'),map('3', 'd'));
5. Mejora adicional: permitir valores nulos
código:
package com.scb.dss.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableVoidObjectInspector;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
@Description(name = "merge_map",
value = "_FUNC_(map1, map2, ...) - Merge maps, and the last value will be taken for the identical key.",
extended = "SELECT merge_map(map('1', 'a', '2', 'c'),map('1', 'b'));")
public class GenericUDFMergeMap extends GenericUDF implements Serializable {
@Override
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
if (objectInspectors.length < 2) {
throw new UDFArgumentLengthException("The function merge_map takes at least 2 arguments, got " + objectInspectors.length);
}
int i = 0;
MapObjectInspector baseMapOI = null;
// get base MapObjectInspector
do {
if (objectInspectors[i] == null || objectInspectors[i] instanceof WritableVoidObjectInspector) {
i++;
continue;
}
if (objectInspectors[i].getCategory() != ObjectInspector.Category.MAP) {
throw new UDFArgumentTypeException(i, "\"map\" expected at function merge_map, but \"" + objectInspectors[i].getCategory().name() + "\" " + "is found.");
}
baseMapOI = (MapObjectInspector) objectInspectors[i++];
} while (baseMapOI == null && i < objectInspectors.length);
for (; i < objectInspectors.length; i++) {
if (objectInspectors[i] == null || objectInspectors[i] instanceof WritableVoidObjectInspector) {
continue;
}
if (objectInspectors[i].getCategory() != ObjectInspector.Category.MAP) {
throw new UDFArgumentTypeException(i, "\"map\" expected at function merge_map, but \"" + objectInspectors[i].getCategory().name() + "\" " + "is found.");
}
MapObjectInspector mapOI = (MapObjectInspector) objectInspectors[i];
// Check the key and value type
if (baseMapOI.getMapKeyObjectInspector().getCategory() != mapOI.getMapKeyObjectInspector().getCategory()) {
throw new UDFArgumentTypeException(i, "The key type between map1 and map2 should be consistent. " +
"But key1 is " + baseMapOI.getMapKeyObjectInspector().getCategory().name() +
", key2 is " + mapOI.getMapKeyObjectInspector().getCategory().name());
}
if (baseMapOI.getMapValueObjectInspector().getCategory() != mapOI.getMapValueObjectInspector().getCategory()) {
throw new UDFArgumentTypeException(i, "The value type between map1 and map2 should be consistent. " +
"But value1 is " + baseMapOI.getMapValueObjectInspector().getCategory().name() +
", value2 is " + mapOI.getMapValueObjectInspector().getCategory().name());
}
}
// 定义返回类型
return baseMapOI;
}
@Override
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
Map baseMap = new HashMap();
for (int i = 0; i < deferredObjects.length; i++) {
if (deferredObjects[i] == null || deferredObjects[i].get() == null) {
continue;
}
baseMap.putAll((Map) deferredObjects[i].get());
}
return baseMap;
}
@Override
public String getDisplayString(String[] strings) {
return "merge_map";
}
}
clase de prueba
package com.scb.dss.udf;
import com.google.common.collect.Maps;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableVoidObjectInspector;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import java.util.HashMap;
import java.util.Map;
/**
 * JUnit 4 tests for the variadic, null-tolerant {@link GenericUDFMergeMap}.
 */
public class GenericUDFMergeMapTest {
    private final GenericUDFMergeMap genericUDFMergeMap = new GenericUDFMergeMap();
    private final ObjectInspector mapStringStringOI = ObjectInspectorFactory.getStandardMapObjectInspector(
            PrimitiveObjectInspectorFactory.javaStringObjectInspector,
            PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    private final ObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(
            PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    private final WritableVoidObjectInspector voidOI = PrimitiveObjectInspectorFactory.writableVoidObjectInspector;

    @Rule
    public ExpectedException exception = ExpectedException.none();

    /** Builds a HashMap from alternating key/value strings. */
    private static Map<String, String> stringMap(String... keysAndValues) {
        Map<String, String> result = new HashMap<>();
        for (int idx = 0; idx < keysAndValues.length; idx += 2) {
            result.put(keysAndValues[idx], keysAndValues[idx + 1]);
        }
        return result;
    }

    /** Wraps a map as a deferred argument for {@code evaluate}. */
    private static GenericUDF.DeferredObject deferred(Map<String, String> value) {
        return new GenericUDF.DeferredJavaObject(value);
    }

    /** Three maps are merged; the later value wins for the shared key "1". */
    @Test
    public void testEvaluate() throws Exception {
        genericUDFMergeMap.initialize(new ObjectInspector[] {mapStringStringOI, mapStringStringOI, mapStringStringOI});

        GenericUDF.DeferredObject[] inputs = new GenericUDF.DeferredObject[] {
                deferred(stringMap("1", "a", "2", "c")),
                deferred(stringMap("1", "b", "3", "d")),
                deferred(stringMap("4", "e"))
        };
        Map<String, String> actual = (Map<String, String>) genericUDFMergeMap.evaluate(inputs);
        System.out.println(actual);

        Map<String, String> expected = stringMap("1", "b", "2", "c", "3", "d", "4", "e");
        Assert.assertEquals("Result should be identical", expected, actual);
    }

    /** A list passed as the second argument is rejected with a type exception. */
    @Test
    public void testDifferentType() throws HiveException {
        exception.expect(UDFArgumentTypeException.class);
        exception.expectMessage("\"map\" expected at function merge_map, but \"LIST\" is found.");
        genericUDFMergeMap.initialize(new ObjectInspector[] {mapStringStringOI, stringListOI});
    }

    /** A single argument is rejected with a length exception. */
    @Test(expected = UDFArgumentLengthException.class)
    public void testArgumentLengthException() throws HiveException {
        ObjectInspector[] singleArgument = new ObjectInspector[] {mapStringStringOI};
        genericUDFMergeMap.initialize(singleArgument);
    }

    /** Null and void arguments mixed with real maps are skipped during the merge. */
    @Test
    public void testNull() throws Exception {
        genericUDFMergeMap.initialize(
                new ObjectInspector[] {null, voidOI, mapStringStringOI, voidOI, null, mapStringStringOI});

        GenericUDF.DeferredObject firstMap = deferred(stringMap("1", "a"));
        GenericUDF.DeferredObject secondMap = deferred(stringMap("1", "b"));
        GenericUDF.DeferredObject[] inputs =
                new GenericUDF.DeferredObject[] {null, null, firstMap, null, null, secondMap};

        Map<String, String> actual = (Map<String, String>) genericUDFMergeMap.evaluate(inputs);
        Assert.assertEquals("Result should be identical", stringMap("1", "b"), actual);
    }

    /** When every argument is null or void, the result is an empty map. */
    @Test
    public void testNull2() throws Exception {
        genericUDFMergeMap.initialize(new ObjectInspector[] {null, voidOI});

        GenericUDF.DeferredObject[] inputs = new GenericUDF.DeferredObject[] {null, null};
        Map<String, String> actual = (Map<String, String>) genericUDFMergeMap.evaluate(inputs);
        Assert.assertEquals("Result should be identical", Maps.newHashMap(), actual);
    }
}