GenericUDF——Combinar mapa

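Hive permite desarrollar UDF mediante dos interfaces: una es org.apache.hadoop.hive.ql.exec.UDF, pensada principalmente para tipos de datos simples (como String, Integer, etc.); la otra es org.apache.hadoop.hive.ql.udf.generic.GenericUDF, pensada principalmente para tipos de datos complejos (como Map, List, Struct, etc.), que además permite realizar operaciones de inicialización y de cierre.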

 1. Código

package com.scb.dss.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;

import java.io.Serializable;
import java.util.Map;

/**
 * Hive generic UDF {@code merge_map(map1, map2)}: merges two maps into one.
 *
 * <p>Entries of {@code map2} override entries of {@code map1} that share the same key.
 * The result is always a newly allocated map, so neither argument is modified.
 * An argument that evaluates to NULL contributes no entries; if both are NULL the
 * result is an empty map.
 */
@Description(name="merge_map",
        value="_FUNC_(map1, map2) - Merge map1 and map2, and the last value will be taken for the identical key.",
        extended = "SELECT merge_map(map('1', 'a', '2', 'c'),map('1', 'b'));")
public class GenericUDFMergeMap extends GenericUDF implements Serializable {
    /** Inspector used to read the first argument; assigned in {@link #initialize}. */
    private transient MapObjectInspector inputMap1;
    /** Inspector used to read the second argument; assigned in {@link #initialize}. */
    private transient MapObjectInspector inputMap2;

    /**
     * Validates the argument inspectors and declares the return type.
     *
     * @param objectInspectors inspectors of the call arguments; exactly two MAP inspectors are accepted
     * @return a standard map inspector built from the first argument's key and value inspectors
     * @throws UDFArgumentLengthException if the number of arguments is not 2
     * @throws UDFArgumentTypeException if an argument is not a map, or the key/value
     *         categories of the two maps differ
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
        if (objectInspectors.length != 2) {
            throw new UDFArgumentLengthException("The function merge_map requires 2 arguments, got " + objectInspectors.length);
        }
        if (objectInspectors[0].getCategory() != ObjectInspector.Category.MAP) {
            throw new UDFArgumentTypeException(0, "\"map\" expected at function merge_map, but \"" + objectInspectors[0].getCategory().name() + "\" " + "is found.");
        }
        if (objectInspectors[1].getCategory() != ObjectInspector.Category.MAP) {
            throw new UDFArgumentTypeException(1, "\"map\" expected at function merge_map, but \"" + objectInspectors[1].getCategory().name() + "\" " + "is found.");
        }

        inputMap1 = (MapObjectInspector) objectInspectors[0];
        inputMap2 = (MapObjectInspector) objectInspectors[1];

        ObjectInspector keyOI1 = inputMap1.getMapKeyObjectInspector();
        ObjectInspector keyOI2 = inputMap2.getMapKeyObjectInspector();
        ObjectInspector valueOI1 = inputMap1.getMapValueObjectInspector();
        ObjectInspector valueOI2 = inputMap2.getMapValueObjectInspector();

        // The first map is the reference type, so a mismatch is reported against argument 1.
        if (keyOI1.getCategory() != keyOI2.getCategory()) {
            throw new UDFArgumentTypeException(1, "The key type between map1 and map2 should be consistent. " +
                    "But key1 is " + keyOI1.getCategory().name() +
                    ", key2 is " + keyOI2.getCategory().name());
        }
        if (valueOI1.getCategory() != valueOI2.getCategory()) {
            throw new UDFArgumentTypeException(1, "The value type between map1 and map2 should be consistent. " +
                    "But value1 is " + valueOI1.getCategory().name() +
                    ", value2 is " + valueOI2.getCategory().name());
        }

        // Define the return type.
        return ObjectInspectorFactory.getStandardMapObjectInspector(keyOI1, valueOI1);
    }

    /**
     * Merges the two argument maps for the current row.
     *
     * @param deferredObjects the two lazily evaluated arguments, in call order
     * @return a new map holding the entries of the first map overlaid with those of the second
     * @throws HiveException if evaluating an argument fails
     */
    @Override
    public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
        // Copy into a fresh map so the maps handed in by the caller are never mutated.
        // Fully qualified to avoid touching the file's import block.
        Map<Object, Object> merged = new java.util.LinkedHashMap<Object, Object>();
        putAllIfPresent(merged, inputMap1.getMap(deferredObjects[0].get()));
        putAllIfPresent(merged, inputMap2.getMap(deferredObjects[1].get()));
        return merged;
    }

    /** Adds every entry of {@code source} to {@code target}; a null source is ignored. */
    private static void putAllIfPresent(Map<Object, Object> target, Map<?, ?> source) {
        if (source != null) {
            target.putAll(source);
        }
    }

    /**
     * Returns the text used to display this function.
     *
     * @param strings display strings of the arguments (not used)
     * @return the function name, never null
     */
    @Override
    public String getDisplayString(String[] strings) {
        return "merge_map";
    }
}

2. Prueba unitaria

package com.scb.dss.udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;

import java.util.HashMap;
import java.util.Map;

/**
 * Unit tests for the two-argument {@code merge_map} UDF: merging behaviour,
 * rejection of non-map arguments and rejection of a wrong argument count.
 */
public class GenericUDFMergeMapTest {
    private final GenericUDFMergeMap genericUDFMergeMap = new GenericUDFMergeMap();
    private final ObjectInspector mapStringStringOI1 = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    private final ObjectInspector mapStringStringOI2 = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    private final ObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

    @Rule
    public ExpectedException exception = ExpectedException.none();

    /** Overlapping keys take the value from the second map; all other entries are kept. */
    @Test
    public void testEvaluate() throws Exception {
        ObjectInspector[] arguments = {mapStringStringOI1, mapStringStringOI2};
        genericUDFMergeMap.initialize(arguments);

        Map<String, String> map1 = new HashMap<>();
        map1.put("1", "a");
        map1.put("2", "c");

        Map<String, String> map2 = new HashMap<>();
        map2.put("1", "b");
        map2.put("3", "d");

        GenericUDF.DeferredObject[] args = {
                new GenericUDF.DeferredJavaObject(map1),
                new GenericUDF.DeferredJavaObject(map2)
        };
        // Compared as Object: Map.equals does the work and no unchecked cast is needed.
        Object actual = genericUDFMergeMap.evaluate(args);

        Map<String, String> expected = new HashMap<>();
        expected.put("1", "b");
        expected.put("2", "c");
        expected.put("3", "d");
        Assert.assertEquals("Result should be identical", expected, actual);
    }

    /** A non-map argument must be rejected during initialization. */
    @Test
    public void testDifferentType() throws HiveException {
        exception.expect(UDFArgumentTypeException.class);
        exception.expectMessage("\"map\" expected at function merge_map, but \"LIST\" is found.");
        ObjectInspector[] arguments = {mapStringStringOI1, stringListOI};
        genericUDFMergeMap.initialize(arguments);
    }

    /** Passing three arguments to the two-argument function must be rejected. */
    @Test(expected = UDFArgumentLengthException.class)
    public void testArgumentLengthException() throws HiveException {
        ObjectInspector[] arguments = {mapStringStringOI1, mapStringStringOI2, stringListOI};
        genericUDFMergeMap.initialize(arguments);
    }
}

3. Prueba de implementación

desc function extended merge_map;

SELECT merge_map(map('1', 'a', '2', 'c'),map('1', 'b'));

SELECT merge_map(map('1', 'a', '2', 'c'), '1');

 4. Mejora

El código anterior solo puede fusionar dos mapas. Ahora lo mejoramos para que pueda fusionar un número ilimitado de mapas.

package com.scb.dss.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;

import java.io.Serializable;
import java.util.Map;

@Description(name="merge_map",
        value="_FUNC_(map1, map2, ...) - Merge maps, and the last value will be taken for the identical key.",
        extended = "SELECT merge_map(map('1', 'a', '2', 'c'),map('1', 'b'));")
public class GenericUDFMergeMap extends GenericUDF implements Serializable {
    private transient MapObjectInspector[] inputMaps;

    @Override
    public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
        if(objectInspectors.length < 2 || objectInspectors[0] == null){
            throw new UDFArgumentLengthException("The function merge_map takes at least 2 arguments, got " + objectInspectors.length + ". And the first map cannot be null");
        }

        inputMaps = new MapObjectInspector[objectInspectors.length];
        inputMaps[0] = (MapObjectInspector) objectInspectors[0];
        ObjectInspector baseMapKeyOI = inputMaps[0].getMapKeyObjectInspector();
        ObjectInspector baseMapValueOI = inputMaps[0].getMapValueObjectInspector();

        for(int i = 1; i < objectInspectors.length; i++) {
            if (objectInspectors[i] == null) {
                continue;
            }

            // Check whether the type of the parameter is map
            if (objectInspectors[i].getCategory() != ObjectInspector.Category.MAP) {
                throw new UDFArgumentTypeException(i, "\"map\" expected at function merge_map, but \"" + objectInspectors[i].getCategory().name() + "\" " + "is found.");
            }

            MapObjectInspector mapOI = (MapObjectInspector) objectInspectors[i];

            // Check the key and value type
            if (baseMapKeyOI.getCategory() != mapOI.getMapKeyObjectInspector().getCategory()) {
                throw new UDFArgumentTypeException(i, "The key type between map1 and map2 should be consistent. " +
                        "But key1 is " + baseMapKeyOI.getCategory().name() +
                        ", key2 is " + mapOI.getMapKeyObjectInspector().getCategory().name());
            }
            if (baseMapValueOI.getCategory() != mapOI.getMapValueObjectInspector().getCategory()) {
                throw new UDFArgumentTypeException(i, "The value type between map1 and map2 should be consistent. " +
                        "But value1 is " + baseMapValueOI.getCategory().name() +
                        ", value2 is " + mapOI.getMapValueObjectInspector().getCategory().name());
            }

            inputMaps[i] = mapOI;
        }

        // 定义返回类型
        return ObjectInspectorFactory.getStandardMapObjectInspector(baseMapKeyOI, baseMapValueOI);
    }

    @Override
    public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
        Map baseMap = inputMaps[0].getMap(deferredObjects[0].get());

        for(int i = 1; i < deferredObjects.length; i++) {
            Map map = inputMaps[i].getMap(deferredObjects[i].get());
            baseMap.putAll(map);
        }
        return baseMap;
    }

    @Override
    public String getDisplayString(String[] strings) {
        return "merge_map";
    }
}

La prueba es la siguiente:

SELECT merge_map(map('1', 'a', '2', 'c'),map('1', 'b'),map('3', 'd'));

5. Otra mejora: permitir valores nulos

código:

package com.scb.dss.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableVoidObjectInspector;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

@Description(name = "merge_map",
        value = "_FUNC_(map1, map2, ...) - Merge maps, and the last value will be taken for the identical key.",
        extended = "SELECT merge_map(map('1', 'a', '2', 'c'),map('1', 'b'));")
public class GenericUDFMergeMap extends GenericUDF implements Serializable {

    @Override
    public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
        if (objectInspectors.length < 2) {
            throw new UDFArgumentLengthException("The function merge_map takes at least 2 arguments, got " + objectInspectors.length);
        }

        int i = 0;
        MapObjectInspector baseMapOI = null;
        // get base MapObjectInspector
        do {
            if (objectInspectors[i] == null || objectInspectors[i] instanceof WritableVoidObjectInspector) {
                i++;
                continue;
            }
            if (objectInspectors[i].getCategory() != ObjectInspector.Category.MAP) {
                throw new UDFArgumentTypeException(i, "\"map\" expected at function merge_map, but \"" + objectInspectors[i].getCategory().name() + "\" " + "is found.");
            }
            baseMapOI = (MapObjectInspector) objectInspectors[i++];
        } while (baseMapOI == null && i < objectInspectors.length);

        for (; i < objectInspectors.length; i++) {
            if (objectInspectors[i] == null || objectInspectors[i] instanceof WritableVoidObjectInspector) {
                continue;
            }
            if (objectInspectors[i].getCategory() != ObjectInspector.Category.MAP) {
                throw new UDFArgumentTypeException(i, "\"map\" expected at function merge_map, but \"" + objectInspectors[i].getCategory().name() + "\" " + "is found.");
            }
            MapObjectInspector mapOI = (MapObjectInspector) objectInspectors[i];

            // Check the key and value type
            if (baseMapOI.getMapKeyObjectInspector().getCategory() != mapOI.getMapKeyObjectInspector().getCategory()) {
                throw new UDFArgumentTypeException(i, "The key type between map1 and map2 should be consistent. " +
                        "But key1 is " + baseMapOI.getMapKeyObjectInspector().getCategory().name() +
                        ", key2 is " + mapOI.getMapKeyObjectInspector().getCategory().name());
            }
            if (baseMapOI.getMapValueObjectInspector().getCategory() != mapOI.getMapValueObjectInspector().getCategory()) {
                throw new UDFArgumentTypeException(i, "The value type between map1 and map2 should be consistent. " +
                        "But value1 is " + baseMapOI.getMapValueObjectInspector().getCategory().name() +
                        ", value2 is " + mapOI.getMapValueObjectInspector().getCategory().name());
            }
        }

        // 定义返回类型
        return baseMapOI;
    }

    @Override
    public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
        Map baseMap = new HashMap();

        for (int i = 0; i < deferredObjects.length; i++) {
            if (deferredObjects[i] == null || deferredObjects[i].get() == null) {
                continue;
            }
            baseMap.putAll((Map) deferredObjects[i].get());
        }
        return baseMap;
    }

    @Override
    public String getDisplayString(String[] strings) {
        return "merge_map";
    }
}

clase de prueba

package com.scb.dss.udf;

import com.google.common.collect.Maps;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableVoidObjectInspector;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;

import java.util.HashMap;
import java.util.Map;

/**
 * Unit tests for the null-tolerant, variadic {@code merge_map} UDF: merging behaviour,
 * argument validation and handling of null/void arguments.
 */
public class GenericUDFMergeMapTest {
    private final GenericUDFMergeMap genericUDFMergeMap = new GenericUDFMergeMap();
    private final ObjectInspector mapStringStringOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    private final ObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    private final WritableVoidObjectInspector voidOI = PrimitiveObjectInspectorFactory.writableVoidObjectInspector;

    @Rule
    public ExpectedException exception = ExpectedException.none();

    /**
     * Builds a string map from alternating key/value arguments.
     *
     * @param keysAndValues key1, value1, key2, value2, ...
     */
    private static Map<String, String> mapOf(String... keysAndValues) {
        Map<String, String> map = new HashMap<>();
        for (int i = 0; i + 1 < keysAndValues.length; i += 2) {
            map.put(keysAndValues[i], keysAndValues[i + 1]);
        }
        return map;
    }

    /** Wraps a map as an already-evaluated UDF argument. */
    private static GenericUDF.DeferredObject deferred(Map<String, String> map) {
        return new GenericUDF.DeferredJavaObject(map);
    }

    /** Three maps are merged left to right; the later value wins for a shared key. */
    @Test
    public void testEvaluate() throws Exception {
        ObjectInspector[] arguments = {mapStringStringOI, mapStringStringOI, mapStringStringOI};
        genericUDFMergeMap.initialize(arguments);

        GenericUDF.DeferredObject[] args = {
                deferred(mapOf("1", "a", "2", "c")),
                deferred(mapOf("1", "b", "3", "d")),
                deferred(mapOf("4", "e"))
        };
        // Compared as Object: Map.equals does the work and no unchecked cast is needed.
        Object actual = genericUDFMergeMap.evaluate(args);

        Map<String, String> expected = mapOf("1", "b", "2", "c", "3", "d", "4", "e");
        Assert.assertEquals("Result should be identical", expected, actual);
    }

    /** A non-map argument must be rejected during initialization. */
    @Test
    public void testDifferentType() throws HiveException {
        exception.expect(UDFArgumentTypeException.class);
        exception.expectMessage("\"map\" expected at function merge_map, but \"LIST\" is found.");
        ObjectInspector[] arguments = {mapStringStringOI, stringListOI};
        genericUDFMergeMap.initialize(arguments);
    }

    /** A single argument is not enough. */
    @Test(expected = UDFArgumentLengthException.class)
    public void testArgumentLengthException() throws HiveException {
        ObjectInspector[] arguments = {mapStringStringOI};
        genericUDFMergeMap.initialize(arguments);
    }

    /** Null and void arguments mixed with real maps are skipped. */
    @Test
    public void testNull() throws Exception {
        ObjectInspector[] arguments = {null, voidOI, mapStringStringOI, voidOI, null, mapStringStringOI};
        genericUDFMergeMap.initialize(arguments);

        GenericUDF.DeferredObject[] args = {
                null, null, deferred(mapOf("1", "a")), null, null, deferred(mapOf("1", "b"))
        };
        Object actual = genericUDFMergeMap.evaluate(args);

        Assert.assertEquals("Result should be identical", mapOf("1", "b"), actual);
    }

    /** With only null/void arguments the result is an empty map. */
    @Test
    public void testNull2() throws Exception {
        ObjectInspector[] arguments = {null, voidOI};
        genericUDFMergeMap.initialize(arguments);

        GenericUDF.DeferredObject[] args = {null, null};
        Object actual = genericUDFMergeMap.evaluate(args);
        Assert.assertEquals("Result should be identical", Maps.newHashMap(), actual);
    }
}

Supongo que te gusta

Origin blog.csdn.net/qq_37771475/article/details/127120420
Recomendado
Clasificación