GenericUDF的示例, 根据字符串生成词向量

与普通UDF相比, GenericUDF提供了更好的参数和返回值类型检查, 效率更高, 适合处理Hive中的复杂数据类型(如map、array、struct)

把字符串变成词向量, 例如:

"This is a sentence"->{'This':1, 'is':1, 'a':1, 'sentence':1}

对于外部依赖, 为了让集群的每个节点都能执行该jar, 可以用Eclipse的 Export -> Runnable JAR File 把依赖的库一并打包进jar

package cn.pywei.HiveUDF;

import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;

@Description(name="WordsArray",value="_FUNC_(string), return the word array by using GenericUDF.")

/**
 * Hive GenericUDF that turns a string into a word-count map.
 *
 * <p>The input is split on single spaces; blank tokens are skipped. The
 * result maps every distinct word to the number of times it occurs in that
 * input value, e.g. {@code "a b a"} yields {@code {a=2, b=1}}.
 *
 * <p>The result map instance is reused between calls to
 * {@link #evaluate(DeferredObject[])} and is reset at the start of each call,
 * so every row gets counts for its own input only.
 */
public class WordArray extends GenericUDF {
	/** Reused result container; cleared at the beginning of every evaluate call. */
	private final Map<Text, IntWritable> sortMap = new HashMap<Text, IntWritable>();

	/** Converts the incoming argument to a writable string (Text); set in initialize. */
	private ObjectInspectorConverters.Converter converter;

	/**
	 * Validates the argument list and prepares the input converter.
	 *
	 * @param arguments inspectors for the call arguments; exactly one is accepted
	 * @return a standard map inspector with writable string keys and writable int values
	 * @throws UDFArgumentException if the argument count is not 1, or the single
	 *         argument is not a primitive of type string, char, varchar or void
	 */
	@Override
	public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
		// check the input argument count
		if (arguments.length != 1) {
			throw new UDFArgumentException("Param must be 1 argu.");
		}

		// check the input argument type; the argument id passed to the
		// exception is the zero-based position of the offending argument
		if (arguments[0].getCategory() != Category.PRIMITIVE) {
			throw new UDFArgumentTypeException(0, "A string argument was expected.");
		}

		PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector) arguments[0]).getPrimitiveCategory();
		if (primitiveCategory != PrimitiveCategory.STRING
				&& primitiveCategory != PrimitiveCategory.CHAR
				&& primitiveCategory != PrimitiveCategory.VARCHAR
				&& primitiveCategory != PrimitiveCategory.VOID) {
			throw new UDFArgumentTypeException(0,
					"A string, char, varchar or null argument was expected");
		}

		// generate a converter for the argument to use in the evaluate function
		converter = ObjectInspectorConverters.getConverter(arguments[0],
				PrimitiveObjectInspectorFactory.writableStringObjectInspector);

		// return the inspector that describes the value returned by evaluate
		return ObjectInspectorFactory.getStandardMapObjectInspector(
				PrimitiveObjectInspectorFactory.writableStringObjectInspector,
				PrimitiveObjectInspectorFactory.writableIntObjectInspector);
	}

	/**
	 * Counts the words of the single string argument.
	 *
	 * @param arguments the deferred call arguments; only index 0 is read
	 * @return the word-count map for this input; empty when the input is null
	 * @throws HiveException if the deferred argument cannot be obtained
	 */
	@Override
	public Object evaluate(DeferredObject[] arguments) throws HiveException {
		// Drop whatever the previous call left behind; without this the counts
		// of earlier rows would leak into the result of the current row.
		sortMap.clear();

		// fetch the deferred value once and reuse it
		Object raw = arguments[0].get();
		if (raw == null) {
			return sortMap;
		}

		Text s = (Text) converter.convert(raw);
		if (s == null) {
			return sortMap;
		}

		// populate the word counts
		for (String word : s.toString().split(" ")) {
			if (StringUtils.isBlank(word)) {
				continue;
			}
			Text key = new Text(word);
			IntWritable count = sortMap.get(key);
			if (count == null) {
				sortMap.put(key, new IntWritable(1));
			} else {
				// the stored IntWritable is owned by this map, so bump it in place
				count.set(count.get() + 1);
			}
		}
		return sortMap;
	}

	/**
	 * Builds the text shown for this call in the HQL explain output.
	 *
	 * @param children display strings of the argument expressions
	 * @return the function name followed by the comma-separated arguments in parentheses
	 */
	@Override
	public String getDisplayString(String[] children) {
		return "WordsArray(" + String.join(", ", children) + ")";
	}
}


猜你喜欢

转载自blog.csdn.net/rav009/article/details/80423965