今天写了一个Pig的EvalFunc UDF函数,结果一执行就发现返回值总是bag类型。我很纳闷:明明指定了返回String类型,怎么会变成Bag类型呢?经查找,发现是拷贝代码导致的问题:先前写的UDF函数返回多个值,outputSchema声明为Bag;而现在这个只返回一个值,却沿用了原来的schema声明,所以在pig脚本里强转string类型时出错。发现问题后,把outputSchema的返回类型改为DataType.CHARARRAY,问题得以解决。
案例(一),输入值为多个参数,返回也为多个参数
package com.easy.pig;

import com.easy.similar.model.ResultModel;
import com.easy.similar.tools.TextBuildID;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.*;
import org.apache.pig.impl.logicalLayer.schema.Schema;

import java.io.IOException;

/**
 * Created by qindongliang on 2015/9/29.
 *
 * Pig EvalFunc that, given an article's text, returns a bag holding two
 * tuples: the article's MD5 value and its fingerprint keywords.
 *
 * Expected input tuple layout:
 *   $0 - article content
 *   $1 - number of longest sentences to use
 *   $2 - number of keywords to return
 *   $3 - dedup algorithm type: 1 = fingerprint, 2 = word segmentation
 *   $4 - extension fingerprint taking part in the computation (null maps to "")
 */
public class FingerUDF extends EvalFunc<DataBag> {

    /** Tuple factory instance. */
    private final TupleFactory mTupleFactory = TupleFactory.getInstance();
    /** Bag factory instance. */
    private final BagFactory mBagFactory = BagFactory.getInstance();
    /** MD5/fingerprint builder; reused across exec() calls within one task. */
    private final TextBuildID textBuildID = new TextBuildID();

    /**
     * Builds the (md5, keywords) bag for one input record.
     *
     * @param tuple input record, see class doc for the field layout
     * @return a bag with two single-field tuples: md5 then keywords
     * @throws IOException if the tuple is malformed or fingerprinting fails
     */
    @Override
    public DataBag exec(Tuple tuple) throws IOException {
        if (tuple == null || tuple.size() < 5) {
            throw new IOException("FingerUDF expects 5 fields, got: "
                    + (tuple == null ? "null" : tuple.size()));
        }
        try {
            DataBag output = mBagFactory.newDefaultBag();
            Object content = tuple.get(0);
            // Number of sentences to keep
            int numSentence = Integer.parseInt(tuple.get(1).toString());
            // Number of keywords to return
            int numWords = Integer.parseInt(tuple.get(2).toString());
            // Dedup algorithm: 1 = fingerprint, 2 = word segmentation
            int type = Integer.parseInt(tuple.get(3).toString());
            // Extension fingerprint; null becomes the empty string
            String ext = tuple.get(4) == null ? "" : tuple.get(4).toString();

            textBuildID.top_N_Sentence = numSentence;
            textBuildID.top_N_Words = numWords;
            textBuildID.type = type;

            ResultModel ro = textBuildID.buildID(content.toString(), ext);
            output.add(mTupleFactory.newTuple(ro.getMd5()));   // md5 value
            output.add(mTupleFactory.newTuple(ro.getWords())); // fingerprint keywords
            return output;
        } catch (Exception ee) {
            // Rethrow with the cause instead of swallowing and returning null:
            // a silent null return drops the record downstream (e.g. BagToString
            // would receive null) and hides the real failure.
            throw new IOException("FingerUDF failed on input tuple", ee);
        }
    }

    /** Declares the output schema: a bag of chararray tokens. */
    @Override
    public Schema outputSchema(Schema input) {
        try {
            Schema bagSchema = new Schema();
            bagSchema.add(new Schema.FieldSchema("token", DataType.CHARARRAY));
            return new Schema(new Schema.FieldSchema(
                    getSchemaName(this.getClass().getName().toLowerCase(), input),
                    bagSchema, DataType.BAG));
        } catch (Exception e) {
            // Pig treats a null schema as "unknown"; surface the cause instead
            // of failing completely silently.
            e.printStackTrace();
            return null;
        }
    }
}
案例(二),输入值为String,返回也为String
package com.easy.pig;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;

import java.io.IOException;

/**
 * Classifies a URL by its source.
 *
 * Returns "1" when the url starts with http://www.court.gov.cn,
 * "2" for any other non-null url, and "0" when the url is null
 * (or the tuple is malformed — treated as best effort).
 */
public class UrlCvUDF extends EvalFunc<String> {

    /** Prefix identifying court.gov.cn as the source. */
    private static final String COURT_PREFIX = "http://www.court.gov.cn";

    /**
     * @param tuple input record whose first field is the url
     * @return "1" for court.gov.cn urls, "2" for other urls, "0" for null
     */
    @Override
    public String exec(Tuple tuple) throws IOException {
        try {
            Object o = tuple.get(0);
            if (o != null) {
                // Classify by the url prefix.
                return o.toString().startsWith(COURT_PREFIX) ? "1" : "2";
            }
        } catch (Exception ee) {
            // Best effort: a malformed tuple is treated the same as a null url.
            ee.printStackTrace();
        }
        // url is null => return "0"
        return "0";
    }

    /**
     * Declares the output schema. It must match the generic type of this
     * EvalFunc: a plain CHARARRAY scalar — NOT a bag. Declaring the field as
     * DataType.BAG here is exactly what made the return value appear as a
     * bag in the original bug. A scalar field needs no inner schema.
     */
    @Override
    public Schema outputSchema(Schema input) {
        try {
            return new Schema(new Schema.FieldSchema(
                    getSchemaName(this.getClass().getName().toLowerCase(), input),
                    DataType.CHARARRAY));
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}
案例一的pig脚本:
-- Case (1): build fingerprint index from HBase and store into Lucene.
-- Register the UDF jars used below.
REGISTER ./udf-pig-similarty-hbase-1.0-SNAPSHOT-jar-with-dependencies.jar
REGISTER ./pig-udf-extend-1.0.1-SNAPSHOT-jar-with-dependencies.jar

-- Recreate the output directory.
mkdir /user/webmaster/search/monitor/finger-data;
rmf /user/webmaster/search/monitor/finger-data;

set job.name 'pig-hbase-build-index'

-- Load from HBase; '-loadKey true' exposes the row key as $0.
-- NOTE: the HBaseStorage column list is space-separated (the original mixed
-- in a comma between content:casenum and meta:isdelete).
a = load 'hbase://ETLDB' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('content:conn content:casenum meta:isdelete','-loadKey true ');

-- FingerUDF arguments: 0: article content, 1: top-n longest sentences,
-- 2: top-n keywords, 3: dedup algorithm (1 => fingerprint, 2 => word segmentation),
-- 4: extension fingerprint. The returned bag is joined with '@'.
a = foreach a generate $0 as rowkey:chararray ,
    BagToString(com.easy.pig.FingerUDF((chararray)$1,5,8,1,$2),'@') as info:chararray,
    $2 as casenum:chararray ,
    $3 as isdel:chararray ;

-- Split the '@'-joined value back into md5 and keyword fields.
a = foreach a generate $0 as rowkey:chararray ,
    STRSPLIT(info,'@',2).$0 as finger_md5:chararray ,
    STRSPLIT(info,'@',2).$1 as finger_content:chararray ,
    casenum, isdel ;

store a into '/user/webmaster/search/monitor/finger-data' using com.pig.support.lucene.LuceneStore('row:true:false,finger_md5:true:false,finger_content:true:false,casenum:true:false,isdel:true:false','default');
案例二的pig脚本:
-- Case (2): debugging variant — dumps the first rows instead of storing.
--SET debug 'on'
--REGISTER ./aa.jar
-- Register the UDF jars used below.
REGISTER ./udf-pig-similarty-hbase-1.0-SNAPSHOT-jar-with-dependencies.jar
REGISTER ./pig-udf-extend-1.0.1-SNAPSHOT-jar-with-dependencies.jar
--mkdir /user/webmaster/crawldb/finger/
--rmf /user/webmaster/crawldb/finger/
-- Recreate the output directory (only needed when the store below is re-enabled).
mkdir /user/webmaster/search/monitor/finger-data;
rmf /user/webmaster/search/monitor/finger-data;
set job.name 'pig-hbase-build-index'
-- Load from HBase; '-loadKey true' exposes the row key as $0.
a = load 'hbase://ETLDB' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('content:conn content:casenum meta:isdelete content:url','-loadKey true ');
--a = foreach a generate FLATTEN(com.easy.pig.FingerUDF((chararray)$1));
-- FingerUDF arguments: 0: article content, 1: top-n longest sentences,
-- 2: top-n keywords, 3: dedup algorithm (1 => fingerprint, 2 => word segmentation).
-- UrlCvUDF classifies the source url ($4) into "1"/"2"/"0".
a = foreach a generate $0 as rowkey:chararray , BagToString(com.easy.pig.FingerUDF((chararray)$1,5,8,1,''),'@') as info:chararray,$2 as casenum:chararray , $3 as isdel:chararray, com.easy.pig.UrlCvUDF((chararray)$4) as source:chararray ;
--a = foreach a generate $2 as num:chararray;
-- Debug: inspect only the first 11 rows, print them and their schema.
a = limit a 11;
dump a;
describe a;
--describe a;
--a = foreach a generate $0 as rowkey:chararray , STRSPLIT(info,'@',2).$0 as finger_md5:chararray ,STRSPLIT(info,'@',2).$1 as finger_content:chararray ,casenum,isdel ;
--describe a;
--store a into '/user/webmaster/search/monitor/finger-data' using com.pig.support.lucene.LuceneStore('row:true:false,finger_md5:true:false,finger_content:true:false,casenum:true:false,isdel:true:false','default');