文件A:SequenceFile 格式文件,key 是 long 型时间戳,value 是 Text 类型字符串(JSON 格式),示例如下:
{"app":"bshare","data":{"ip":"1032096474","keyword":"%E6%9D%8E%E5%85%8B%E5%BC%BA%2C%E6%9D%8E%E5%85%8B%E5%BC%BA%E5%87%BA%E8%AE%BF%E6%AC%A7%E6%B4%B2%E4%B8%89%E5%9B%BD","referrer":"http://news.sina.com.cn/","title":"%E6%9D%8E%E5%85%8B%E5%BC%BA%E5%90%91%E4%BF%84%E7%BD%97%E6%96%AF%E6%97%A0%E5%90%8D%E7%83%88%E5%A3%AB%E5%A2%93%E7%8C%AE%E8%8A%B1%E5%9C%88%28%E5%9B%BE%29%7C%E6%9D%8E%E5%85%8B%E5%BC%BA_%E6%96%B0%E6%B5%AA%E6%96%B0%E9%97%BB","ua":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36","url":"http://news.sina.com.cn/c/p/2014-10-13/164030981857.shtml","uuid":"cc6dc2b3-f467-40cb-afae-c7c6154a5f54","vid":"1CH3vFsRCmQaVQG9J8yH"},"time":"2014-10-13 18:01:14","type":"view"}
文件B:同样是 SequenceFile 格式文件,key 是 Text 类型,对应文件A数据中的 vid;value 是 Text 类型字符串(JSON 格式),示例如下:
{"age_group":"45-54","article_titles":["新浪新闻"],"bshare_id":"1CH01b9HB2uj7UuIP818","country_city":"UNKNOWN","device":"UNKNOWN","gender":"m","interests":["sports"],"keywords":["汽车"]}
文件C:referrer_top100.txt 记录排名前100的referrer
求:文件A中 referrer 出现在 referrer_top100.txt 里的所有 vid,以及这些 vid 在文件B中对应的属性。
pig脚本一:
-- Purpose: for every page-view log record whose referrer appears in the
-- top-referrer list, emit (referrer, vid, cookie attribute JSON).
--
-- Inputs:
--   /tmp/input/top_domain.txt          text file, one entry per line
--   /tmp/input/gnome_simple.data       SequenceFile (timestamp, JSON log line)
--   /tmp/input/buzz_cookie_simple.data SequenceFile (vid, JSON cookie attributes)
-- Output:
--   /tmp/input/tony_domain             tab-separated (referrer, vid, cookie JSON)

REGISTER /Users/shuguo/pig/contrib/piggybank/java/piggybank.jar;
DEFINE SequenceFileLoader org.apache.pig.piggybank.storage.SequenceFileLoader();

-- Top-100 list; a single column per line.
top_referrers = LOAD '/tmp/input/top_domain.txt' USING PigStorage('\t')
    AS (domain_text:chararray);

-- Raw view logs. The piggybank loader takes no constructor arguments.
raw_logs = LOAD '/tmp/input/gnome_simple.data' USING SequenceFileLoader()
    AS (time:long, log_text:chararray);

-- Per-vid attribute records.
cookies = LOAD '/tmp/input/buzz_cookie_simple.data' USING SequenceFileLoader()
    AS (vid:chararray, cookie_value:chararray);

-- The builtin JsonLoader reads text files, so the JSON payload is first dumped
-- as plain text and then re-loaded with an explicit schema.
log_json = FOREACH raw_logs GENERATE log_text;
STORE log_json INTO '/tmp/input/domain_cookie.json' USING PigStorage('\t');

-- Nested tuple fields must be declared as name:(...) in the schema string.
parsed_logs = LOAD '/tmp/input/domain_cookie.json'
    USING JsonLoader('app:chararray, data:(ip:chararray, keyword:chararray, referrer:chararray, title:chararray, ua:chararray, url:chararray, uuid:chararray, vid:chararray), time:chararray, type:chararray');

-- vid and referrer live inside the nested "data" tuple, so they are
-- dereferenced with data.<field>. The referrer (not the page title) is what
-- has to be matched against the top list.
vid_referrer      = FOREACH parsed_logs GENERATE data.vid AS vid, data.referrer AS referrer;
vid_referrer_uniq = DISTINCT vid_referrer;

-- NOTE(review): this is an exact string match; it assumes the list file stores
-- referrers in the same form as the log (e.g. "http://news.sina.com.cn/").
-- If the file holds bare domains, normalise the referrer before joining.
matched = JOIN vid_referrer_uniq BY referrer, top_referrers BY domain_text;

-- After a JOIN, fields are addressed as <input relation>::<field>.
matched_vids = FOREACH matched GENERATE
    top_referrers::domain_text AS domain,
    vid_referrer_uniq::vid     AS vid;

-- Attach the cookie attributes for each matched vid.
with_cookie = JOIN matched_vids BY vid, cookies BY vid;
final_rows  = FOREACH with_cookie GENERATE
    matched_vids::domain,
    matched_vids::vid,
    cookies::cookie_value;

STORE final_rows INTO '/tmp/input/tony_domain' USING PigStorage('\t');
pig脚本二:
-- Purpose: collect, per vid, the top-100 domains it visited and join that with
-- the vid's attribute record.
--
-- Parameters:
--   $INFILE1  SequenceFile of view logs   (long key, JSON text value)
--   $INFILE2  text file with the top-100 domains, one per line
--   $INFILE3  SequenceFile of vid attributes (Text key, JSON text value)
--   $OUTFILE  output location

REGISTER /opt/pig/contrib/piggybank/java/piggybank.jar;
REGISTER /home/code/opensource/elephant-bird/pig/target/elephant-bird-pig-4.6-SNAPSHOT-jar-with-dependencies.jar;

DEFINE SequenceFileLoader org.apache.pig.piggybank.storage.SequenceFileLoader();
DEFINE JsonStringToMap com.twitter.elephantbird.pig.piggybank.JsonStringToMap();

-- Parse the outer JSON object of every log line.
genome_raw    = LOAD '$INFILE1' USING SequenceFileLoader() AS (key:long, value:chararray);
genome_parsed = FOREACH genome_raw GENERATE JsonStringToMap(value) AS genome_data:map[];

-- Parse the nested "data" object. The result must be named "json" because the
-- next statement dereferences it by that name.
genome_inner = FOREACH genome_parsed GENERATE JsonStringToMap(genome_data#'data') AS json:map[];

top_100_domain = LOAD '$INFILE2' AS (domain:chararray);

-- REGEX_EXTRACT already returns a scalar, so no FLATTEN is needed.
-- NOTE(review): the domain is taken from "url"; confirm whether "referrer"
-- is the field that should be matched against the top-100 list.
cookie_domain = FOREACH genome_inner GENERATE
    (chararray) json#'vid' AS vid,
    REGEX_EXTRACT((chararray) json#'url', '(http://[^/]+).*', 1) AS d;

-- The top-100 list is tiny, so it is used as the replicated (in-memory) side.
cookie_with_domain    = JOIN cookie_domain BY d, top_100_domain BY domain USING 'replicated';
cookie_with_domain_se = FOREACH cookie_with_domain GENERATE vid, domain;

-- One row per vid carrying the bag of matched domains.
cookie_with_domain_gr = GROUP cookie_with_domain_se BY vid;
cookie_with_domain_re = FOREACH cookie_with_domain_gr GENERATE $0 AS vid, $1.domain AS domain;

cookie_info_raw = LOAD '$INFILE3'
    USING com.twitter.elephantbird.pig.load.SequenceFileLoader (
        '-c com.twitter.elephantbird.pig.util.TextConverter',
        '-c com.twitter.elephantbird.pig.util.TextConverter'
    ) AS (key:chararray, value:chararray);

cookie_info = FOREACH cookie_info_raw GENERATE JsonStringToMap(value) AS cookie:map[];

-- Project the join key with an explicit cast so both sides of the join are
-- chararray instead of relying on an implicit cast of the map value.
cookie_keyed = FOREACH cookie_info GENERATE (chararray) cookie#'bshare_id' AS bshare_id, cookie;

cookie_join = JOIN cookie_with_domain_re BY vid, cookie_keyed BY bshare_id;

cookie_result = FOREACH cookie_join GENERATE
    cookie#'bshare_id',
    cookie#'age_group',
    cookie#'country_city',
    cookie#'device',
    cookie#'gender',
    cookie#'interests',
    domain;

STORE cookie_result INTO '$OUTFILE';