-- 1. Hive basics: show the detailed metadata of dw.full_h_usr_base_user.
--    The output includes the storage location of the table's data files.
desc formatted dw.full_h_usr_base_user;

-- List the table's data files from inside the Hive CLI.
dfs -ls hdfs://BIGDATA:9000/user/hive/warehouse/dw.db/full_h_usr_base_user;

-- Delete the data of the external table full_h_usr_base_user.
-- `-rm -r` is required here: `-rmdir` only removes EMPTY directories, so it
-- cannot delete a table directory that still contains data files.
dfs -rm -r hdfs://BIGDATA:9000/user/hive/warehouse/dw.db/full_h_usr_base_user;
-- 192.168.1.181 192.168.1.1
-- NOTE(review): the two addresses above were fused to this statement in the
-- original text; presumably they are sample reg_ip values -- confirm.

-- 2. Create an external table that contains a MAP-typed column.
-- Comments are kept on their own lines: a trailing `--` on a statement line
-- comments out everything that follows it on that line.
create external table dw.full_h_usr_base_user (
    user_id string comment '用户id',
    reg_ip string comment 'ip',
    -- How to declare a MAP column: map<key_type,value_type>.
    reg_ip_geo_map map<string,string>
        comment 'city_id,city_name,isp,province_id,province_name,country_id,country_name,postzip,district,province'
)
comment '用户测试表'
partitioned by (ds string comment '当前时间,用于分区字段')
row format delimited
fields terminated by '\t'
-- Map entries (key/value pairs) are separated by commas.
collection items terminated by ","
-- Within an entry, key and value are separated by a colon.
map keys terminated by ":"
-- Store the table as plain text files.
stored as TEXTFILE;
-- 3. Load the data. The input file only needs to provide user_id and reg_ip;
--    reg_ip_geo_map can be computed afterwards by the UDF script.
load data local inpath '/opt/data/dw.full_h_usr_base_user.del'
overwrite into table dw.full_h_usr_base_user
partition (ds='2017-09-25');
#4、自定义函数:Python UDF 函数 ip_to_num.py
#coding=utf-8
#Version:python3.5.2
#Tools:Pycharm
#Date:
__author__ = "Colby"
import socket
import struct
import sys,datetime
ipDB='/opt/data/IP_utf-8.csv'

# Keys of the generated map string, in the fixed order they are emitted.
_GEO_KEYS = ('IP', 'nationality', 'province', 'city', 'Corporation')


def _ip_to_int(ip):
    """Convert a dotted-quad IPv4 string to an unsigned 32-bit integer.

    Raises OSError when ``ip`` is not a valid IPv4 address.
    """
    # "!I" decodes in network byte order, which yields the same value as
    # ntohl(unpack("I", ...)) regardless of the host's endianness.
    return struct.unpack("!I", socket.inet_aton(str(ip)))[0]


def _load_ip_ranges(path):
    """Parse the IP database once into a list of (start, end, geo) tuples.

    Columns 2 and 3 of each CSV row are the numeric range bounds; columns
    4, 5, 6 and 8 become nationality, province, city and Corporation.
    Rows that are too short or whose bounds are not integers (e.g. a header
    line) are skipped instead of aborting the job.
    """
    ranges = []
    with open(path, 'r', encoding="utf-8") as db:
        for row in db:
            cols = row.rstrip('\r\n').split(',')
            if len(cols) < 9:
                continue
            try:
                start, end = int(cols[2]), int(cols[3])
            except ValueError:
                continue
            ranges.append((start, end, (cols[4], cols[5], cols[6], cols[8])))
    return ranges


def _format_geo_map(values):
    """Build the 'key:value,key:value' text consumed by str_to_map(x, ',', ':').

    Single quotes and spaces are removed from the values, as the original
    string-based formatting did; the key order is fixed so output is stable.
    """
    cleaned = (str(v).replace('\'', '').replace(' ', '') for v in values)
    return ','.join('{}:{}'.format(k, v) for k, v in zip(_GEO_KEYS, cleaned))


def main(lines=None, out=None, db_path=None):
    """Hive TRANSFORM entry point: enrich each row with IP geo information.

    Each input line holds four tab-separated fields
    (user_id, reg_ip, reg_ip_geo_map, ds). Every well-formed row is written
    back exactly once. When reg_ip falls inside a database range,
    reg_ip_geo_map is replaced by the generated map text; when the address is
    invalid or unknown, the row is passed through unchanged so that a later
    ``insert overwrite`` does not silently drop it.

    :param lines: iterable of input lines (defaults to sys.stdin)
    :param out: writable text stream (defaults to sys.stdout)
    :param db_path: path of the IP database CSV (defaults to ``ipDB``)
    """
    lines = sys.stdin if lines is None else lines
    out = sys.stdout if out is None else out
    # Load the database once instead of re-reading the file for every row.
    ranges = _load_ip_ranges(ipDB if db_path is None else db_path)

    for line in lines:
        # Only strip the line terminator: strip() would also eat leading or
        # trailing tabs and shift the columns when an outer field is empty.
        raw = line.rstrip('\r\n')
        fields = raw.split('\t')
        if len(fields) != 4:
            if raw:
                sys.stderr.write('ip_to_num: skipping malformed row: %r\n' % raw)
            continue
        user_id, reg_ip, reg_ip_geo_map, ds = fields

        try:
            num_ip = _ip_to_int(reg_ip)
        except OSError:
            num_ip = None  # not an IPv4 address: keep the row as it is

        if num_ip is not None:
            for start, end, geo in ranges:
                if start <= num_ip <= end:
                    reg_ip_geo_map = _format_geo_map((reg_ip,) + geo)
                    break

        print('\t'.join([user_id, reg_ip, reg_ip_geo_map, ds]), file=out)


if __name__ == "__main__":
    main()
#5、将 UDF 函数文件上传到服务器指定目录 /opt/udf/,上传后的完整路径为 /opt/udf/ip_to_num.py
-- 6. In the Hive CLI, register the script as a session resource.
add file /opt/udf/ip_to_num.py;
-- Expected CLI output: Added resources: [/opt/udf/ip_to_num.py]
-- 7. Run the UDF script through TRANSFORM and inspect its output.
--    The script receives the listed columns as tab-separated text on stdin
--    and must emit the same four columns, tab-separated, on stdout.
select
    transform (user_id, reg_ip, reg_ip_geo_map, ds)
    using 'python3 ip_to_num.py'
    as (user_id, reg_ip, reg_ip_geo_map, ds)
from dw.full_h_usr_base_user;
-- 8. Process the data with the script and overwrite dw.full_h_usr_base_user.
--    The target partition comes from the ds column of each row, so dynamic
--    partitioning must be switched to nonstrict mode first.
set hive.exec.dynamic.partition.mode=nonstrict;

-- TRANSFORM returns reg_ip_geo_map as a string ('k:v,k:v,...');
-- str_to_map converts it back into the table's map<string,string> column.
insert overwrite table dw.full_h_usr_base_user partition (ds)
select
    user_id,
    reg_ip,
    str_to_map(reg_ip_geo_map, ',', ':') as reg_ip_geo_map,
    ds
from (
    select
        transform (user_id, reg_ip, reg_ip_geo_map, ds)
        using 'python3 ip_to_num.py'
        as (user_id, reg_ip, reg_ip_geo_map, ds)
    from dw.full_h_usr_base_user
) as a;
-- 9. Query the processed data: values of a MAP column are read with
--    map_column['key'].
select
    user_id,
    reg_ip_geo_map['province'],
    reg_ip_geo_map['city'],
    reg_ip_geo_map['nationality']
from dw.full_h_usr_base_user
where ds = '2017-09-25'
  and user_id = '1000000015';
输出结果:
OK
1000000015 安徽省 合肥市 中国
Time taken: 0.107 seconds, Fetched: 1 row(s)
#动态分区,将字符串转换成MAP
#set hive.exec.dynamic.partition.mode=nonstrict;
#insert into dw.full_h_usr_base_user partition(ds)
#select user_id
#, reg_ip
#, str_to_map(reg_ip_geo_map) reg_ip_geo_map
#,ds from dw.full_h_usr_base_user_tmp;
hive之Python UDF函数操作map数据 详解 全是干货
猜你喜欢
转载自blog.csdn.net/colby_chenlun/article/details/78140033
今日推荐
周排行