一、问题
遇到一个场景,要根据oracle表结构创建hive表。
本来想写一个根据oracle信息和表名自动在hive中创建表的脚本。如果在集群运行,python操作oracle的cx_Oracle在集群的每个结点都需要安装,时间紧急,干活要紧,先凑合写个脚本本地用着吧
二、解决
# -*- coding=utf8 -*-
class GetHiveDDLFromOracleMeta:
def get_oralce_2_hive_datatype_map(self):
type_map = {
"INTEGER": "double",
"NUMBER": "double",
"NUMBER(10)": "bigint",
"NUMBER(11)": "bigint",
"NUMBER(12)": "bigint",
"NUMBER(13)": "bigint",
"NUMBER(14)": "bigint",
"NUMBER(15)": "bigint",
"NUMBER(16)": "bigint",
"NUMBER(17)": "bigint",
"NUMBER(18)": "bigint",
"NUMBER(5)": "int",
"NUMBER(6)": "int",
"NUMBER(7)": "int",
"NUMBER(8)": "int",
"NUMBER(9)": "int",
"NUMBER(2)": "smallint",
"NUMBER(3)": "smallint",
"NUMBER(4)": "smallint",
"NUMBER(P,S)": "decimal",
"NUMBER(1)": "tinyint",
"FLOAT": "double",
"BINARY_FLOAT": "double",
"BINARY_DOUBLE": "double",
"DATE": "timestamp",
"TIMESTAMP(1)": "timestamp",
"TIMESTAMP(2)": "timestamp",
"TIMESTAMP(3)": "timestamp",
"TIMESTAMP(4)": "timestamp",
"TIMESTAMP(5)": "timestamp",
"TIMESTAMP(6)": "timestamp",
"CHAR": "string",
"NCHAR": "string",
"NVARCHAR2": "string",
"VARCHAR2": "string",
"NVARCHAR": "string",
"BLOB": "", "BFILE": "", "NCLOB": "","CLOB": "string", "ROWID": "", "UROWID": "", "TIMESTAMP with time zone": "",
"TIMESTAMP with local time zone": "", "ANYDATA": "", "VARRAY": "", "NESTEDTAB": "", "Object": ""
, "REF": "", "RAW": ""}
return type_map
def split_oracl_meta(self, oracle_meta):
name_types = []
for name_type in oracle_meta.split(","):
column_name = name_type.split(" ")[0]
column_type = name_type.split(" ")[1]
name_types.append((column_name, column_type))
return name_types
def transfrom_from_oracle_to_hive(self, oracle_meta_maps):
oracle_hive_maps = self.get_oralce_2_hive_datatype_map()
hive_name_type_maps = []
for name_type in oracle_meta_maps:
column_name = name_type[0]
oracle_type = name_type[1]
hive_type = oracle_hive_maps[oracle_type]
if hive_type:
print(oracle_hive_maps[oracle_type])
hive_name_type_maps.append((column_name, hive_type))
else:
raise Exception(" ERROR!!! no map column and type is " + str(name_type))
return hive_name_type_maps
def get_dml_from_hive_name_type_maps(self, hive_name_type_maps, hive_table_name):
dml_sql_prefix = "create external table " + hive_table_name + " ("
dml_sql_middle = ""
for name_type in hive_name_type_maps:
column_name = name_type[0]
column_type = name_type[1]
dml_sql_middle = dml_sql_middle + column_name + " " + column_type + ", "
dml_sql_middle = dml_sql_middle[0:len(dml_sql_middle)-2]
dml_sql_suffix = ") " \
" row format delimited fields terminated by '\t' stored as textfile ;"
sql = dml_sql_prefix + dml_sql_middle + dml_sql_suffix
return sql
def create_hive_table_dml_by_oracle_meta(self, oracle_meta, hive_table_name):
oracle_name_types = self.split_oracl_meta(oracle_meta)
hive_name_types = self.transfrom_from_oracle_to_hive(oracle_name_types)
hive_table_dml_sql = self.get_dml_from_hive_name_type_maps(hive_name_types,hive_table_name)
return hive_table_dml_sql
if __name__ == "__main__":
this_obj = GetHiveDDLFromOracleMeta()
oracle_meta = "BIAOZHIPAIDIANNAOBIANHAO NUMBER,BIAOZHIPAIFUKAYOUXIAOQICONG TIMESTAMP(6),BIAOZHIPAIFUKAYOUXIAOQIZHI TIMESTAMP(6),BIAOZHIPAIHAOMA VARCHAR2,BIAOZHIPAILEIXING VARCHAR2,CHELIANGDIANNAOBIANHAO NUMBER,CHELIANGTUICHURIQI TIMESTAMP(6),CHEXINGXIANZHI VARCHAR2,CUSTOM_OPER_TYPE VARCHAR2,CUSTOM_UPDATE_TIME TIMESTAMP(6),DAIYONGKAYOUXIAOQICONG TIMESTAMP(6),DAIYONGKAYOUXIAOQIZHI TIMESTAMP(6),FUKAHAO VARCHAR2,GUANLIJIBIE VARCHAR2,JINGYINGQIXIAN TIMESTAMP(6),JINGYQUANDANWEIDIANNAOBIANHAO NUMBER,NITOURUCHELIANGRIQI TIMESTAMP(6),PIWENHAO VARCHAR2,PIZHUNJIGUAN VARCHAR2,PIZHUNSHIJIAN TIMESTAMP(6),PM_PRIMARYKEY NUMBER,QIDIANDI VARCHAR2,SHENQINGRIQI TIMESTAMP(6),SHIFOUYIFAPAI VARCHAR2,SHIYONGDANWEIDIANNAOBIANHAO NUMBER,SHOUCITOURUSHIJIAN TIMESTAMP(6),SHUJUZHUANGTAI VARCHAR2,SYS_XINZENGREN VARCHAR2,SYS_XINZENGSHIJIAN TIMESTAMP(6),SYS_ZUIJINXIUGAIREN VARCHAR2,SYS_ZUIJINXIUGAISHIJIAN TIMESTAMP(6),XIAQUSHENG VARCHAR2,XIAQUSHI VARCHAR2,XIAQUXIAN VARCHAR2,XIAQUZHEN VARCHAR2,XUKERIQI TIMESTAMP(6),YUANCHEPAIHAO VARCHAR2,ZHIBIAODIANNAOBIANHAO NUMBER,ZHIBIAOYOUXIAORIQI TIMESTAMP(6),ZHIBIAOZHONGLEI VARCHAR2,ZHUANGBEITIAOJIAN VARCHAR2,ZHUANGTAI VARCHAR2,ZUIDICHELIANGJISHUDENGJI VARCHAR2,ZUIDICHELIANGZHUANGBEIDENGJI VARCHAR2"
hive_table_name = "ST_DLYS_KEYUNBAOCHEXIANLU"
results = this_obj.create_hive_table_dml_by_oracle_meta(oracle_meta, hive_table_name)
print(type(results))
print(results)
三、使用
1.根据自己需求调整get_oralce_2_hive_datatype_map方法中的类型映射
2.在能连oracle的工具中,例如plsql,navicat,dbeaver等工具,连接到oracle库,执行下面的sql
注意:TABLE_NAME请改成创建hive表依据的oracle表名
select Listagg(concat(concat(A.COLUMN_NAME,' '),A.DATA_TYPE), ',') WITHIN GROUP(ORDER BY column_name)
from user_tab_columns A
where table_name = upper('TABLE_NAME');
查询结果类似这样
BIAOZHIPAIDIANNAOBIANHAO NUMBER,BIAOZHIPAIFUKAYOUXIAOQICONG TIMESTAMP(6),BIAOZHIPAIFUKAYOUXIAOQIZHI TIMESTAMP(6),BIAOZHIPAIHAOMA VARCHAR2,BIAOZHIPAILEIXING VARCHAR2,CHELIANGDIANNAOBIANHAO NUMBER,CHELIANGTUICHURIQI TIMESTAMP(6),CHEXINGXIANZHI VARCHAR2,CUSTOM_OPER_TYPE VARCHAR2,CUSTOM_UPDATE_TIME TIMESTAMP(6),DAIYONGKAYOUXIAOQICONG TIMESTAMP(6),DAIYONGKAYOUXIAOQIZHI TIMESTAMP(6),FUKAHAO VARCHAR2,GUANLIJIBIE VARCHAR2,JINGYINGQIXIAN TIMESTAMP(6),JINGYQUANDANWEIDIANNAOBIANHAO NUMBER,NITOURUCHELIANGRIQI TIMESTAMP(6),PIWENHAO VARCHAR2,PIZHUNJIGUAN VARCHAR2,PIZHUNSHIJIAN TIMESTAMP(6),PM_PRIMARYKEY NUMBER,QIDIANDI VARCHAR2,SHENQINGRIQI TIMESTAMP(6),SHIFOUYIFAPAI VARCHAR2,SHIYONGDANWEIDIANNAOBIANHAO NUMBER,SHOUCITOURUSHIJIAN TIMESTAMP(6),SHUJUZHUANGTAI VARCHAR2,SYS_XINZENGREN VARCHAR2,SYS_XINZENGSHIJIAN TIMESTAMP(6),SYS_ZUIJINXIUGAIREN VARCHAR2,SYS_ZUIJINXIUGAISHIJIAN TIMESTAMP(6),XIAQUSHENG VARCHAR2,XIAQUSHI VARCHAR2,XIAQUXIAN VARCHAR2,XIAQUZHEN VARCHAR2,XUKERIQI TIMESTAMP(6),YUANCHEPAIHAO VARCHAR2,ZHIBIAODIANNAOBIANHAO NUMBER,ZHIBIAOYOUXIAORIQI TIMESTAMP(6),ZHIBIAOZHONGLEI VARCHAR2,ZHUANGBEITIAOJIAN VARCHAR2,ZHUANGTAI VARCHAR2,ZUIDICHELIANGJISHUDENGJI VARCHAR2,ZUIDICHELIANGZHUANGBEIDENGJI VARCHAR2
复制查询结果
3.修改main函数里的参数
- 用上面复制的字段名和类型拼接成的字符串替换main函数里的参数oracle_meta=右侧的字符串
- 修改参数hive_table_name=右侧的表名为想要创建的hive表名
- (以py文件名是GetHiveDDLFromOracleMeta为例)在pycharm里run或者打开cmd输入$PYTHON_HOME/python GetHiveDDLFromOracleMeta.py
- pycharm控制台或者cmd输出的create语句即为所需的hive建表语句了
4.优化(TODO)
后面有时间会把复制sql查询结果这一步用查询oracle的方法替代。hive建表语句也不需要粘贴出来自己运行,而是直接自动创建。