其他类似相关参考:
Python自动化拉取Mysql数据并装载到Hive(V3.0)--> https://blog.csdn.net/BabyFish13/article/details/73885033
Python自动化拉取Mysql数据并装载到Hive(V2.0)--> http://blog.csdn.net/babyfish13/article/details/70792158
Python自动化拉取Mysql数据并装载到Hive--> http://blog.csdn.net/babyfish13/article/details/73618331
有两个版本:第一个版本通过sqoop在Mysql与Hive之间表对表传输,其间可以指定过滤条件,也支持多分表传输,但每张分表都要单独调用一次sqoop,速度较慢,效率会受影响;第二个版本通过sqoop的query参数进行传输,变量及参数在sql语句中进行控制,所以效率会好一些。
其实, 无论是通过sqoop、或是数据select出来再load,或者是通过datax传输,其只是手段,要形成方案,都需要通过python或shell将其串联起来,才能供我们随心所欲的使用。
1、sqoop表对表传输
/Users/nisj/PycharmProjects/BiDataProc/love/mysqlData2HiveBySqoop0329.py
/Users/nisj/PycharmProjects/BiDataProc/love/mysqlData2HiveBySqoop.py
1、与建表建分区结合,暂没有考虑传输特定几个字段的情况,可以在以后的版本中优化。
2、总体目前在同一个脚本中,以后可以考虑根据功能分开。
3、总体包含元数据脚本获取模块、hive建表模块、数据传输模块。
Python自动化拉取Mysql数据并装载到Hive(V3.0)--> https://blog.csdn.net/BabyFish13/article/details/73885033
Python自动化拉取Mysql数据并装载到Hive(V2.0)--> http://blog.csdn.net/babyfish13/article/details/70792158
Python自动化拉取Mysql数据并装载到Hive--> http://blog.csdn.net/babyfish13/article/details/73618331
有两个版本:第一个版本通过sqoop在Mysql与Hive之间表对表传输,其间可以指定过滤条件,也支持多分表传输,但每张分表都要单独调用一次sqoop,速度较慢,效率会受影响;第二个版本通过sqoop的query参数进行传输,变量及参数在sql语句中进行控制,所以效率会好一些。
其实, 无论是通过sqoop、或是数据select出来再load,或者是通过datax传输,其只是手段,要形成方案,都需要通过python或shell将其串联起来,才能供我们随心所欲的使用。
1、sqoop表对表传输
/Users/nisj/PycharmProjects/BiDataProc/love/mysqlData2HiveBySqoop0329.py
# -*- coding=utf-8 -*-
"""Load MySQL tables into day-partitioned Hive tables with ``sqoop import --table``.

The script has three stages: read the source table's column list from MySQL's
information_schema, (re)create the matching Hive table in the ``ods``
database, and import one ``pt_day`` partition per run day.  A sharded source
(tabType ``submeter-N``) is imported with one sqoop call per sub-table
``<name>_0`` .. ``<name>_<N-1>``.
"""
import os
import re
import sys
import warnings
import datetime

warnings.filterwarnings("ignore")

# src Database config
srcMysqlConfig_jellyfish_server = {
    'host': 'MysqlHost',
    # 'host': 'MysqlHost',
    'user': 'MysqlUser',
    'passwd': 'MysqlPass',
    'port': 50506,
    'db': 'jellyfish_server'
}


def dateRange(beginDate, endDate):
    """Return every day from beginDate to endDate, inclusive, as 'YYYY-MM-DD' strings.

    Both bounds are parsed as dates before comparing, so input that is not
    zero-padded (e.g. '2018-3-9') is ordered correctly.  An empty list is
    returned when beginDate is later than endDate.

    Raises:
        ValueError: if either bound does not match the %Y-%m-%d format.
    """
    day = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    lastDay = datetime.datetime.strptime(endDate, "%Y-%m-%d")
    dates = []
    while day <= lastDay:
        dates.append(day.strftime("%Y-%m-%d"))
        day = day + datetime.timedelta(1)
    return dates


def getSrcMysqlConfig(srcMysql_config):
    """Return (host, port, user, passwd, db) taken from a source-database config dict."""
    return (srcMysql_config['host'], srcMysql_config['port'], srcMysql_config['user'],
            srcMysql_config['passwd'], srcMysql_config['db'])


def _templatePostfix(tabType):
    """Return the suffix of the MySQL table whose structure is used as the template."""
    if tabType == 'single':
        return ''
    if 'submeter' in str(tabType):
        # All sub-tables are assumed to share the structure of sub-table 0.
        return '_0'
    raise ValueError("tabType must be 'single' or 'submeter-N', got {0!r}".format(tabType))


def _submeterPostfixes(tabType):
    """Return the table-name suffixes to import: [''] or ['_0', ..., '_<N-1>']."""
    if tabType == 'single':
        return ['']
    if 'submeter' in str(tabType):
        try:
            count = int(str(tabType).replace('submeter-', ''))
        except ValueError:
            count = 0
        if count >= 1:
            return ['_' + str(index) for index in range(count)]
    raise ValueError("tabType must be 'single' or 'submeter-N' with N >= 1, got {0!r}".format(tabType))


def _runShell(command, label):
    """Run command through os.system and report a non-zero exit status on stderr."""
    status = os.system(command)
    if status != 0:
        # Only the label is printed: the command line embeds the MySQL password.
        sys.stderr.write('[{0}] exited with status {1}\n'.format(label, status))
    return status


def _buildCreateScript(db, src_tabName, columns):
    """Build the Hive 'drop + create table' script from (column, hive_type) pairs."""
    if not columns:
        # Refuse to emit a script that would drop the Hive table and then fail
        # to recreate it because the column list is empty.
        raise RuntimeError('no column metadata found for {0}.{1}'.format(db, src_tabName))
    # The backslash before each backtick survives into the shell command, where
    # it stops the double-quoted hive -e argument from starting a command
    # substitution.
    columnDdl = ',\n'.join('\\`{0}\\` {1}'.format(name, hiveType) for name, hiveType in columns)
    return (
        "use ods;\n"
        "drop table if exists {table};\n"
        "create table {table}(\n"
        "{columns}) partitioned by (\\`pt_day\\` string) "
        "row format delimited fields terminated by '\t' lines terminated by '\n' "
        "location 'hdfs://emr-cluster/user/hive/warehouse/ods.db/{table}';"
    ).format(table='{0}_{1}'.format(db, src_tabName), columns=columnDdl)


def getMysqlTabCreateScript(srcMysql_config, src_tabName, tabType):
    """Return the Hive DDL script that recreates ods.<db>_<src_tabName>.

    The column list is read from information_schema through the mysql command
    line client; MySQL types ending in 'int' become bigint, everything else
    becomes string.

    Args:
        srcMysql_config: dict with host, port, user, passwd and db keys.
        src_tabName: source table name without any sub-table suffix.
        tabType: 'single', or 'submeter-N' for a source split into sub-tables.

    Raises:
        ValueError: if tabType is neither 'single' nor a 'submeter' type.
        RuntimeError: if no column metadata could be read from MySQL.
    """
    # Parameter initialization
    src_postfix = _templatePostfix(tabType)
    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)

    command = (
        "source /etc/profile; "
        "/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} "
        "-N -e\"set names utf8; "
        "select a2.column_name,case when a2.data_type like '%int' then 'bigint' else 'string' end data_type "
        "from information_schema.TABLES a1 "
        "left join information_schema.columns a2 "
        "on a1.TABLE_SCHEMA=a2.TABLE_SCHEMA and a1.TABLE_NAME=a2.TABLE_NAME "
        "where a1.TABLE_SCHEMA='{db}' and a1.table_name ='{src_tabName}{src_postfix}' "
        "order by a2.ORDINAL_POSITION;\" "
    ).format(host=host, port=port, user=user, passwd=passwd, db=db,
             src_tabName=src_tabName, src_postfix=src_postfix)

    pipe = os.popen(command)
    try:
        srcTabStructure = pipe.readlines()
    finally:
        pipe.close()

    srcTabCol_list = []
    for stcList in srcTabStructure:
        stc = stcList.replace('\n', '').split('\t')
        if len(stc) >= 2 and stc[0]:
            srcTabCol_list.append((stc[0], stc[1]))

    return _buildCreateScript(db, src_tabName, srcTabCol_list)


def HiveCreateTab(srcMysql_config, src_tabName, tabType):
    """Drop and recreate the Hive table that mirrors the given MySQL table."""
    TabCreateScript = getMysqlTabCreateScript(srcMysql_config, src_tabName, tabType)
    _runShell("""/usr/lib/hive-current/bin/hive -e "{TabCreateScript}" """.format(TabCreateScript=TabCreateScript),
              'hive create table')


def mysqlData2Hive(srcMysql_config, src_tabName, tabType, runDay, whereCondition):
    """Reload the pt_day=runDay partition of ods.<db>_<src_tabName> from MySQL.

    The partition is dropped and re-added, then every source (sub-)table is
    appended into the partition directory with its own sqoop import.

    Args:
        srcMysql_config: dict with host, port, user, passwd and db keys.
        src_tabName: source table name without any sub-table suffix.
        tabType: 'single' or 'submeter-N' (N sub-tables named <name>_0 ...).
        runDay: partition value, e.g. '2018-03-25'.
        whereCondition: SQL filter handed to sqoop --where; an empty value
            imports all rows.  It is placed inside shell double quotes, so it
            must not contain double quotes itself.

    Raises:
        ValueError: if tabType is not a supported value (checked before the
            partition is touched).
    """
    # Parameter initialization; tabType is validated before anything is dropped.
    submeterPostfixes = _submeterPostfixes(tabType)
    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)

    # add partitions
    _runShell((
        "source /etc/profile; "
        "/usr/lib/hive-current/bin/hive -e \" use ods; "
        "alter table {db}_{src_tabName} drop if exists partition (pt_day='{runDay}'); "
        "alter table {db}_{src_tabName} add partition (pt_day='{runDay}');\" "
    ).format(db=db, src_tabName=src_tabName, runDay=runDay), 'hive reset partition')

    # An empty filter means "all rows": leave --where out instead of passing "".
    whereArgument = '--where "{0}" '.format(whereCondition) if whereCondition else ''

    # partition table data load, one sqoop call per source (sub-)table
    for submeterPostfix in submeterPostfixes:
        _runShell((
            "source /etc/profile; "
            "sqoop import "
            "--connect \"jdbc:mysql://{host}:{port}/{db}?zeroDateTimeBehavior=convertToNull\" "
            "--username {user} "
            "--password {passwd} "
            "--table {src_tabName}{submeterPostfix} "
            "--target-dir hdfs://emr-cluster/user/hive/warehouse/ods.db/{db}_{src_tabName}/pt_day={runDay} "
            "--append "
            "--fields-terminated-by '\t' "
            "--lines-terminated-by '\n' "
            "--driver com.mysql.jdbc.Driver -m 1 "
            "{whereArgument}"
        ).format(host=host, port=port, user=user, passwd=passwd, db=db, src_tabName=src_tabName,
                 submeterPostfix=submeterPostfix, runDay=runDay, whereArgument=whereArgument),
            'sqoop import {0}{1}'.format(src_tabName, submeterPostfix))


# Batch Test
if __name__ == '__main__':
    HiveCreateTab(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='live_history_status', tabType='submeter-3')
    for runDay in dateRange(beginDate='2018-03-25', endDate='2018-03-26'):
        mysqlData2Hive(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='live_history_status', tabType='submeter-3', runDay=runDay, whereCondition="substr(updated_time, 1, 10) = '{runDay}'".format(runDay=runDay))

    # Example for an unsharded table without a filter:
    # HiveCreateTab(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='big_fans_detail', tabType='single')
    # for runDay in dateRange(beginDate='2018-03-25', endDate='2018-03-26'):
    #     mysqlData2Hive(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='big_fans_detail', tabType='single', runDay=runDay, whereCondition="")

# 2. Data transfer via the sqoop --query option (article heading; the script path follows)
/Users/nisj/PycharmProjects/BiDataProc/love/mysqlData2HiveBySqoop.py
# -*- coding=utf-8 -*-
"""Load MySQL tables into day-partitioned Hive tables with ``sqoop import --query``.

The script has three stages: read the source table's column list from MySQL's
information_schema, (re)create the matching Hive table in the ``ods``
database, and import one ``pt_day`` partition per run day.  A sharded source
(tabType ``submeter-N``) is read in a single sqoop call through one
``union all`` query over the sub-tables ``<name>_0`` .. ``<name>_<N-1>``.
"""
import os
import re
import sys
import warnings
import datetime

warnings.filterwarnings("ignore")

# src Database config
srcMysqlConfig_jellyfish_server = {
    'host': 'MysqlHost',
    # 'host': 'MysqlHost',
    'user': 'MysqlUser',
    'passwd': 'MysqlPass',
    'port': 50506,
    'db': 'jellyfish_server'
}


def dateRange(beginDate, endDate):
    """Return every day from beginDate to endDate, inclusive, as 'YYYY-MM-DD' strings.

    Both bounds are parsed as dates before comparing, so input that is not
    zero-padded (e.g. '2018-3-9') is ordered correctly.  An empty list is
    returned when beginDate is later than endDate.

    Raises:
        ValueError: if either bound does not match the %Y-%m-%d format.
    """
    day = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    lastDay = datetime.datetime.strptime(endDate, "%Y-%m-%d")
    dates = []
    while day <= lastDay:
        dates.append(day.strftime("%Y-%m-%d"))
        day = day + datetime.timedelta(1)
    return dates


def getSrcMysqlConfig(srcMysql_config):
    """Return (host, port, user, passwd, db) taken from a source-database config dict."""
    return (srcMysql_config['host'], srcMysql_config['port'], srcMysql_config['user'],
            srcMysql_config['passwd'], srcMysql_config['db'])


def _submeterPostfixes(tabType):
    """Return the table-name suffixes to read: [''] or ['_0', ..., '_<N-1>']."""
    if tabType == 'single':
        return ['']
    if 'submeter' in str(tabType):
        try:
            count = int(str(tabType).replace('submeter-', ''))
        except ValueError:
            count = 0
        if count >= 1:
            return ['_' + str(index) for index in range(count)]
    raise ValueError("tabType must be 'single' or 'submeter-N' with N >= 1, got {0!r}".format(tabType))


def _runShell(command, label):
    """Run command through os.system and report a non-zero exit status on stderr."""
    status = os.system(command)
    if status != 0:
        # Only the label is printed: the command line embeds the MySQL password.
        sys.stderr.write('[{0}] exited with status {1}\n'.format(label, status))
    return status


def _buildCreateScript(db, src_tabName, columns):
    """Build the Hive 'drop + create table' script from (column, hive_type) pairs."""
    if not columns:
        # Refuse to emit a script that would drop the Hive table and then fail
        # to recreate it because the column list is empty.
        raise RuntimeError('no column metadata found for {0}.{1}'.format(db, src_tabName))
    # The backslash before each backtick survives into the shell command, where
    # it stops the double-quoted hive -e argument from starting a command
    # substitution.
    columnDdl = ',\n'.join('\\`{0}\\` {1}'.format(name, hiveType) for name, hiveType in columns)
    return (
        "use ods;\n"
        "drop table if exists {table};\n"
        "create table {table}(\n"
        "{columns}) partitioned by (\\`pt_day\\` string) "
        "row format delimited fields terminated by '\t' lines terminated by '\n' "
        "location 'hdfs://emr-cluster/user/hive/warehouse/ods.db/{table}';"
    ).format(table='{0}_{1}'.format(db, src_tabName), columns=columnDdl)


def _buildSelectScript(src_tabName, columnNames, postfixes, whereCondition):
    """Build the sqoop free-form query: one select per (sub-)table joined by union all."""
    selectHalf = 'select ' + ','.join(columnNames)
    # Every branch carries the $CONDITIONS token sqoop substitutes, and the
    # caller's filter is parenthesised so an "or" inside it cannot escape the
    # "and".  The backslash keeps the shell from expanding $CONDITIONS inside
    # the double-quoted --query argument.
    if whereCondition:
        predicate = '(' + whereCondition + ') and \\$CONDITIONS'
    else:
        predicate = '\\$CONDITIONS'
    branches = [
        '{0} from {1}{2} where {3}'.format(selectHalf, src_tabName, postfix, predicate)
        for postfix in postfixes
    ]
    return '\nunion all\n'.join(branches) + ';'


def getMysqlTabScript(srcMysql_config, src_tabName, tabType, whereCondition):
    """Return (TabCreateScript, TabSelectScript) for one source table.

    TabCreateScript is the Hive DDL that recreates ods.<db>_<src_tabName>;
    TabSelectScript is the query passed to sqoop --query.  The column list is
    read from information_schema through the mysql command line client; MySQL
    types ending in 'int' become bigint, everything else becomes string.

    Args:
        srcMysql_config: dict with host, port, user, passwd and db keys.
        src_tabName: source table name without any sub-table suffix.
        tabType: 'single' or 'submeter-N' (N sub-tables named <name>_0 ...).
        whereCondition: SQL filter applied to every (sub-)table; may be empty.

    Raises:
        ValueError: if tabType is not a supported value.
        RuntimeError: if no column metadata could be read from MySQL.
    """
    # Parameter initialization; tabType is validated before MySQL is queried.
    submeterPostfixes = _submeterPostfixes(tabType)
    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)

    # The first (sub-)table serves as the structure template; sub-tables are
    # assumed to share one structure.
    src_postfix = submeterPostfixes[0]

    command = (
        "source /etc/profile; "
        "/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} "
        "-N -e\"set names utf8; "
        "select a2.column_name,case when a2.data_type like '%int' then 'bigint' else 'string' end data_type "
        "from information_schema.TABLES a1 "
        "left join information_schema.columns a2 "
        "on a1.TABLE_SCHEMA=a2.TABLE_SCHEMA and a1.TABLE_NAME=a2.TABLE_NAME "
        "where a1.TABLE_SCHEMA='{db}' and a1.table_name ='{src_tabName}{src_postfix}' "
        "order by a2.ORDINAL_POSITION;\" "
    ).format(host=host, port=port, user=user, passwd=passwd, db=db,
             src_tabName=src_tabName, src_postfix=src_postfix)

    pipe = os.popen(command)
    try:
        srcTabStructure = pipe.readlines()
    finally:
        pipe.close()

    srcTabCol_list = []
    for stcList in srcTabStructure:
        stc = stcList.replace('\n', '').split('\t')
        if len(stc) >= 2 and stc[0]:
            srcTabCol_list.append((stc[0], stc[1]))

    TabCreateScript = _buildCreateScript(db, src_tabName, srcTabCol_list)
    TabSelectScript = _buildSelectScript(src_tabName, [column[0] for column in srcTabCol_list],
                                         submeterPostfixes, whereCondition)
    return TabCreateScript, TabSelectScript


def HiveCreateTab(srcMysql_config, src_tabName, tabType):
    """Drop and recreate the Hive table that mirrors the given MySQL table."""
    TabCreateScript = getMysqlTabScript(srcMysql_config, src_tabName, tabType, whereCondition="")[0]
    _runShell("""/usr/lib/hive-current/bin/hive -e "{TabCreateScript}" """.format(TabCreateScript=TabCreateScript),
              'hive create table')


def mysqlData2Hive(srcMysql_config, src_tabName, tabType, runDay, whereCondition):
    """Reload the pt_day=runDay partition of ods.<db>_<src_tabName> from MySQL.

    The partition is dropped and re-added, then a single sqoop import writes
    the result of the generated query into the partition directory.

    Args:
        srcMysql_config: dict with host, port, user, passwd and db keys.
        src_tabName: source table name without any sub-table suffix.
        tabType: 'single' or 'submeter-N' (N sub-tables named <name>_0 ...).
        runDay: partition value, e.g. '2018-03-26'.
        whereCondition: SQL filter applied to every (sub-)table; may be empty.
            It ends up inside shell double quotes, so it must not contain
            double quotes itself.

    Raises:
        ValueError: if tabType is not a supported value.
        RuntimeError: if no column metadata could be read from MySQL.
    """
    # Parameter initialization
    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)
    TabSelectScript = getMysqlTabScript(srcMysql_config, src_tabName, tabType, whereCondition)[1]

    # add partitions
    _runShell((
        "source /etc/profile; "
        "/usr/lib/hive-current/bin/hive -e \" use ods; "
        "alter table {db}_{src_tabName} drop if exists partition (pt_day='{runDay}'); "
        "alter table {db}_{src_tabName} add partition (pt_day='{runDay}');\" "
    ).format(db=db, src_tabName=src_tabName, runDay=runDay), 'hive reset partition')

    # partition table data load
    # A free-form query without --split-by can only be imported by one mapper,
    # so the mapper count is given once; the former trailing "-m 8"
    # contradicted "--num-mappers 1".
    _runShell((
        "source /etc/profile; "
        "sqoop import "
        "--connect \"jdbc:mysql://{host}:{port}/{db}?zeroDateTimeBehavior=convertToNull\" "
        "--username {user} "
        "--password {passwd} "
        "--query \"{TabSelectScript}\" "
        "--target-dir hdfs://emr-cluster/user/hive/warehouse/ods.db/{db}_{src_tabName}/pt_day={runDay} "
        "--delete-target-dir "
        "--fields-terminated-by '\t' "
        "--lines-terminated-by '\n' "
        "--num-mappers 1 "
        "--compress "
        "--compression-codec org.apache.hadoop.io.compress.SnappyCodec "
        "--direct "
        "--driver com.mysql.jdbc.Driver "
    ).format(host=host, port=port, user=user, passwd=passwd, db=db, src_tabName=src_tabName,
             runDay=runDay, TabSelectScript=TabSelectScript),
        'sqoop import {0}'.format(src_tabName))


# Batch Test
if __name__ == '__main__':
    HiveCreateTab(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='live_history_status', tabType='submeter-256')
    HiveCreateTab(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='big_fans_detail', tabType='single')
    for runDay in dateRange(beginDate='2018-03-26', endDate='2018-03-28'):
        mysqlData2Hive(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='live_history_status', tabType='submeter-256', runDay=runDay, whereCondition="substr(updated_time, 1, 10) = '{runDay}'".format(runDay=runDay))
        mysqlData2Hive(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='big_fans_detail', tabType='single', runDay=runDay, whereCondition="point=20000")

# Notes (article heading; the numbered remarks follow):
1、与建表建分区结合,暂没有考虑传输特定几个字段的情况,可以在以后的版本中优化。
2、总体目前在同一个脚本中,以后可以考虑根据功能分开。
3、总体包含元数据脚本获取模块、hive建表模块、数据传输模块。