此文中,相当于是对【Hive数据汇总导入Mysql(Draw lessons from colleagues)】一文的整理。
1、在进行代码跑批之前,需要将mysql里的目标表,按要求先建好。
2、需要增减任务时,只需要在【configSql.py】增减相应的配置即可。
3、采用beeline客户端进行hive数据的汇总及查询;实际上查询sql可以任意,也可以多个,只要最终返回一个带结果的数据集就可以了;在其之前可以有数据删除及临时表创建等语句。
4、mysql数据插入sql分两块,数据回滚及hive汇总数据的插入;当然也可以根据需要进行mysql数据的更新。
5、设计的目的是将hive中的汇总数据插入到mysql目标表中,但其实非汇总的明细数据也一样;每次处理的结果数据集建议在万条记录以下,如果太多了,插入mysql处理起来比较费时费力。
6、针对数据量较多的情况,考虑了事务性处理以提高效率;但数据量还是要尽量控制在万条以内。
同时,可以参考如下相关文章:
Hive汇总统计数据自动化传输到Mysql数据库-跑批参数文本配置及提取使用--> http://blog.csdn.net/babyfish13/article/details/73188712
Hive汇总统计数据自动化传输到Mysql数据库--> http://blog.csdn.net/babyfish13/article/details/72701512
Hive数据汇总导入Mysql(Draw lessons from colleagues)--> https://blog.csdn.net/babyfish13/article/details/78979161
1、连接配置
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/configConn.py
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/configSql.py
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/funReadWrite.py
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/overallPlanning.py
方案结合调度系统,可以进行报表数据的周期性传输。
配置的两个示例,一个是明细数据的直播拉取,一个是hive汇总数据传输;各具有代表意义。
其目标mysql建表语句如下:
1、在进行代码跑批之前,需要将mysql里的目标表,按要求先建好。
2、需要增减任务时,只需要在【configSql.py】增减相应的配置即可。
3、采用beeline客户端进行hive数据的汇总及查询;实际上查询sql可以任意,也可以多个,只要最终返回一个带结果的数据集就可以了;在其之前可以有数据删除及临时表创建等语句。
4、mysql数据插入sql分两块,数据回滚及hive汇总数据的插入;当然也可以根据需要进行mysql数据的更新。
5、设计的目的是将hive中的汇总数据插入到mysql目标表中,但其实非汇总的明细数据也一样;每次处理的结果数据集建议在万条记录以下,如果太多了,插入mysql处理起来比较费时费力。
6、针对数据量较多的情况,考虑了事务性处理以提高效率;但数据量还是要尽量控制在万条以内。
同时,可以参考如下相关文章:
Hive汇总统计数据自动化传输到Mysql数据库-跑批参数文本配置及提取使用--> http://blog.csdn.net/babyfish13/article/details/73188712
Hive汇总统计数据自动化传输到Mysql数据库--> http://blog.csdn.net/babyfish13/article/details/72701512
Hive数据汇总导入Mysql(Draw lessons from colleagues)--> https://blog.csdn.net/babyfish13/article/details/78979161
1、连接配置
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/configConn.py
# -*- coding=utf-8 -*-

# Connection settings for the Hive -> MySQL transfer:
#   hiveConn  - JDBC endpoints of the Hive clusters, keyed by the logical
#               name referenced from each task's reader config in configSql.py.
#   mysqlConn - target MySQL instances, keyed by the logical name referenced
#               from each task's writer config.

hiveConn = {
    "hive_old": {
        "jdbcConn": "jdbc:hive2://HiveHost:10000"
    },
    "hive_new": {
        "jdbcConn": "jdbc:hive2://HivenewHost:10000"
    }
}

mysqlConn = {
    "funnyai_data": {
        "ip": "MysqlHost",
        # "ip": "MysqlHost",  # alternate host, kept commented for reference
        "port": 6603,
        "db": "funnyai_data",
        "username": "MysqlUser",
        "password": "MysqlPass"
    },
    "jellyfish_hadoop_stat": {
        "ip": "MysqlHost",
        "port": 6605,
        "db": "jellyfish_hadoop_stat",
        "username": "MysqlUser",
        "password": "MysqlPass"
    }
}

# --- 2、数据传输SQL配置 (section 2: data-transfer SQL configuration) ---
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/configSql.py
# -*- coding=utf-8 -*-

# Task registry for the Hive -> MySQL transfer.
# Each entry pairs a Hive "reader" (an arbitrary query whose final statement
# returns the result set) with a MySQL "writer" (a clear/rollback statement
# plus a per-row insert template).
# Placeholder conventions:
#   {0}       - the run day passed in by the scheduler (overallPlanning.py);
#   {1}..{N}  - the comma-separated columns of one Hive result row
#               (so a reader row with K columns fills {1}..{K} — insert
#               templates that also use {0} get the run day as an extra
#               leading column).
TaskHive2mysql = {}

# Example 1: plain detail-data pull (no aggregation).
TaskHive2mysql["oss_bi_all_room"] = {
    "enable": True,
    "comment": "房间明细信息",
    "reader": {
        "type": "hive",
        "hiveName": "hive_old",
        "query_sql": """
            select * from oss_bi_all_room where pt_day='2018-04-18' limit 888;
        """
    },
    "writer": {
        "type": "mysql",
        "conn": "jellyfish_hadoop_stat",
        # The where-clause is deliberately commented out: every run truncates
        # the whole test table before re-inserting.
        "clear_sql": """
            delete from xxx_room_test
            -- where created_time="{0}";
        """,
        # Placeholders start at {1}: {0} would be the run day, which this
        # table does not store.
        "insert_sql": """
            insert into xxx_room_test
              (id,live_id,is_profession,creator_uid,subcriber_count,last_live_time,state,created_time,updated_time)
            values ("{1}", "{2}", "{3}", "{4}", "{5}", "{6}", "{7}", "{8}", "{9}") ;
        """
    }
}

# Example 2: Hive aggregation transferred into a MySQL report table.
TaskHive2mysql["xxx_gamezone_bringnew_audience_test_static_daily"] = {
    "enable": True,
    "comment": "游戏专区拉新-新增观众数及其当日充值和弹幕发送及次日留存情况",
    "reader": {
        "type": "hive",
        "hiveName": "hive_old",
        "query_sql": """
            with tab_view_game as(
                select a1.uid,a1.gameid,a1.view_time
                from (select uid,gameid,sum(view_time) view_time,row_number()over(partition by uid order by sum(view_time) desc) rk
                      from recommend_data_view
                      where pt_day=date_add('{0}',-1)
                      group by uid,gameid) a1
                where a1.rk=1),
            tab_newidentifier_newuser as(
                select uid from oss_bi_type_of_all_user where pt_day=date_add('{0}',-1) and type=1 ),
            tab_pay_info as (
                select uid,amount from oss_bi_all_chushou_pay_info where pt_day=date_add('{0}',-1) and state=0),
            tab_message_info as (
                select parms['uid'] uid,parms['liveGameId'] gameid,parms['liveGameName'] gamename,count(*) message_cnt
                from oss_bi_all_message_send_log
                where pt_day=date_add('{0}',-1)
                group by parms['uid'],parms['liveGameId'],parms['liveGameName']),
            tab_view_nextday as(
                select uid,gameid from recommend_data_view where pt_day=date_add('{0}',0) group by uid,gameid)
            select a2.gameid,a6.name gamename,
                   count(distinct a1.uid) new_register_game_view_cnt,
                   count(distinct a3.uid) pay_uid_cnt,
                   sum(a3.amount) pay_amount,
                   count(distinct a4.uid) message_send_uid_cnt,
                   sum(a4.message_cnt) message_send_cnt,
                   count(distinct a5.uid) audience_new_next_remain_cnt,
                   count(distinct a5.uid)/count(distinct a1.uid) audience_new_next_remain_rate,
                   from_unixtime(unix_timestamp()) created_time,
                   from_unixtime(unix_timestamp()) updated_time
            from tab_newidentifier_newuser a1
            inner join tab_view_game a2 on a1.uid=a2.uid
            left join tab_pay_info a3 on a1.uid=a3.uid
            left join tab_message_info a4 on a1.uid=a4.uid
            left join tab_view_nextday a5 on a1.uid=a5.uid
            left join data_chushou_game a6 on a2.gameid=a6.id
            group by a2.gameid,a6.name ;
        """
    },
    "writer": {
        "type": "mysql",
        "conn": "jellyfish_hadoop_stat",
        # Rollback: remove any rows previously written for the run day, so a
        # re-run does not duplicate data.
        "clear_sql": """
            delete from xxx_gamezone_bringnew_audience_test where calc_date="{0}";
        """,
        # {0} is the run day (calc_date); {1}..{11} are the reader's columns.
        "insert_sql": """
            insert into xxx_gamezone_bringnew_audience_test
              (calc_date,game_id,game_name,new_register_game_view_cnt,pay_uid_cnt,pay_amount,message_send_uid_cnt,message_send_cnt,audience_new_next_remain_cnt,audience_new_next_remain_rate,created_time,updated_time)
            values ("{0}", "{1}", "{2}", "{3}" , "{4}", "{5}", "{6}", "{7}", "{8}", "{9}", "{10}", "{11}") ;
        """
    }
}

# --- 3、数据传输及处理具体脚本 (section 3: transfer & processing scripts) ---
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/funReadWrite.py
# -*- coding=utf-8 -*-
import os

from configConn import hiveConn, mysqlConn


def runSqlOnMysqlShell(conn, sql):
    """Build the shell command that runs *sql* on the named MySQL connection.

    Returns the `mysql` command-line string, or None when *conn* is not
    configured in mysqlConn.
    SECURITY NOTE(review): the password is passed on the command line
    (-p{password}), which exposes it to `ps`; consider a defaults file.
    """
    if conn in mysqlConn:
        my_conf = mysqlConn[conn]
        return "mysql -h{0} -P{1} -u{2} -p{3} -e \"\"\"set names utf8; use {4}; {5}\"\"\" ".format(
            my_conf['ip'], my_conf['port'], my_conf['username'],
            my_conf['password'], my_conf['db'], sql)
    else:
        return None


def runSqlOnHive(taskConf, runDay):
    """Run the task's reader query on Hive via beeline for *runDay*.

    Returns the result rows as a list of csv2 lines (the trailing empty
    element from the final newline is dropped).  Raises Exception when the
    query produced no output at all.
    """
    mpNameSql = """ SET mapred.job.name=' hiveSum2Mysql-test ({0}) '; """.format(runDay)
    # Replace every double quote with a single quote and escape backticks so
    # the SQL survives the shell's triple-double-quote wrapping below.
    hiveSql = mpNameSql + taskConf['reader']['query_sql'].format(
        runDay).replace('"', "'").replace('`', '\\`')
    jdbcConn = hiveConn[taskConf['reader']['hiveName']]['jdbcConn']
    querySql = " source ~/.bash_profile && beeline --outputformat=csv2 --showHeader=false -u '{0}' -n hadoop -p '' -e \"\"\"{1}\"\"\" ".format(
        jdbcConn, hiveSql)
    print(querySql)
    queryResultList = os.popen(querySql).read().split("\n")
    if len(queryResultList) > 1:
        return queryResultList[:-1]
    else:
        raise Exception("No query data is come out!")

# print(runSqlOnHive(taskConf=TaskHive2mysql["oss_bi_all_room"], runDay='2018-04-18'))


def runSqlOnMysql(taskConf, runDay, hiveDataResults):
    """Write *hiveDataResults* (csv2 lines) into MySQL for *runDay*.

    First runs the task's clear_sql (rollback), then formats one insert_sql
    per row, batching up to max_bulk_insert statements (default 28) into a
    single mysql invocation for transactional efficiency.
    """
    if 'max_bulk_insert' in taskConf['writer']:
        maxInsert = taskConf['writer']['max_bulk_insert']
    else:
        maxInsert = 28
    runSqlList = []
    # Assemble the clear (rollback) sql.
    clear_sql = taskConf['writer']['clear_sql'].format(runDay).replace(
        '"', "'").replace('`', '\\`')
    if not clear_sql.strip()[-1:] == ';':
        clear_sql += ';'
    runSqlList.append(clear_sql)
    # Assemble the insert sql in batches of maxInsert statements.
    insert_sql = ''
    insert_count = 0
    for line in hiveDataResults:
        if insert_count >= maxInsert:
            runSqlList.append(insert_sql)
            insert_count = 0
            insert_sql = ''
        words = line.strip().split(',')
        # {0} is the run day; the row's columns fill {1}.. onward.
        insert_sql += taskConf['writer']['insert_sql'].format(
            runDay, *words).replace('"', "'").replace('`', '\\`')
        if not insert_sql.strip()[-1:] == ';':
            insert_sql += ';'
        insert_count += 1
    if insert_count > 0:
        runSqlList.append(insert_sql)
    # Execute every batch.
    for run_sql in runSqlList:
        sqlOnMysql_cmd = runSqlOnMysqlShell(
            taskConf['writer']['conn'], run_sql) + " && echo 'sql on mysql exec success!' "
        # print(sqlOnMysql_cmd)
        os.system(sqlOnMysql_cmd)

# --- 4、统筹调度脚本 (section 4: orchestration script) ---
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/overallPlanning.py
# -*- coding=utf-8 -*-
import datetime

from configSql import *
from funReadWrite import runSqlOnHive, runSqlOnMysql


def dataDayRun(taskConf, runDay):
    """Run one configured task for one day: read from Hive, write to MySQL.

    A failed Hive read is logged and the writer then runs with an empty
    result list (so the clear_sql rollback still executes).
    """
    hiveDataResults = []
    # BUG FIX: the original hard-coded TaskHive2mysql["oss_bi_all_room"] here,
    # so the reader type of the *passed* task was never consulted.
    if taskConf['reader']['type'] == 'hive':
        try:
            hiveDataResults = runSqlOnHive(taskConf, runDay)
        except Exception as e:  # py3-compatible form of `except Exception, e`
            print(e)
    if taskConf['writer']['type'] == 'mysql':
        runSqlOnMysql(taskConf, runDay, hiveDataResults)


def dateRange(beginDate, endDate):
    """Return every 'YYYY-MM-DD' day from beginDate to endDate, inclusive."""
    dates = []
    dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    date = beginDate[:]
    while date <= endDate:
        dates.append(date)
        dt = dt + datetime.timedelta(1)
        date = dt.strftime("%Y-%m-%d")
    return dates


if __name__ == '__main__':
    # Backfill example:
    # for runDay in dateRange(beginDate='2018-03-01', endDate='2018-03-31'):
    #     print(runDay)
    dataDayRun(taskConf=TaskHive2mysql["oss_bi_all_room"], runDay='2018-05-06')
    dataDayRun(taskConf=TaskHive2mysql["xxx_gamezone_bringnew_audience_test_static_daily"], runDay='2018-05-06')

# --- 5、说明 (section 5: remarks) ---
方案结合调度系统,可以进行报表数据的周期性传输。
配置的两个示例,一个是明细数据的直播拉取,一个是hive汇总数据传输;各具有代表意义。
其目标mysql建表语句如下:
-- DDL of the two MySQL target tables; they must exist before the batch runs.

show create table xxx_room_test;
CREATE TABLE `xxx_room_test` (
  `id` bigint(20) DEFAULT NULL,
  `live_id` varchar(100) DEFAULT NULL,
  `is_profession` int(11) DEFAULT NULL,
  `creator_uid` bigint(20) DEFAULT NULL,
  `subcriber_count` bigint(20) DEFAULT NULL,
  `last_live_time` varchar(100) DEFAULT NULL,
  `state` bigint(20) DEFAULT NULL,
  `created_time` varchar(100) DEFAULT NULL,
  `updated_time` varchar(100) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

show create table xxx_gamezone_bringnew_audience_test;
CREATE TABLE `xxx_gamezone_bringnew_audience_test` (
  `id` int(11) NOT NULL DEFAULT '0' COMMENT '自增ID',
  `calc_date` date DEFAULT NULL COMMENT '统计日期',
  `game_id` bigint(20) DEFAULT NULL COMMENT '游戏id',
  `game_name` varchar(500) DEFAULT '' COMMENT '游戏名称',
  `new_register_game_view_cnt` int(11) DEFAULT '0' COMMENT '当日新增观众数',
  `pay_uid_cnt` int(11) DEFAULT '0' COMMENT '当日新增观众-充值人数',
  `pay_amount` decimal(22,3) DEFAULT '0.000' COMMENT '当日新增观众-充值金额',
  `message_send_uid_cnt` int(11) DEFAULT '0' COMMENT '当日新增观众-弹幕发送人数',
  `message_send_cnt` int(11) DEFAULT '0' COMMENT '当日新增观众-弹幕发送次数',
  `audience_new_next_remain_cnt` int(11) DEFAULT '0' COMMENT '当日新增观众-次日留存数',
  `audience_new_next_remain_rate` decimal(6,2) DEFAULT '0.00' COMMENT '当日新增观众-次日留存率',
  `created_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  `updated_time` datetime DEFAULT '2999-12-31 23:59:59' COMMENT '统计时间'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;