此代码通过将需要的数据分别从多个mysql数据源及hive数据源分别获取后,在python中进行整合合并,然后插入到时mysql数据库中。
hive库中本有这些mysql的表,但由于只是用到了hive中的部分数据,在hive巨量数据中查询效率极低;而在mysql中进行其中一小部分的数据查询则能极大的提高查询效率。所以,代码涉及的数据源等处理相对比较复杂。
数据本可以在hive中一个查询脚本跑完;也可以将mysql数据分别获取后插入到mysql临时表,然后关联插入到mysql目标表中。通过python脚本,后者是一个比较迅捷一些的方案。但本文中,采用的是所有数据在python内存中处理,运行速度上得来了提升,但脚本写的也确实够烦;且不具备很好的复用性。
学习点:
1、Python的多个数据结果左关联处理方法
2、mysql批量数据的插入
3、python中数据的多参数并行跑批方法
/Users/nisj/PycharmProjects/BiDataProc/love/moreSrcDataCalc.py
hive库中本有这些mysql的表,但由于只是用到了hive中的部分数据,在hive巨量数据中查询效率极低;而在mysql中进行其中一小部分的数据查询则能极大的提高查询效率。所以,代码涉及的数据源等处理相对比较复杂。
数据本可以在hive中一个查询脚本跑完;也可以将mysql数据分别获取后插入到mysql临时表,然后关联插入到mysql目标表中。通过python脚本,后者是一个比较迅捷一些的方案。但本文中,采用的是所有数据在python内存中处理,运行速度上得来了提升,但脚本写的也确实够烦;且不具备很好的复用性。
学习点:
1、Python的多个数据结果左关联处理方法
2、mysql批量数据的插入
3、python中数据的多参数并行跑批方法
/Users/nisj/PycharmProjects/BiDataProc/love/moreSrcDataCalc.py
# -*- coding=utf-8 -*- import os import re import time import datetime import warnings import threadpool warnings.filterwarnings("ignore") srcMysqlConfig_jellyfish_hadoop_stat={ 'host': 'MysqlHost', # 'host': 'MysqlHost', 'user': 'MysqlUser', 'passwd': 'MysqlPass', 'port': 6605, 'db': 'jellyfish_hadoop_stat' } srcMysqlConfig_jellyfish_stat={ 'host': 'MysqlHost', # 'host': 'MysqlHost', 'user': 'MysqlUser', 'passwd': 'MysqlPass', 'port': 50515, 'db': 'jellyfish_stat' } srcMysqlConfig_jellyfish_wealth = { 'host': 'MysqlHost', # 'host': 'MysqlHost', 'user': 'MysqlUser', 'passwd': 'MysqlPass', 'port': 10019, 'db': 'jellyfish_wealth' } srcMysqlConfig_jellyfish_server = { 'host': 'MysqlHost', # 'host': 'MysqlHost', 'user': 'MysqlUser', 'passwd': 'MysqlPass', 'port': 50506, 'db': 'jellyfish_server' } def dateRange(beginDate, endDate): dates=[] dt=datetime.datetime.strptime(endDate, "%Y-%m-%d") date=endDate[:] while date >= beginDate: dates.append(date) dt=dt - datetime.timedelta(1) date=dt.strftime("%Y-%m-%d") return dates def hiveDataCalc(runDay,roomIdList): messageSend_Data=os.popen("""/usr/lib/hive-current/bin/hive -e " \ select \ parms['roomId'] roomid,count(1) as mess_num, \ count(distinct parms['uid']) as mess_user, \ count(distinct parms['ip']) as mess_ip \ from oss_bi_all_message_send_log \ where pt_day='{runDay}' and parms['roomId'] in ({roomIdStr}) \ group by parms['roomId'] \ " """.format(runDay=runDay, roomIdStr=str(roomIdList)[1:-1])).readlines(); messageSend_Data_list=[] for ms_list in messageSend_Data: ms=re.split('\t', ms_list.replace('\n', '')) messageSend_Data_list.append(ms) return messageSend_Data_list # for messageSendData in messageSend_Data_list: # print messageSendData[0],messageSendData[1],messageSendData[2],messageSendData[3] def mysqlDataCalc(runDay,roomIdList): onlineCount_DataList = [] for roomId in roomIdList: onlineCount_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \ select room_id, max(online_real_count) as online_real_count \ from room_online_stat_{tabSuffix} \ where substr(created_time,1,10)='{runDay}' and room_id ={roomId} \ group by room_id \ ; \ " """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256), host=srcMysqlConfig_jellyfish_stat['host'], port=srcMysqlConfig_jellyfish_stat['port'], user=srcMysqlConfig_jellyfish_stat['user'], passwd=srcMysqlConfig_jellyfish_stat['passwd'], db=srcMysqlConfig_jellyfish_stat['db'])).readlines(); onlineCount_Data_list = [] for oc_list in onlineCount_Data: oc = re.split('\t', oc_list.replace('\n', '')) onlineCount_Data_list.append(oc) onlineCount_DataList.append(onlineCount_Data_list) # print onlineCount_DataList giftPoint_DataList = [] for roomId in roomIdList: giftPoint_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \ select room_id,sum(point) as gift_point, \ count(distinct uid) as point_user \ from room_point_detail_{tabSuffix} \ where substr(created_time,1,10)='{runDay}' and room_id ={roomId} \ group by room_id \ ; \ " """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256), host=srcMysqlConfig_jellyfish_wealth['host'], port=srcMysqlConfig_jellyfish_wealth['port'], user=srcMysqlConfig_jellyfish_wealth['user'], passwd=srcMysqlConfig_jellyfish_wealth['passwd'], db=srcMysqlConfig_jellyfish_wealth['db'])).readlines(); giftPoint_Data_list = [] for gp_list in giftPoint_Data: gp = re.split('\t', gp_list.replace('\n', '')) giftPoint_Data_list.append(gp) giftPoint_DataList.append(giftPoint_Data_list) # print giftPoint_DataList roomSubscr_DataList = [] for roomId in roomIdList: roomSubscr_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \ select room_id, count(distinct uid) cnt from room_subscriber_uid_{tabSuffix} where substr(created_time,1,10)='{runDay}' and state=0 and room_id ={roomId} group by room_id; \ " """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256), host=srcMysqlConfig_jellyfish_server['host'], port=srcMysqlConfig_jellyfish_server['port'], user=srcMysqlConfig_jellyfish_server['user'], passwd=srcMysqlConfig_jellyfish_server['passwd'], db=srcMysqlConfig_jellyfish_server['db'])).readlines(); roomSubscr_Data_list = [] for rs_list in roomSubscr_Data: rs = re.split('\t', rs_list.replace('\n', '')) roomSubscr_Data_list.append(rs) roomSubscr_DataList.append(roomSubscr_Data_list) # print roomSubscr_DataList roomRank_DataList = [] for roomId in roomIdList: roomRank_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \ select room_id,min(rank) rank,sum(duration) duration from room_category_rank where substr(created_time,1,10)='{runDay}' and category_id=0 and room_id ={roomId} group by room_id; \ " """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256), host=srcMysqlConfig_jellyfish_server['host'], port=srcMysqlConfig_jellyfish_server['port'], user=srcMysqlConfig_jellyfish_server['user'], passwd=srcMysqlConfig_jellyfish_server['passwd'], db=srcMysqlConfig_jellyfish_server['db'])).readlines(); roomRank_Data_list = [] for rr_list in roomRank_Data: rr = re.split('\t', rr_list.replace('\n', '')) roomRank_Data_list.append(rr) roomRank_DataList.append(roomRank_Data_list) # print roomRank_DataList return onlineCount_DataList,giftPoint_DataList,roomSubscr_DataList,roomRank_DataList def DataCom2mysql(runDay, roomIdList): # 参数初始化赋值 host=srcMysqlConfig_jellyfish_hadoop_stat['host'] port=srcMysqlConfig_jellyfish_hadoop_stat['port'] user=srcMysqlConfig_jellyfish_hadoop_stat['user'] passwd=srcMysqlConfig_jellyfish_hadoop_stat['passwd'] db=srcMysqlConfig_jellyfish_hadoop_stat['db'] os.system("""source /etc/profile; \ mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \ delete from report_room_live_detail_daily_stat where \`date\`='{runDay}' and room_id in({roomIdStr}); " """.format(host=host, port=port, user=user, passwd=passwd, db=db, runDay=runDay, roomIdStr=str(roomIdList)[1:-1])) onlineCount_DataList, giftPoint_DataList, roomSubscr_DataList, roomRank_DataList = mysqlDataCalc(runDay, roomIdList) # print onlineCount_DataList,giftPoint_DataList,roomSubscr_DataList,roomRank_DataList lineCountcomList=[] roomIdonlineCountHave = [] for roomId in roomIdList: for onlineCount in onlineCount_DataList: if len(onlineCount)!= 0 and int(roomId) == int(onlineCount[0][0]): roomIdonlineCountHave.append(roomId) lineCountcomList.append([roomId,int(onlineCount[0][1])]) for roomId in roomIdList: if roomId not in roomIdonlineCountHave: lineCountcomList.append([roomId,0]) # print lineCountcomList giftPointcomList=[] roomIdgiftPointHave = [] for roomId in roomIdList: for giftPoint in giftPoint_DataList: if len(giftPoint)!= 0 and int(roomId) == int(giftPoint[0][0]): roomIdgiftPointHave.append(roomId) giftPointcomList.append([roomId,int(giftPoint[0][1]),int(giftPoint[0][2])]) for roomId in roomIdList: if roomId not in roomIdgiftPointHave: giftPointcomList.append([roomId,0,0]) # print giftPointcomList roomSubscrcomList=[] roomIdroomSubscrHave = [] for roomId in roomIdList: for roomSubscr in roomSubscr_DataList: if len(roomSubscr)!= 0 and int(roomId) == int(roomSubscr[0][0]): roomIdroomSubscrHave.append(roomId) roomSubscrcomList.append([roomId,int(roomSubscr[0][1])]) for roomId in roomIdList: if roomId not in roomIdroomSubscrHave: roomSubscrcomList.append([roomId,0]) # print roomSubscrcomList roomRankcomList=[] roomIdroomRankHave = [] for roomId in roomIdList: for roomRank in roomRank_DataList: if len(roomRank)!= 0 and int(roomId) == int(roomRank[0][0]): roomIdroomRankHave.append(roomId) roomRankcomList.append([roomId,int(roomRank[0][1]),int(roomRank[0][2])]) for roomId in roomIdList: if roomId not in roomIdroomRankHave: roomRankcomList.append([roomId,0,0]) # print roomRankcomList messageSend_Data_list=hiveDataCalc(runDay, roomIdList) messageSendcomList=[] roomIdmessageSendHave = [] for roomId in roomIdList: for messageSend in messageSend_Data_list: if len(messageSend)!= 0 and int(roomId) == int(messageSend[0]): roomIdmessageSendHave.append(roomId) messageSendcomList.append([roomId,int(messageSend[1]),int(messageSend[2]),int(messageSend[3])]) for roomId in roomIdList: if roomId not in roomIdmessageSendHave: messageSendcomList.append([roomId,0,0,0]) # print messageSendcomList Sum_Data_list=[] for msd in messageSendcomList: for old in lineCountcomList: for gpd in giftPointcomList: for rsd in roomSubscrcomList: for rrd in roomRankcomList: if msd[0] == old[0] and msd[0] == gpd[0] and msd[0] == rsd[0] and msd[0] == rrd[0]: Sum_Data_list.append([msd[0], msd[1], msd[2], msd[3], 0, 0, old[1], gpd[1], gpd[2], rrd[2], rsd[1], rrd[1]]) # for sumd in Sum_Data_list: # print sumd # result insert table i=0 insert_mysql_sql="""/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \ insert into report_room_live_detail_daily_stat(\`date\`,room_id,barrage_count,barrage_user_count,barrage_ip_count,view_user,view_duration,view_peak_user,gift_num,gift_user,anchor_duration,subscriber_count,rank) \ values """.format(host=host, port=port, user=user, passwd=passwd, db=db) for sumd in Sum_Data_list: room_id=sumd[0] barrage_count=sumd[1] barrage_user_count=sumd[2] barrage_ip_count=sumd[3] view_user=sumd[4] view_duration=sumd[5] view_peak_user=sumd[6] gift_num=sumd[7] gift_user=sumd[8] anchor_duration=sumd[9] subscriber_count=sumd[10] rank=sumd[11] etl_time=time.strftime('%Y-%m-%d %X', time.localtime()) i += 1 insert_mysql_sql=insert_mysql_sql + """('{date}','{room_id}','{barrage_count}','{barrage_user_count}','{barrage_ip_count}','{view_user}','{view_duration}','{view_peak_user}','{gift_num}','{gift_user}','{anchor_duration}','{subscriber_count}','{rank}'),""".format(host=host, port=port, user=user, passwd=passwd, db=db, date='{runDay}'.format(runDay=runDay),room_id=room_id, barrage_count=barrage_count, barrage_user_count=barrage_user_count, barrage_ip_count=barrage_ip_count,view_user=view_user,view_duration=view_duration,view_peak_user=view_peak_user,gift_num=gift_num,gift_user=gift_user,anchor_duration=anchor_duration,subscriber_count=subscriber_count,rank=rank) if (i % 1000 == 0): insert_mysql_sql=insert_mysql_sql.rstrip(',') + """ ;" """ os.system(insert_mysql_sql) insert_mysql_sql="""/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \ insert into report_room_live_detail_daily_stat(date,room_id,barrage_count,barrage_user_count,barrage_ip_count,view_user,view_duration,view_peak_user,gift_num,gift_user,anchor_duration,subscriber_count,rank) \ values """.format(host=host, port=port, user=user, passwd=passwd, db=db) insert_mysql_sql=insert_mysql_sql.rstrip(',') + """ ;" """ os.system(insert_mysql_sql) # run parallel Batch roomIdList=[19467,2469625,3755657,96969,95277] runDayRoomIdList=[] for runDay in dateRange(beginDate='2015-11-27', endDate='2018-04-11'): runDayRoomIdList.append(([runDay, roomIdList], None)) # print runDayRoomIdList now_time=time.strftime('%Y-%m-%d %X', time.localtime()) print "当前时间是:",now_time requests=[] request_DataCom2mysql_batchCtl=threadpool.makeRequests(DataCom2mysql, runDayRoomIdList) requests.extend(request_DataCom2mysql_batchCtl) main_pool=threadpool.ThreadPool(38) [main_pool.putRequest(req) for req in requests] if __name__ == '__main__': while True: try: time.sleep(30) main_pool.poll() except KeyboardInterrupt: print("**** Interrupted!") break except threadpool.NoResultsPending: break if main_pool.dismissedWorkers: print("Joining all dismissed worker threads...") main_pool.joinAllDismissedWorkers() now_time=time.strftime('%Y-%m-%d %X', time.localtime()) print "当前时间是:",now_time # run serial Batch # roomIdList=[19467,2469625,3755657,96969,95277] # for runDay in dateRange(beginDate='2016-10-01', endDate='2016-10-08'): # DataCom2mysql(runDay=runDay, roomIdList=roomIdList)