Python获取Mysql及Hive数据计算并整合后插入到Mysql数据库

此代码将需要的数据分别从多个mysql数据源及hive数据源获取后,在python中进行整合合并,然后插入到mysql数据库中。
hive库中本有这些mysql的表,但由于只是用到了hive中的部分数据,在hive巨量数据中查询效率极低;而在mysql中进行其中一小部分的数据查询则能极大的提高查询效率。所以,代码涉及的数据源等处理相对比较复杂。
数据本可以在hive中一个查询脚本跑完;也可以将mysql数据分别获取后插入到mysql临时表,然后关联插入到mysql目标表中。通过python脚本,后者是一个比较迅捷的方案。但本文中,采用的是所有数据在python内存中处理,运行速度上得到了提升,但脚本写得也确实够烦;且不具备很好的复用性。
学习点:
1、Python的多个数据结果左关联处理方法
2、mysql批量数据的插入
3、python中数据的多参数并行跑批方法

/Users/nisj/PycharmProjects/BiDataProc/love/moreSrcDataCalc.py
# -*- coding=utf-8 -*-
import os
import re
import time
import datetime
import warnings
import threadpool

# Silence every warning for the whole run.
warnings.filterwarnings("ignore")

# Connection settings, one dict per MySQL instance. The host / user / passwd
# values are placeholders and must be replaced with real credentials.

# Target database: receives the aggregated report rows.
srcMysqlConfig_jellyfish_hadoop_stat = dict(
    host='MysqlHost',
    user='MysqlUser',
    passwd='MysqlPass',
    port=6605,
    db='jellyfish_hadoop_stat',
)

# Source database holding the room_online_stat_* tables.
srcMysqlConfig_jellyfish_stat = dict(
    host='MysqlHost',
    user='MysqlUser',
    passwd='MysqlPass',
    port=50515,
    db='jellyfish_stat',
)

# Source database holding the room_point_detail_* tables.
srcMysqlConfig_jellyfish_wealth = dict(
    host='MysqlHost',
    user='MysqlUser',
    passwd='MysqlPass',
    port=10019,
    db='jellyfish_wealth',
)

# Source database holding room_subscriber_uid_* and room_category_rank.
srcMysqlConfig_jellyfish_server = dict(
    host='MysqlHost',
    user='MysqlUser',
    passwd='MysqlPass',
    port=50506,
    db='jellyfish_server',
)

def dateRange(beginDate, endDate):
    """Return every day from endDate back to beginDate, newest first.

    Both bounds are inclusive and are parsed with the "%Y-%m-%d" format, so
    the comparison is done on real dates rather than on strings (a string
    comparison breaks for non-zero-padded input such as '2018-4-9').

    :param beginDate: oldest day to include, e.g. '2018-04-09'.
    :param endDate: newest day to include, e.g. '2018-04-11'.
    :return: list of zero-padded 'YYYY-MM-DD' strings in descending order;
        empty when beginDate is later than endDate.
    :raises ValueError: if either argument does not match "%Y-%m-%d".
    """
    dateFormat = "%Y-%m-%d"
    oldest = datetime.datetime.strptime(beginDate, dateFormat)
    current = datetime.datetime.strptime(endDate, dateFormat)
    oneDay = datetime.timedelta(days=1)

    dates = []
    while current >= oldest:
        dates.append(current.strftime(dateFormat))
        current -= oneDay
    return dates

def hiveDataCalc(runDay,roomIdList):
    """Fetch per-room message statistics for one day through the Hive CLI.

    Runs a query against oss_bi_all_message_send_log restricted to
    pt_day = runDay and to the given room ids, grouped by room.

    :param runDay: partition day to query, as a 'YYYY-MM-DD' string.
    :param roomIdList: list of room ids; rendered into the SQL ``in (...)``
        clause via ``str(roomIdList)[1:-1]``.
    :return: list of rows, each a list of string fields
        [roomid, mess_num, mess_user, mess_ip] as printed by hive
        (tab-separated). Empty when roomIdList is empty or hive prints
        nothing on stdout.
    """
    # An empty list would render as "in ()", which is not a valid query.
    if not roomIdList:
        return []

    hiveCommand = """/usr/lib/hive-current/bin/hive -e " \
            select \
                parms['roomId'] roomid,count(1) as mess_num, \
                count(distinct parms['uid']) as mess_user, \
                count(distinct parms['ip']) as mess_ip \
            from oss_bi_all_message_send_log \
            where pt_day='{runDay}' and parms['roomId'] in ({roomIdStr}) \
            group by parms['roomId'] \
                " """.format(runDay=runDay, roomIdStr=str(roomIdList)[1:-1])

    # Close the pipe explicitly instead of relying on garbage collection.
    pipe = os.popen(hiveCommand)
    try:
        outputLines = pipe.readlines()
    finally:
        pipe.close()

    messageSend_Data_list = []
    for outputLine in outputLines:
        outputLine = outputLine.replace('\n', '')
        # Skip blank lines: they would become [''] rows that cannot be
        # converted to integers by the caller.
        if outputLine:
            messageSend_Data_list.append(outputLine.split('\t'))

    return messageSend_Data_list

def _mysqlQueryRows(config, sql):
    """Run one statement through the mysql CLI and return its rows as lists of str."""
    # NOTE(review): the password is passed on the command line, where other
    # local users may be able to see it in the process list.
    command = (
        'mysql -h{host} -P{port} -u{user} -p{passwd} -N '
        '-e "set names utf8;use {db}; {sql} " '
    ).format(host=config['host'], port=config['port'], user=config['user'],
             passwd=config['passwd'], db=config['db'], sql=sql)

    # Close the pipe explicitly instead of relying on garbage collection.
    pipe = os.popen(command)
    try:
        outputLines = pipe.readlines()
    finally:
        pipe.close()

    # -N suppresses the header line, so every line is a tab-separated row.
    return [outputLine.replace('\n', '').split('\t') for outputLine in outputLines]


def _roomSql(sqlTemplate, runDay, roomId):
    """Fill a per-room SQL template; tabSuffix is the shard number roomId % 256."""
    return sqlTemplate.format(runDay=runDay, roomId=roomId, tabSuffix=roomId % 256)


def mysqlDataCalc(runDay,roomIdList):
    """Collect four per-room daily statistics from the source MySQL databases.

    One mysql CLI call is made per room and per statistic. Tables whose name
    ends in ``_{tabSuffix}`` are sharded by ``roomId % 256``.

    :param runDay: day to query as a 'YYYY-MM-DD' string; matched against
        ``substr(created_time,1,10)``.
    :param roomIdList: list of integer room ids.
    :return: tuple ``(onlineCount_DataList, giftPoint_DataList,
        roomSubscr_DataList, roomRank_DataList)``. Each element holds one
        entry per room, in roomIdList order; an entry is the list of rows
        (lists of string fields) that room's query printed, and is empty
        when the query returned nothing. Row layouts:

        * online count: [room_id, max(online_real_count)]
        * gift points:  [room_id, sum(point), count(distinct uid)]
        * subscribers:  [room_id, count(distinct uid)] (state=0 only)
        * room rank:    [room_id, min(rank), sum(duration)] (category_id=0)
    """
    onlineCountSql = (
        "select room_id, max(online_real_count) as online_real_count "
        "from room_online_stat_{tabSuffix} "
        "where substr(created_time,1,10)='{runDay}' and room_id ={roomId} "
        "group by room_id;"
    )
    giftPointSql = (
        "select room_id,sum(point) as gift_point, "
        "count(distinct uid) as point_user "
        "from room_point_detail_{tabSuffix} "
        "where substr(created_time,1,10)='{runDay}' and room_id ={roomId} "
        "group by room_id;"
    )
    roomSubscrSql = (
        "select room_id, count(distinct uid) cnt "
        "from room_subscriber_uid_{tabSuffix} "
        "where substr(created_time,1,10)='{runDay}' and state=0 and room_id ={roomId} "
        "group by room_id;"
    )
    # room_category_rank is a single table, so this template has no shard suffix.
    roomRankSql = (
        "select room_id,min(rank) rank,sum(duration) duration "
        "from room_category_rank "
        "where substr(created_time,1,10)='{runDay}' and category_id=0 and room_id ={roomId} "
        "group by room_id;"
    )

    onlineCount_DataList = [
        _mysqlQueryRows(srcMysqlConfig_jellyfish_stat, _roomSql(onlineCountSql, runDay, roomId))
        for roomId in roomIdList
    ]
    giftPoint_DataList = [
        _mysqlQueryRows(srcMysqlConfig_jellyfish_wealth, _roomSql(giftPointSql, runDay, roomId))
        for roomId in roomIdList
    ]
    roomSubscr_DataList = [
        _mysqlQueryRows(srcMysqlConfig_jellyfish_server, _roomSql(roomSubscrSql, runDay, roomId))
        for roomId in roomIdList
    ]
    roomRank_DataList = [
        _mysqlQueryRows(srcMysqlConfig_jellyfish_server, _roomSql(roomRankSql, runDay, roomId))
        for roomId in roomIdList
    ]

    return onlineCount_DataList,giftPoint_DataList,roomSubscr_DataList,roomRank_DataList

def _firstRowByRoomId(resultSets):
    """Index per-room MySQL result sets by int(room_id), keeping each set's first row."""
    return dict((int(rows[0][0]), rows[0]) for rows in resultSets if rows)


def _intFields(row, count):
    """Return fields 1..count of row as ints, or `count` zeros when row is None."""
    if row is None:
        return [0] * count
    return [int(field) for field in row[1:count + 1]]


def DataCom2mysql(runDay, roomIdList):
    """Rebuild one day's rows of report_room_live_detail_daily_stat.

    Steps, in order:

    1. delete the existing report rows for runDay and the given rooms;
    2. read the MySQL-side statistics with mysqlDataCalc;
    3. read the Hive-side message statistics with hiveDataCalc;
    4. left-join everything on room id (a room missing from a source gets
       zeros for that source's columns) and insert one row per entry of
       roomIdList, in batches of 1000 rows per mysql call.

    view_user and view_duration are always written as 0.

    :param runDay: day to process, as a 'YYYY-MM-DD' string.
    :param roomIdList: list of integer room ids; nothing is done when empty.
    :return: None.
    """
    # An empty list would render as "in()" in the delete statement, which is
    # invalid SQL, and there would be no rows to insert either.
    if not roomIdList:
        return

    batchSize = 1000
    config = srcMysqlConfig_jellyfish_hadoop_stat
    # Shared "<connection options> -e "<session setup>" prefix; the caller
    # appends the statement and the closing double quote.
    # NOTE(review): the password is passed on the command line, where other
    # local users may be able to see it in the process list.
    mysqlArgs = '-h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; '.format(
        host=config['host'], port=config['port'], user=config['user'],
        passwd=config['passwd'], db=config['db'])

    # The backticks around `date` are backslash-escaped so the shell does not
    # treat them as command substitution inside the double-quoted argument.
    deleteSql = (
        "delete from report_room_live_detail_daily_stat "
        "where \\`date\\`='{runDay}' and room_id in({roomIdStr}); "
    ).format(runDay=runDay, roomIdStr=str(roomIdList)[1:-1])
    os.system('source /etc/profile; mysql ' + mysqlArgs + deleteSql + '" ')

    onlineCount_DataList, giftPoint_DataList, roomSubscr_DataList, roomRank_DataList = mysqlDataCalc(runDay, roomIdList)
    onlineByRoom = _firstRowByRoomId(onlineCount_DataList)
    giftByRoom = _firstRowByRoomId(giftPoint_DataList)
    subscrByRoom = _firstRowByRoomId(roomSubscr_DataList)
    rankByRoom = _firstRowByRoomId(roomRank_DataList)

    # Hive rows are flat: [roomid, mess_num, mess_user, mess_ip]. Rows with
    # fewer fields (e.g. a blank output line) are ignored.
    messageByRoom = {}
    for messageSend in hiveDataCalc(runDay, roomIdList):
        if len(messageSend) >= 4:
            messageByRoom[int(messageSend[0])] = messageSend

    valueTuples = []
    for roomId in roomIdList:
        roomKey = int(roomId)
        barrageFields = _intFields(messageByRoom.get(roomKey), 3)
        viewPeakUser, = _intFields(onlineByRoom.get(roomKey), 1)
        giftNum, giftUser = _intFields(giftByRoom.get(roomKey), 2)
        subscriberCount, = _intFields(subscrByRoom.get(roomKey), 1)
        rank, anchorDuration = _intFields(rankByRoom.get(roomKey), 2)

        # Field order must match the column list in insertHead below.
        fields = [runDay, roomId] + barrageFields + [
            0, 0, viewPeakUser, giftNum, giftUser,
            anchorDuration, subscriberCount, rank,
        ]
        valueTuples.append("(" + ",".join("'{0}'".format(field) for field in fields) + ")")

    insertHead = (
        '/usr/bin/mysql ' + mysqlArgs
        + 'insert into report_room_live_detail_daily_stat('
        '\\`date\\`,room_id,barrage_count,barrage_user_count,barrage_ip_count,'
        'view_user,view_duration,view_peak_user,gift_num,gift_user,'
        'anchor_duration,subscriber_count,rank) values '
    )
    # Slicing never yields an empty batch, so no "values ;" statement is sent.
    for start in range(0, len(valueTuples), batchSize):
        batch = valueTuples[start:start + batchSize]
        os.system(insertHead + ",".join(batch) + ' ;" ')

# run parallel Batch
# One work item per day: DataCom2mysql(runDay, roomIdList) is queued for every
# day of the range, newest day first, and executed by a pool of worker threads.
roomIdList=[19467,2469625,3755657,96969,95277]
# Each item is a (positional_args, keyword_args) tuple for threadpool.makeRequests.
runDayRoomIdList=[
    ([runDay, roomIdList], None)
    for runDay in dateRange(beginDate='2015-11-27', endDate='2018-04-11')
]
# print runDayRoomIdList

now_time=time.strftime('%Y-%m-%d %X', time.localtime())
# Single-argument print call: same output under Python 2 and Python 3.
print("当前时间是:" + " " + now_time)
requests=[]
request_DataCom2mysql_batchCtl=threadpool.makeRequests(DataCom2mysql, runDayRoomIdList)
requests.extend(request_DataCom2mysql_batchCtl)
main_pool=threadpool.ThreadPool(38)
for req in requests:
    main_pool.putRequest(req)

if __name__ == '__main__':
    # Poll every 30 seconds until the pool reports that no results are pending.
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break

    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()

now_time=time.strftime('%Y-%m-%d %X', time.localtime())
print("当前时间是:" + " " + now_time)

# run serial Batch
# roomIdList=[19467,2469625,3755657,96969,95277]
# for runDay in dateRange(beginDate='2016-10-01', endDate='2016-10-08'):
#     DataCom2mysql(runDay=runDay, roomIdList=roomIdList)

猜你喜欢

转载自blog.csdn.net/BabyFish13/article/details/79959764
今日推荐