Using Python to fetch MySQL and Hive data, combine it, and insert the result into a MySQL database

This code fetches the required data from several MySQL data sources and from a Hive data source, merges the results in Python, and then inserts them into a MySQL database.
These MySQL tables also exist in the Hive warehouse, but because only a small part of the Hive data is needed, querying it out of the huge volume of Hive data is extremely slow, whereas querying that small part directly in MySQL is much faster. This is why the handling of data sources in the code is relatively complicated.
The whole result could be produced by a single query script in Hive. Alternatively, the data from each source could be fetched separately, loaded into MySQL temporary tables, and then joined into the MySQL target table; driven by a Python script, the latter is the faster of the two. In this article, however, all of the data is processed in Python memory: this improves the running speed, but the script is tedious to write and is not very reusable.
Learning points:
1. How to left-join several query results in Python
2. Batch insertion of data into MySQL
3. Running a multi-parameter batch in parallel in Python

/Users/nisj/PycharmProjects/BiDataProc/love/moreSrcDataCalc.py
# -*- coding=utf-8 -*-
import datetime
import os
import re
import time
import warnings

import threadpool

# Suppress every warning for the whole run.
warnings.filterwarnings("ignore")

# Connection settings, one dict per MySQL instance. The host, user and
# password values are placeholders; only the port and the schema name differ
# between instances.

# Target instance: DataCom2mysql deletes from and inserts into this schema.
srcMysqlConfig_jellyfish_hadoop_stat = dict(
    host='MysqlHost',
    user='MysqlUser',
    passwd='MysqlPass',
    port=6605,
    db='jellyfish_hadoop_stat',
)

# Source instance read by mysqlDataCalc for the room_online_stat_* tables.
srcMysqlConfig_jellyfish_stat = dict(
    host='MysqlHost',
    user='MysqlUser',
    passwd='MysqlPass',
    port=50515,
    db='jellyfish_stat',
)

# Source instance read by mysqlDataCalc for the room_point_detail_* tables.
srcMysqlConfig_jellyfish_wealth = dict(
    host='MysqlHost',
    user='MysqlUser',
    passwd='MysqlPass',
    port=10019,
    db='jellyfish_wealth',
)

# Source instance read by mysqlDataCalc for the room_subscriber_uid_* tables
# and for room_category_rank.
srcMysqlConfig_jellyfish_server = dict(
    host='MysqlHost',
    user='MysqlUser',
    passwd='MysqlPass',
    port=50506,
    db='jellyfish_server',
)

def dateRange(beginDate, endDate):
    """Return every day from endDate back to beginDate, newest first.

    Both bounds are inclusive and are given as 'YYYY-MM-DD' strings. The
    bounds are compared as parsed dates rather than as strings, so inputs
    that are not zero-padded (e.g. '2018-4-9') still give the right range.

    :param beginDate: earliest day to include, 'YYYY-MM-DD'.
    :param endDate: latest day to include, 'YYYY-MM-DD'.
    :returns: list of zero-padded 'YYYY-MM-DD' strings in descending order;
        empty when beginDate is later than endDate.
    :raises ValueError: if either bound does not parse as a date.
    """
    dayFormat = "%Y-%m-%d"
    firstDay = datetime.datetime.strptime(beginDate, dayFormat)
    currentDay = datetime.datetime.strptime(endDate, dayFormat)
    oneDay = datetime.timedelta(days=1)

    dates = []
    while currentDay >= firstDay:
        dates.append(currentDay.strftime(dayFormat))
        currentDay -= oneDay
    return dates

def hiveDataCalc(runDay, roomIdList):
    """Fetch per-room message statistics for one day through the hive CLI.

    Runs a single Hive query against oss_bi_all_message_send_log for the
    partition pt_day=runDay, restricted to the given rooms, and parses the
    tab-separated output.

    :param runDay: day to query, as the 'YYYY-MM-DD' partition value.
    :param roomIdList: room ids (ints or integer-like strings).
    :returns: list of rows, each a list of strings
        [roomid, mess_num, mess_user, mess_ip]; empty when roomIdList is
        empty or the query prints nothing.
    :raises ValueError: if a room id is not an integer.
    """
    # An empty id list would render as "in ()", which is invalid SQL.
    if not roomIdList:
        return []

    # Rendering each id through int() keeps arbitrary text out of the shell
    # command and avoids list-repr artefacts.
    roomIdStr = ','.join(str(int(roomId)) for roomId in roomIdList)

    command = """/usr/lib/hive-current/bin/hive -e " \
            select \
                parms['roomId'] roomid,count(1) as mess_num, \
                count(distinct parms['uid']) as mess_user, \
                count(distinct parms['ip']) as mess_ip \
            from oss_bi_all_message_send_log \
            where pt_day='{runDay}' and parms['roomId'] in ({roomIdStr}) \
            group by parms['roomId'] \
                " """.format(runDay=runDay, roomIdStr=roomIdStr)

    # Close the pipe deterministically instead of leaking it.
    with os.popen(command) as pipe:
        outputLines = pipe.readlines()

    # Blank lines would otherwise become [''] rows that callers cannot
    # convert with int().
    return [line.rstrip('\n').split('\t') for line in outputLines if line.strip()]

def mysqlDataCalc(runDay,roomIdList):
    onlineCount_DataList = []
    for roomId in roomIdList:
        onlineCount_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \
                        select room_id, max(online_real_count) as online_real_count \
                        from room_online_stat_{tabSuffix} \
                        where substr(created_time,1,10)='{runDay}' and room_id ={roomId} \
                        group by room_id \
                        ; \
                        " """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256),
                                     host=srcMysqlConfig_jellyfish_stat['host'],
                                     port=srcMysqlConfig_jellyfish_stat['port'],
                                     user=srcMysqlConfig_jellyfish_stat['user'],
                                     passwd=srcMysqlConfig_jellyfish_stat['passwd'],
                                     db=srcMysqlConfig_jellyfish_stat['db'])).readlines();

        onlineCount_Data_list = []
        for oc_list in onlineCount_Data:
            oc = re.split('\t', oc_list.replace('\n', ''))
            onlineCount_Data_list.append(oc)
        onlineCount_DataList.append(onlineCount_Data_list)
        # print onlineCount_DataList

    giftPoint_DataList = []
    for roomId in roomIdList:
        giftPoint_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \
                    select room_id,sum(point) as gift_point, \
                    count(distinct uid) as point_user \
                    from room_point_detail_{tabSuffix} \
                    where substr(created_time,1,10)='{runDay}' and room_id ={roomId} \
                    group by room_id \
                    ; \
                    " """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256),
                                 host=srcMysqlConfig_jellyfish_wealth['host'], port=srcMysqlConfig_jellyfish_wealth['port'],
                                 user=srcMysqlConfig_jellyfish_wealth['user'],
                                 passwd=srcMysqlConfig_jellyfish_wealth['passwd'],
                                 db=srcMysqlConfig_jellyfish_wealth['db'])).readlines();

        giftPoint_Data_list = []
        for gp_list in giftPoint_Data:
            gp = re.split('\t', gp_list.replace('\n', ''))
            giftPoint_Data_list.append(gp)
        giftPoint_DataList.append(giftPoint_Data_list)
    # print giftPoint_DataList


    roomSubscr_DataList = []
    for roomId in roomIdList:
        roomSubscr_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \
                    select room_id, count(distinct uid) cnt
                    from room_subscriber_uid_{tabSuffix}
                    where substr(created_time,1,10)='{runDay}' and state=0 and room_id ={roomId}
                    group by room_id; \
                    " """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256),
                                 host=srcMysqlConfig_jellyfish_server['host'], port=srcMysqlConfig_jellyfish_server['port'],
                                 user=srcMysqlConfig_jellyfish_server['user'],
                                 passwd=srcMysqlConfig_jellyfish_server['passwd'],
                                 db=srcMysqlConfig_jellyfish_server['db'])).readlines();

        roomSubscr_Data_list = []
        for rs_list in roomSubscr_Data:
            rs = re.split('\t', rs_list.replace('\n', ''))
            roomSubscr_Data_list.append(rs)
        roomSubscr_DataList.append(roomSubscr_Data_list)
    # print roomSubscr_DataList

    roomRank_DataList = []
    for roomId in roomIdList:
        roomRank_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \
                select room_id,min(rank) rank,sum(duration) duration
                from room_category_rank
                where substr(created_time,1,10)='{runDay}' and category_id=0 and room_id ={roomId}
                group by room_id; \
                    " """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256),
                                 host=srcMysqlConfig_jellyfish_server['host'], port=srcMysqlConfig_jellyfish_server['port'],
                                 user=srcMysqlConfig_jellyfish_server['user'],
                                 passwd=srcMysqlConfig_jellyfish_server['passwd'],
                                 db=srcMysqlConfig_jellyfish_server['db'])).readlines();

        roomRank_Data_list = []
        for rr_list in roomRank_Data:
            rr = re.split('\t', rr_list.replace('\n', ''))
            roomRank_Data_list.append(rr)
        roomRank_DataList.append(roomRank_Data_list)
    # print roomRank_DataList

    return onlineCount_DataList,giftPoint_DataList,roomSubscr_DataList,roomRank_DataList

def _cliInt(field):
    """Convert one CLI output field to int, mapping '' and 'NULL' to 0."""
    text = str(field).strip()
    if text == '' or text.upper() == 'NULL':
        return 0
    return int(text)


def _metricsByRoom(perRoomResults, metricCount):
    """Index query results by room id.

    :param perRoomResults: list with one entry per query; each entry is the
        list of rows that query printed, a row being [room_id, metric, ...].
        Only the first row of each entry is used.
    :param metricCount: number of metric fields that follow room_id.
    :returns: dict mapping int room id -> list of metricCount ints.
    """
    indexed = {}
    for rows in perRoomResults:
        if not rows:
            continue
        firstRow = rows[0]
        # A blank or truncated output line cannot supply every metric.
        if len(firstRow) <= metricCount:
            continue
        indexed[int(firstRow[0])] = [_cliInt(field) for field in firstRow[1:metricCount + 1]]
    return indexed


def DataCom2mysql(runDay, roomIdList):
    """Rebuild one day of report_room_live_detail_daily_stat for some rooms.

    Gathers the MySQL metrics (mysqlDataCalc) and the Hive message metrics
    (hiveDataCalc), left-joins them on room id with 0 for any metric a room
    has no data for, deletes the rooms' existing rows for runDay from the
    target table, and inserts the fresh rows in batches of 1000.

    The delete runs only after all source data has been gathered, so a
    failure while fetching does not leave the day's rows wiped. Exit statuses
    of the mysql commands are not checked. Duplicate room ids are processed
    once.

    :param runDay: day to process, 'YYYY-MM-DD'.
    :param roomIdList: room ids (ints or integer-like strings).
    :returns: None.
    :raises ValueError: if a room id or a returned room_id is not an integer.
    """
    # Nothing to do; also avoids the invalid "in()" / "values ;" statements.
    if not roomIdList:
        return

    # Normalised, de-duplicated ids in their original order.
    roomIds = []
    seenIds = set()
    for roomId in roomIdList:
        roomKey = int(roomId)
        if roomKey not in seenIds:
            seenIds.add(roomKey)
            roomIds.append(roomKey)

    onlineCount_DataList, giftPoint_DataList, roomSubscr_DataList, roomRank_DataList = mysqlDataCalc(runDay, roomIdList)
    onlineByRoom = _metricsByRoom(onlineCount_DataList, 1)
    giftByRoom = _metricsByRoom(giftPoint_DataList, 2)
    subscrByRoom = _metricsByRoom(roomSubscr_DataList, 1)
    rankByRoom = _metricsByRoom(roomRank_DataList, 2)

    # Hive rows arrive as one flat list; wrap each row so it has the same
    # shape as a per-room MySQL result.
    messageByRoom = _metricsByRoom([[row] for row in hiveDataCalc(runDay, roomIdList)], 3)

    # Left join on room id via dict lookups; missing metrics default to 0.
    Sum_Data_list = []
    for roomKey in roomIds:
        message = messageByRoom.get(roomKey, [0, 0, 0])
        online = onlineByRoom.get(roomKey, [0])
        gift = giftByRoom.get(roomKey, [0, 0])
        subscr = subscrByRoom.get(roomKey, [0])
        rank = rankByRoom.get(roomKey, [0, 0])  # [min rank, summed duration]
        Sum_Data_list.append([
            roomKey,        # room_id
            message[0],     # barrage_count
            message[1],     # barrage_user_count
            message[2],     # barrage_ip_count
            0,              # view_user (always written as 0)
            0,              # view_duration (always written as 0)
            online[0],      # view_peak_user
            gift[0],        # gift_num
            gift[1],        # gift_user
            rank[1],        # anchor_duration
            subscr[0],      # subscriber_count
            rank[0],        # rank
        ])

    target = srcMysqlConfig_jellyfish_hadoop_stat
    # Connection part formatted on its own so braces in a credential cannot
    # be mistaken for placeholders in the SQL part.
    cliArgs = '-h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; '.format(
        host=target['host'], port=target['port'], user=target['user'],
        passwd=target['passwd'], db=target['db'])

    # Backticks are backslash-escaped because the SQL sits inside a
    # double-quoted shell argument.
    deleteSql = "delete from report_room_live_detail_daily_stat where \\`date\\`='{runDay}' and room_id in({roomIdStr}); ".format(
        runDay=runDay, roomIdStr=','.join(str(roomKey) for roomKey in roomIds))
    os.system('source /etc/profile; mysql ' + cliArgs + deleteSql + '" ')

    # `rank` is quoted as well: it is a reserved word from MySQL 8.0.2 on.
    insertHead = ('/usr/bin/mysql ' + cliArgs +
                  'insert into report_room_live_detail_daily_stat('
                  '\\`date\\`,room_id,barrage_count,barrage_user_count,barrage_ip_count,'
                  'view_user,view_duration,view_peak_user,gift_num,gift_user,'
                  'anchor_duration,subscriber_count,\\`rank\\`) values ')

    batchSize = 1000
    for start in range(0, len(Sum_Data_list), batchSize):
        valueTuples = []
        for sumd in Sum_Data_list[start:start + batchSize]:
            quoted = ','.join("'{0}'".format(value) for value in sumd)
            valueTuples.append("('{0}',{1})".format(runDay, quoted))
        # One multi-row insert per batch; never issued with zero rows.
        os.system(insertHead + ','.join(valueTuples) + ' ;" ')

# run parallel Batch
roomIdList=[19467,2469625,3755657,96969,95277]
runDayRoomIdList = []
for runDay in dateRange (beginDate = '2015-11-27', endDate = '2018-04-11'):
    runDayRoomIdList.append (([runDay, roomIdList], None))
# print runDayRoomIdList

now_time=time.strftime('%Y-%m-%d %X', time.localtime())
print "The current time is: ",now_time
requests=[]
request_DataCom2mysql_batchCtl=threadpool.makeRequests(DataCom2mysql, runDayRoomIdList)
requests.extend(request_DataCom2mysql_batchCtl)
main_pool=threadpool.ThreadPool(38)
[main_pool.putRequest(req) for req in requests]

if __name__ == '__main__':
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break

    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()

now_time=time.strftime('%Y-%m-%d %X', time.localtime())
print "The current time is: ",now_time

# run serial Batch
# roomIdList=[19467,2469625,3755657,96969,95277]
# for runDay in dateRange (beginDate = '2016-10-01', endDate = '2016-10-08'):
# DataCom2mysql (runDay = runDay, roomIdList = roomIdList)

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324607230&siteId=291194637