Python批量跑Hive数据到本地目录

适用于需要跑多日数据及临时数据需求的时候；Hql代码及目录路径可以随时按需更换；为提高效率采用了并行的方式。
/Users/nisj/PycharmProjects/BiDataProc/love/HiveRunData2LocalFile.py

# -*- coding=utf-8 -*-
import os
import datetime
import warnings
import time
import threadpool

warnings.filterwarnings("ignore")

def dateRange(beginDate, endDate):
    dates = []
    dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    date = beginDate[:]
    while date <= endDate:
        dates.append(date)
        dt = dt + datetime.timedelta(1)
        date = dt.strftime("%Y-%m-%d")
    return dates

def hiveRunData2localFile(pt_day):
    os.system("""/usr/lib/hive-current/bin/hive -e " \
            with tab_pay_info as( \
            select '{pt_day}' runday,x.uid,sum(x.pay_amount) total_pay_amount,max(x.pay_amount) max_pay_amount \
            from (select pt_day,uid,amount pay_amount from oss_bi_all_chushou_pay_info where pt_day between '{pt_day}' and '{pt_day7after}' and state=0 \
            union all \
            select pt_day,uid,amount pay_amount from oss_bi_all_open_point_record where pt_day between '{pt_day}' and '{pt_day7after}' and state=0 and (key='alipay_cp' or key='tmall_pay') \
            ) x \
            group by x.uid \
            ), \
            tab_all_device as( \
            select pt_day,parms['uid'] uid,max(device_info['device_model']) device_model,max(parms['ip']) ip \
            from oss_bi_all_device_log \
            where pt_day='{pt_day}' \
            group by pt_day,parms['uid']) \
            select '{pt_day}' pt_day,a1.appkey,a1.appsource,a1.uid,a2.device_model,a2.ip, \
            substr(a3.id_card_num,7,8) birthday,2018-substr(a3.id_card_num,7,4) age,case when substr(a3.id_card_num,-2,1)%2=1 then '男' when substr(a3.id_card_num,-2,1)%2=0 then '女' end sex, \
            case when a4.uid is null then 0 else 1 end is_pay,a4.max_pay_amount,a4.total_pay_amount \
            from oss_bi_type_of_retention_user a1 \
            left join tab_all_device a2 on a1.uid=a2.uid and a1.pt_day=a2.pt_day \
            left join xxx_user_id_card a3 on a1.uid=a3.uid \
            left join tab_pay_info a4 on a1.uid=a4.uid and a1.pt_day=a4.runday \
            where a1.pt_day='{pt_day}' and a1.remain_type=1 and a1.remain_day_num=0 \
            ; \
            ">/home/hadoop/nisj/love/hive2Local/{pt_day}.txt """.format(pt_day=pt_day, pt_day7after=(datetime.datetime.strptime(pt_day, "%Y-%m-%d") + datetime.timedelta(6)).strftime('%Y-%m-%d')));

# # run serial Batch
# for ptDay in dateRange(beginDate='2018-03-01', endDate='2018-03-01'):
#     print ptDay
#     hiveRunData2localFile(pt_day=ptDay)

# run parallel Batch
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是：",now_time

runDay_list = dateRange(beginDate='2018-03-01', endDate='2018-03-31')
requests = []
request_hiveRunData2localFile_batchCtl = threadpool.makeRequests(hiveRunData2localFile, runDay_list)
requests.extend(request_hiveRunData2localFile_batchCtl)
main_pool = threadpool.ThreadPool(16)
[main_pool.putRequest(req) for req in requests]

if __name__ == '__main__':
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break

    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()

now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是：",now_time

用并行跑批的时候，要么安装threadpool模块，要么将threadpool代码脚本拷到跑批目录下。
跟出的多个数据可以考虑压缩后再传输，提高效率。

cd love/
cd hive2Local/
zip 201803.zip 2018*
sz 201803.zip

Python批量跑Hive数据到本地目录

猜你喜欢