适用于需要跑多日数据及处理临时数据需求的场景;HQL代码及目录路径可以随时按需更换;为提高效率,采用了并行的方式。
/Users/nisj/PycharmProjects/BiDataProc/love/HiveRunData2LocalFile.py
跑出的多个数据可以考虑压缩后再传输,提高效率。
/Users/nisj/PycharmProjects/BiDataProc/love/HiveRunData2LocalFile.py
# -*- coding=utf-8 -*-
"""Export a daily Hive retention/payment report to one local text file per day.

Each day is exported by shelling out to the Hive CLI; the batch-control code
at the bottom of this file runs many days in parallel.
"""
import os
import datetime
import warnings
import time
import threadpool  # third-party: pip install threadpool, or copy threadpool.py next to this script

warnings.filterwarnings("ignore")


def dateRange(beginDate, endDate):
    """Return every date from beginDate to endDate, inclusive, as 'YYYY-MM-DD' strings.

    Both arguments must be 'YYYY-MM-DD' strings (ValueError otherwise).
    Returns an empty list when beginDate is later than endDate.
    """
    dates = []
    dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    end = datetime.datetime.strptime(endDate, "%Y-%m-%d")
    one_day = datetime.timedelta(1)
    while dt <= end:
        dates.append(dt.strftime("%Y-%m-%d"))
        dt += one_day
    return dates


def hiveRunData2localFile(pt_day):
    """Run the day-0 retention/payment report for pt_day and dump it locally.

    pt_day -- 'YYYY-MM-DD' partition day. Payment data is aggregated over the
    7-day window [pt_day, pt_day + 6 days]. Output is redirected to
    /home/hadoop/nisj/love/hive2Local/<pt_day>.txt.

    NOTE(review): the HQL is interpolated into a shell command via os.system;
    pt_day is only ever a dateRange-produced date here, but never pass
    untrusted input. The year 2018 in the age expression is hard-coded in
    the HQL.
    """
    # End of the 7-day payment window: pt_day + 6 days.
    pt_day7after = (datetime.datetime.strptime(pt_day, "%Y-%m-%d")
                    + datetime.timedelta(6)).strftime('%Y-%m-%d')
    os.system("""/usr/lib/hive-current/bin/hive -e " \
with tab_pay_info as( \
select '{pt_day}' runday,x.uid,sum(x.pay_amount) total_pay_amount,max(x.pay_amount) max_pay_amount \
from (select pt_day,uid,amount pay_amount from oss_bi_all_chushou_pay_info where pt_day between '{pt_day}' and '{pt_day7after}' and state=0 \
union all \
select pt_day,uid,amount pay_amount from oss_bi_all_open_point_record where pt_day between '{pt_day}' and '{pt_day7after}' and state=0 and (key='alipay_cp' or key='tmall_pay') \
) x \
group by x.uid \
), \
tab_all_device as( \
select pt_day,parms['uid'] uid,max(device_info['device_model']) device_model,max(parms['ip']) ip \
from oss_bi_all_device_log \
where pt_day='{pt_day}' \
group by pt_day,parms['uid']) \
select '{pt_day}' pt_day,a1.appkey,a1.appsource,a1.uid,a2.device_model,a2.ip, \
substr(a3.id_card_num,7,8) birthday,2018-substr(a3.id_card_num,7,4) age,case when substr(a3.id_card_num,-2,1)%2=1 then '男' when substr(a3.id_card_num,-2,1)%2=0 then '女' end sex, \
case when a4.uid is null then 0 else 1 end is_pay,a4.max_pay_amount,a4.total_pay_amount \
from oss_bi_type_of_retention_user a1 \
left join tab_all_device a2 on a1.uid=a2.uid and a1.pt_day=a2.pt_day \
left join xxx_user_id_card a3 on a1.uid=a3.uid \
left join tab_pay_info a4 on a1.uid=a4.uid and a1.pt_day=a4.runday \
where a1.pt_day='{pt_day}' and a1.remain_type=1 and a1.remain_day_num=0 \
; \
">/home/hadoop/nisj/love/hive2Local/{pt_day}.txt """.format(pt_day=pt_day, pt_day7after=pt_day7after))
serial Batch # for ptDay in dateRange(beginDate='2018-03-01', endDate='2018-03-01'): # print ptDay # hiveRunData2localFile(pt_day=ptDay) # run parallel Batch now_time = time.strftime('%Y-%m-%d %X', time.localtime()) print "当前时间是:",now_time runDay_list = dateRange(beginDate='2018-03-01', endDate='2018-03-31') requests = [] request_hiveRunData2localFile_batchCtl = threadpool.makeRequests(hiveRunData2localFile, runDay_list) requests.extend(request_hiveRunData2localFile_batchCtl) main_pool = threadpool.ThreadPool(16) [main_pool.putRequest(req) for req in requests] if __name__ == '__main__': while True: try: time.sleep(30) main_pool.poll() except KeyboardInterrupt: print("**** Interrupted!") break except threadpool.NoResultsPending: break if main_pool.dismissedWorkers: print("Joining all dismissed worker threads...") main_pool.joinAllDismissedWorkers() now_time = time.strftime('%Y-%m-%d %X', time.localtime()) print "当前时间是:",now_time用并行跑批的时候,要么安装threadpool模块,要么将threadpool代码脚本拷到跑批目录下。
跑出的多个数据可以考虑压缩后再传输,提高效率。
# Package the per-day export files and pull them to the local machine.
cd love/
cd hive2Local/
# Bundle all 2018-03 daily export files (2018*.txt) into a single archive —
# compressing before transfer, as the notes above suggest.
zip 201803.zip 2018*
# Send the archive back through the terminal (sz — presumably lrzsz's
# ZMODEM transfer; requires a client that supports it — TODO confirm).
sz 201803.zip