datax抽取mysql数据(python脚本自动生成hive建表语句以及json文件)

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/zuochang_liu/article/details/85335517

 该脚本基于 Python 2.7.15 编写,因此使用的是 MySQLdb 模块;如果使用 Python 3 及以上版本,则需要改为安装 pymysql 模块,并对连接数据库的代码(以及 print 语句、reload(sys) 等 Python 2 专有写法)做相应的修改。

# coding=utf-8
_author_ = 'liuzc'
import MySQLdb
import re
import sys
# Python 2 idiom: reload(sys) makes sys.setdefaultencoding callable again so
# the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# MySQL connection settings. They are passed to MySQLdb.connect() in
# connectDB(); host, port, user, passwd and db are also embedded in the
# generated DataX job by createDataxJson().
host = '10.10.10.10'
port = 3333
user = 'aa'
passwd = 'aa'
# Database (schema) name; queryData() also uses it as the table_schema filter.
db = 'aa'
charset = 'utf8'
# Table whose column metadata is read and for which the output is generated.
mysqlTableName = "user_info"
# File that the generated output is appended to
dataxDir = "D:\\datax.txt"

def connectDB():
    """
    Open a MySQL connection from the module-level connection settings.
    :return: the connection object returned by MySQLdb.connect
    """
    settings = dict(
        host=host,
        port=port,
        user=user,
        passwd=passwd,
        db=db,
        charset=charset,
    )
    return MySQLdb.connect(**settings)

def queryData():
    """
    Read the column metadata of ``mysqlTableName`` and generate the outputs.

    Queries INFORMATION_SCHEMA.Columns for the name, comment and data type of
    every column of ``db``.``mysqlTableName``, then passes the collected
    fragments to createTable() (Hive DDL) and createDataxJson() (DataX job).
    If the metadata query fails or returns no rows, a message is printed and
    no output is generated.
    :return: None
    """
    # Values are bound by the driver instead of being formatted into the SQL
    # text; ORDER BY makes the generated column order deterministic.
    sql = ("SELECT column_name,column_comment,data_type "
           "FROM INFORMATION_SCHEMA.Columns "
           "WHERE table_name=%s AND table_schema=%s "
           "ORDER BY ordinal_position")

    con = connectDB()
    try:
        cursor = con.cursor()
        try:
            cursor.execute(sql, (mysqlTableName, db))
            results = cursor.fetchall()
        except MySQLdb.Error as e:
            # Report the real cause and stop: generating DDL/JSON from an
            # empty column list would only produce misleading output.
            print("Error: unable to fetch data: %s" % e)
            return
        finally:
            cursor.close()
    finally:
        # The connection is only needed for the metadata query, so release it
        # even when an error occurs.
        con.close()

    if not results:
        print("Error: no columns found for table %s.%s" % (db, mysqlTableName))
        return

    # The Hive table starts with an extra etl_time column.
    create_table_list = ["etl_time timestamp"]
    datax_json_list = []
    datax_writer_json_list = []
    for source_name, column_comment, data_type in results:
        # Hive / writer side uses the snake_case form of the column name.
        column_name = hump2underline(source_name)
        hive_type = getDataType(data_type)
        create_table_list.append("%s %s comment '%s'" % (column_name, hive_type, column_comment))
        # The reader selects from MySQL, so it needs the real column name;
        # double quotes make the entry a valid JSON string.
        datax_json_list.append('"' + source_name + '"')
        datax_writer_json_list.append("                         { \n                            \"name\":\"%s\",\n                            \"type\":\"%s\"\n                          }"%(column_name,hive_type))

    # Generate the Hive CREATE TABLE statement
    createTable(mysqlTableName, create_table_list)
    # Generate the DataX job JSON
    createDataxJson(mysqlTableName, datax_json_list, datax_writer_json_list)

def hump2underline(str):
    """
    Convert a camelCase name to lower-case underscore-separated form.
    :param str: the name to convert
    :return: the converted, lower-cased name
    """
    # A boundary is a lower-case letter or digit directly followed by an
    # upper-case letter; an underscore is inserted between the two.
    boundary = re.compile(r'([a-z]|\d)([A-Z])')
    separated = boundary.sub(lambda m: m.group(1) + '_' + m.group(2), str)
    return separated.lower()

# MySQL data types that must be renamed for Hive. Types that are not listed
# (e.g. bigint, int) are passed through unchanged by getDataType().
_MYSQL_TO_HIVE_TYPES = {
    "varchar": "string",
    "datetime": "timestamp",
    "date": "string",
    "smallint": "int",
    "varbinary": "string",
    # MySQL types that have no Hive type of the same name (char is listed
    # because INFORMATION_SCHEMA's data_type carries no length).
    "char": "string",
    "tinytext": "string",
    "text": "string",
    "mediumtext": "string",
    "longtext": "string",
    "enum": "string",
    "set": "string",
    "json": "string",
    "time": "string",
    "mediumint": "int",
}


def getDataType(str):
    """
    Map a MySQL data type name to the Hive data type used in the output.
    :param str: MySQL data type name (matched case-insensitively)
    :return: the mapped Hive type, or ``str`` unchanged when no mapping exists
    """
    return _MYSQL_TO_HIVE_TYPES.get(str.lower(), str)


def createTable(mysqlTableName,create_table_list):
    """
    Build the Hive CREATE TABLE statement and append it to ``dataxDir``.

    The table is named ``<mysqlTableName>_df``; ``create_date`` and
    ``modify_date`` string columns are added after the given columns.
    :param mysqlTableName: name of the source MySQL table
    :param create_table_list: column definition strings, one per column;
        the list is not modified
    :return: None
    """
    # Work on a copy so the caller's list is left untouched.
    columns = list(create_table_list)
    columns.append("create_date string")
    columns.append("modify_date string")
    hive_str1 = "use wedw_dwd; \n-- drop table if EXISTS %s_df;\n" % mysqlTableName
    hive_str2 = "CREATE TABLE if not EXISTS %s_df \n(\n" % mysqlTableName
    hive_str3 = ',\n'.join(columns)
    hive_str4 = "\n)\nPARTITIONED BY (\ndate_id string COMMENT '数据加工时间'\n)\nrow format delimited fields terminated by '\\t'\nSTORED AS rcfile\n;"
    createTableStr = hive_str1 + hive_str2 + hive_str3 + hive_str4
    # The with-block closes the file even if a write fails.
    with open(dataxDir, 'a') as f:
        f.write("==================================================createTable============================================================== \r\n")
        f.write(createTableStr + "\r\n")




def createDataxJson(mysqlTableName,datax_json_list,datax_writer_json_list):
    """
    Build the DataX job JSON (mysqlreader -> hdfswriter) and append it to
    ``dataxDir``.

    The reader section uses the module-level ``user``, ``passwd``, ``host``,
    ``port`` and ``db`` settings.
    :param mysqlTableName: name of the source MySQL table
    :param datax_json_list: already-quoted column names for the reader's
        "column" array
    :param datax_writer_json_list: pre-formatted JSON objects for the
        writer's "column" array
    :return: None
    """
    json_str1 = "{\n   \"job\":{\n       \"settings\":{\n           \"speed\":{\n               \"channel\":3\n           },\n"
    json_str2 = "           \"errorLimit\":{\n                  \"record\":0,\n                  \"percentage\":0.02 \n           }\n       },\n"
    json_str3 = "      \"content\":[   \n          { \n              \"reader\":{\n                  \"name\":\"mysqlreader\",\n                  \"parameter\":{\n"
    json_str4 = "                      \"username\":\"%s\",\n                      \"password\":\"%s\",\n                      \"column\":[\n                               %s \n                                       ],\n" %(user,passwd,',\n                               '.join(datax_json_list))
    json_str5 = "                      \"where\":\"gmt_created>='$bizdate'and gmt_created<DATE_ADD('$bizdate',INTERVAL 1 DAY)\",\n"
    json_str6 = "                      \"splitPk\":\"id\",\n                      \"connection\":[\n                           { \n                              \"table\":[\n"
    json_str7 = "                                   \"%s\"\n                              ],\n" % mysqlTableName
    json_str8 = "                              \"jdbcUrl\":[\n                                       \"jdbc:mysql://%s:%s/%s\"\n                              ]\n" %(host,port,db)
    json_str9 = "                         }\n                     ]\n                   }\n             },\n"
    json_str10 = "             \"writer\":{\n                  \"name\":\"hdfswriter\",\n                  \"parameter\":{\n                      \"defaultFS\":\"hdfs://xy180-wecloud-198:8020\",\n                      \"fileType\":\"text\",\n"
    json_str11 = "                      \"path\":\"/data/hive/warehouse/wedw/ods/push_center_$bizdate/ds_name=push_center_0\",\n"
    json_str12 = "                      \"fileName\":\"point_msg_his_0\",\n                      \"column\":[\n"
    json_str13 = "%s \n                      ],\n" %(',\n'.join(datax_writer_json_list))
    # "\\t" writes the two-character JSON escape \t; a raw TAB character
    # inside a JSON string literal is not valid JSON.
    json_str14 = "                     \"writeMode\":\"append\",\n                     \"fieldDelimiter\":\"\\t\", \n                     \"compress\":\"GZIP\" \n                 } \n                }\n           }\n         ]\n     }\n   }"
    dataxJsonStr = ''.join([
        json_str1, json_str2, json_str3, json_str4, json_str5, json_str6, json_str7,
        json_str8, json_str9, json_str10, json_str11, json_str12, json_str13, json_str14,
    ])
    # The with-block closes the file even if a write fails.
    with open(dataxDir, 'a') as f:
        f.write("==================================================dataxJson============================================================== \r\n")
        f.write(dataxJsonStr + "\r\n")

if __name__ == '__main__':
    # Script entry point: generate the outputs and report any failure.
    try:
        queryData()
    except Exception as e:
        # A single pre-formatted argument prints the same text under
        # Python 2 and 3 (two arguments would print a tuple in Python 2).
        print('Error: %s' % e)
        # Signal the failure to the calling shell or scheduler.
        sys.exit(1)

猜你喜欢

转载自blog.csdn.net/zuochang_liu/article/details/85335517
今日推荐