版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/zuochang_liu/article/details/85335517
该脚本使用的 Python 版本是 2.7.15,因此安装的是 MySQLdb 模块;如果使用 Python 3.0 及以上版本,则需要安装 PyMySQL 模块,并对连接数据库的代码做相应的修改。
# coding=utf-8
_author_ = 'liuzc'
import MySQLdb
import re
import sys
# Python 2 idiom: reloading sys makes sys.setdefaultencoding callable again so
# the default str/unicode conversion codec can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
# Connection settings passed to MySQLdb.connect() and also written into the
# generated DataX job (user, passwd, host, port, db).
host = '10.10.10.10'
port = 3333
user = 'aa'
passwd = 'aa'
db = 'aa'
charset = 'utf8'
# MySQL table whose column metadata is read from INFORMATION_SCHEMA.
mysqlTableName = "user_info"
# File that the generated output is appended to.
dataxDir = "D:\\datax.txt"
def connectDB():
    """
    Open a MySQL connection using the module-level connection settings.

    :return: the connection object returned by MySQLdb.connect
    """
    settings = {
        'host': host,
        'port': port,
        'user': user,
        'passwd': passwd,
        'db': db,
        'charset': charset,
    }
    return MySQLdb.connect(**settings)
def queryData():
    """
    Read the column metadata of ``mysqlTableName`` and generate the outputs.

    Queries INFORMATION_SCHEMA for the name, comment and data type of every
    column of ``mysqlTableName`` in schema ``db``, then hands the derived
    column lists to createTable() and createDataxJson().

    Nothing is generated when the query fails or returns no columns; in both
    cases a message is printed and the function returns. The connection and
    cursor are always closed.

    :return: None
    """
    # Values are bound by the driver instead of being formatted into the SQL;
    # ORDER BY keeps the generated columns in the table's declared order.
    sql = ("SELECT column_name,column_comment,data_type "
           "FROM INFORMATION_SCHEMA.Columns "
           "WHERE table_name=%s AND table_schema=%s "
           "ORDER BY ordinal_position")

    con = connectDB()
    try:
        cursor = con.cursor()
        try:
            cursor.execute(sql, (mysqlTableName, db))
            results = cursor.fetchall()
        except MySQLdb.Error as e:
            # Report the real cause and stop: continuing would write a DDL
            # and a job file that contain no source columns.
            print("Error: unable to fetch data: %s" % e)
            return
        finally:
            cursor.close()
    finally:
        con.close()

    if not results:
        print("Error: no columns found for table %s.%s" % (db, mysqlTableName))
        return

    # Hive column definitions; the ETL timestamp column always comes first.
    create_table_list = ["etl_time timestamp"]
    # Column names for the DataX reader (source side).
    datax_json_list = []
    # Column name/type objects for the DataX writer (Hive side).
    datax_writer_json_list = []

    for row in results:
        source_name, column_comment, mysql_type = row[0], row[1], row[2]
        column_name = hump2underline(source_name)
        hive_type = getDataType(mysql_type)

        create_table_list.append("%s %s comment '%s'" % (column_name, hive_type, column_comment))
        # The reader selects from MySQL, so it needs the real column name,
        # not the snake_case name used on the Hive side. Double quotes keep
        # the generated job valid JSON.
        datax_json_list.append('"' + source_name + '"')
        datax_writer_json_list.append(" { \n \"name\":\"%s\",\n \"type\":\"%s\"\n }" % (column_name, hive_type))

    # Generate the Hive CREATE TABLE statement.
    createTable(mysqlTableName, create_table_list)
    # Generate the DataX job JSON.
    createDataxJson(mysqlTableName, datax_json_list, datax_writer_json_list)
def hump2underline(str):
    """
    Convert a camelCase name to lower-case snake_case.

    An underscore is inserted wherever a lower-case ASCII letter or a digit
    is immediately followed by an upper-case ASCII letter, and the whole
    result is lower-cased.

    :param str: the name to convert
    :return: the converted name
    """
    boundary = re.compile(r'([a-z]|\d)([A-Z])')
    # Re-join the two captured characters with an underscore between them.
    underscored = boundary.sub(lambda m: m.group(1) + '_' + m.group(2), str)
    return underscored.lower()
def getDataType(str):
    """
    Map a MySQL data type name to the Hive data type used in the output.

    Types that have no Hive equivalent (char without a length, the text
    family, enum, set, json, time, mediumint) are mapped to ``string`` or
    ``int`` so the generated DDL stays valid. Any type not listed here
    (for example bigint, int, double) is returned unchanged.

    :param str: MySQL data type name as reported by INFORMATION_SCHEMA
    :return: the Hive data type name
    """
    hive_types = {
        "varchar": "string",
        "char": "string",
        "tinytext": "string",
        "text": "string",
        "mediumtext": "string",
        "longtext": "string",
        "enum": "string",
        "set": "string",
        "json": "string",
        "time": "string",
        "date": "string",
        "varbinary": "string",
        "datetime": "timestamp",
        "smallint": "int",
        "mediumint": "int",
    }
    return hive_types.get(str, str)
def createTable(mysqlTableName,create_table_list):
    """
    Build the Hive CREATE TABLE statement and append it to ``dataxDir``.

    The table is named ``<mysqlTableName>_df``. Two trailing columns,
    ``create_date`` and ``modify_date``, are added after the supplied column
    definitions. The caller's list is not modified.

    :param mysqlTableName: name of the source MySQL table
    :param create_table_list: Hive column definitions, one string per column
    :return: None
    """
    # Work on a copy so the extra columns do not leak into the caller's list.
    columns = list(create_table_list)
    columns.append("create_date string")
    columns.append("modify_date string")

    createTableStr = ''.join([
        "use wedw_dwd; \n-- drop table if EXISTS %s_df;\n" % mysqlTableName,
        "CREATE TABLE if not EXISTS %s_df \n(\n" % mysqlTableName,
        ',\n'.join(columns),
        "\n)\nPARTITIONED BY (\ndate_id string COMMENT '数据加工时间'\n)\nrow format delimited fields terminated by '\\t'\nSTORED AS rcfile\n;",
    ])

    # The file is opened in text mode, which already converts "\n" to the
    # platform line ending; writing "\r\n" would produce "\r\r\n" on Windows.
    with open(dataxDir, 'a') as f:
        f.write("==================================================createTable============================================================== \n")
        f.write(createTableStr + "\n")
def createDataxJson(mysqlTableName,datax_json_list,datax_writer_json_list):
    """
    Build the DataX job JSON (mysqlreader -> hdfswriter) and append it to
    ``dataxDir``.

    The reader credentials and JDBC URL come from the module-level
    connection settings.

    :param mysqlTableName: name of the source MySQL table
    :param datax_json_list: reader column entries, each already quoted
    :param datax_writer_json_list: writer column entries, each an already
        formatted JSON object with "name" and "type"
    :return: None
    """
    parts = [
        # DataX reads job-level options from the "setting" key.
        "{\n \"job\":{\n \"setting\":{\n \"speed\":{\n \"channel\":3\n },\n",
        " \"errorLimit\":{\n \"record\":0,\n \"percentage\":0.02 \n }\n },\n",
        " \"content\":[ \n { \n \"reader\":{\n \"name\":\"mysqlreader\",\n \"parameter\":{\n",
        " \"username\":\"%s\",\n \"password\":\"%s\",\n \"column\":[\n %s \n ],\n" % (user, passwd, ',\n '.join(datax_json_list)),
        " \"where\":\"gmt_created>='$bizdate'and gmt_created<DATE_ADD('$bizdate',INTERVAL 1 DAY)\",\n",
        " \"splitPk\":\"id\",\n \"connection\":[\n { \n \"table\":[\n",
        " \"%s\"\n ],\n" % mysqlTableName,
        " \"jdbcUrl\":[\n \"jdbc:mysql://%s:%s/%s\"\n ]\n" % (host, port, db),
        " }\n ]\n }\n },\n",
        " \"writer\":{\n \"name\":\"hdfswriter\",\n \"parameter\":{\n \"defaultFS\":\"hdfs://xy180-wecloud-198:8020\",\n \"fileType\":\"text\",\n",
        " \"path\":\"/data/hive/warehouse/wedw/ods/push_center_$bizdate/ds_name=push_center_0\",\n",
        " \"fileName\":\"point_msg_his_0\",\n \"column\":[\n",
        "%s \n ],\n" % (',\n'.join(datax_writer_json_list)),
        # "\\t" emits the two-character JSON escape \t; a raw TAB inside a
        # JSON string is not valid JSON.
        " \"writeMode\":\"append\",\n \"fieldDelimiter\":\"\\t\", \n \"compress\":\"GZIP\" \n } \n }\n }\n ]\n }\n }",
    ]
    dataxJsonStr = ''.join(parts)

    # The file is opened in text mode, which already converts "\n" to the
    # platform line ending; writing "\r\n" would produce "\r\r\n" on Windows.
    with open(dataxDir, 'a') as f:
        f.write("==================================================dataxJson============================================================== \n")
        f.write(dataxJsonStr + "\n")
if __name__ == '__main__':
    # Script entry point: run the generation and report any failure.
    try:
        queryData()
    except Exception as e:
        # A single formatted string prints identically on Python 2 and 3;
        # print('Error:', e) prints a tuple repr under Python 2.
        print('Error: %s' % e)