Python 3 data migration tool

Business growth required adding more MySQL databases, so the data in the old database had to be redistributed across the new MySQL databases according to sharding rules. An ETL tool could have done this, but the company's aging VM cannot support one. Having recently learned some Python, I used this task as practice.

Features: 1. Many-to-many data migration.

             2. Resumable transfer (after a failure, the migration resumes from where it stopped).

             3. Record-count comparison: checks that the source and target databases contain the same number of rows.

             4. Table filtering: excludes tables whose data should not be migrated.

Note: this tool requires the id column to be the first column read, because the script rewrites the ids. If you do not need id rewriting, remove that part of the code.

 

Master File DataTanslate.py

import pymysql
import hashlib
import hascode
import nev_dataTransferSql
import datetime
import configparser
import math
import sys
# Load configuration parameters from the ini file next to the script.
conf = configparser.ConfigParser()
conf.read("./dataBaseConfig.ini")

# Source database parameters: comma-separated lists, one entry per source DB.
# NOTE(review): the section name "soureDB" (sic) matches the typo in the ini
# file — do not "fix" the spelling here without also changing the ini.
sourceDBUrl  = (conf.get('soureDB', 'sourceDBUrl')).split(',')
sourceDBUser = (conf.get('soureDB', 'sourceDBUser')).split(',')
sourceDBKey =  (conf.get('soureDB', 'sourceDBKey')).split(',')
sourceDataBse = (conf.get('soureDB', 'sourceDataBse')).split(',')

# Target database parameters: comma-separated lists, one entry per target DB.
targetDBUrl =  (conf.get('targetDB', 'targetDBUrl')).split(',')
targetDBUser = (conf.get('targetDB', 'targetDBUser')).split(',')
targetDBKey =  (conf.get('targetDB', 'targetDBKey')).split(',')
targetDBs = (conf.get('targetDB', 'targetDBs')).split(',')
# Per-source-DB flags, plus the flag of the DB currently being migrated.
sourcedatabseFlg =(conf.get('soureDB', 'sourcedatabseFlg')).split(',')
currentSourcedatabseFlg = conf.get('progress', 'currentSourcedatabseFlg')
# Id offset between source databases, to keep rewritten ids from colliding.
dataBaseInterval = int(conf.get('config', 'dataBaseInterval'))
# Tables that must not be migrated.
filterTables = (conf.get('filter', 'tables')).split(',')
# Index into sourcedatabseFlg of the source DB currently being processed.
dataBaseNo = 0
def connectDB(url,user,key,dataBase):
    """Open a MySQL connection and return a (connection, cursor) pair.

    Uses keyword arguments: PyMySQL 1.0 removed the old positional form
    of connect(), so positional (host, user, password, db) calls break.
    The caller is responsible for closing both objects.
    """
    db = pymysql.connect(host=url, user=user, password=key, database=dataBase)
    cursor = db.cursor()
    return db, cursor

def getDataFromTarget(sourceDBUrl,sourceDBUser, sourceDBKey,sql,dataBase):
    """Execute `sql` against the source database and return all rows.

    Returns the tuple-of-tuples produced by cursor.fetchall().
    The cursor and connection are closed even when execute() raises;
    the original version leaked both on error.
    """
    db, cursor = connectDB(sourceDBUrl, sourceDBUser, sourceDBKey, dataBase)
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        cursor.close()
        db.close()

def insertDataToTarget(index,sql,args):
    """Bulk-insert `args` into target database #index via executemany.

    Commits on success; rolls back and re-raises on failure. The original
    never rolled back and leaked the connection when executemany raised.
    """
    db, cursor = connectDB(targetDBUrl[index], targetDBUser[index], targetDBKey[index], targetDBs[index])
    try:
        cursor.executemany(sql, args)
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        cursor.close()
        db.close()
#分库规则
#分库规则 / sharding rule
def formatVin(args):
    """Map one row to a target-database index.

    args: a row tuple whose element at index 2 is the VIN string.
    The VIN is MD5-hashed, run through the Java-style hashCode, sign-masked,
    and reduced modulo the number of target databases, so the result is
    always in range(len(targetDBs)).

    The original guarded with `if mvalue <= len(targetDBs)`, which is always
    true for a modulo result — the dead else-branch has been removed.
    """
    vin = args[2]
    m = hashlib.md5()
    m.update(vin.encode(encoding='UTF-8'))
    num = hascode.getHashCode(m.hexdigest())
    # Mask off the sign bit so the modulo operand is non-negative.
    return (num & 2147483647) % len(targetDBs)
#根据分库规则对数据分组
#根据分库规则对数据分组 / group rows by sharding rule
def shunt(args):
    """Partition the rows of `args` into one group per target database.

    Returns a list of len(targetDBs) tuples; group i holds the rows that
    formatVin routed to database i. Empty groups are empty tuples.

    Bug fix: the original initialized with `[[()]] * len(targetDBs)`, which
    makes every slot alias the SAME inner list, and relied on a sentinel
    empty tuple to detect "unfilled" slots (so empty groups came back as
    ((),) instead of empty).
    """
    groups = [[] for _ in range(len(targetDBs))]
    for row in args:
        groups[formatVin(row)].append(row)
    return [tuple(g) for g in groups]

def tables(sourceDBUrl,sourceDBUser, sourceDBKey,sourceDataBse,pageIndex,size,getData_sql,insert_sql,currentPage,currentTable):
    """Migrate one page of `currentTable` from a source DB to the target DBs.

    Appends the LIMIT offset/size to getData_sql, fetches the page, rewrites
    the row ids, shards the rows, and (when enabled) inserts each group into
    its target database. Returns the number of rows processed; 0 when the
    page is empty so the caller's `== 0` resume check can stop paging.
    (The original returned None for an empty page, so that check never fired.)
    """
    getData_sql = getData_sql + str(pageIndex) + " ," + str(size)
    # Fetch the rows to insert.
    args = getDataFromTarget(sourceDBUrl, sourceDBUser, sourceDBKey, getData_sql, sourceDataBse)
    if len(args) == 0:
        return 0
    args = editResultID(args, pageIndex, currentPage, currentTable)
    result = shunt(args)
    for index in range(len(result)):
        # Bug fix: the original tested len(result) (the target-DB count,
        # always > 0) instead of the group itself.
        if len(result[index]) > 0:
            print("插入数据:")
            # NOTE(review): the actual insert is disabled (dry run) — enable
            # the next line to write data.
            # insertDataToTarget(index,insert_sql,result[index])
    return len(args)
#b保存当前数据插入信息,以备续传
#b保存当前数据插入信息,以备续传 / checkpoint for resumable transfer
def saveCurrentData(data,currentPage,currentTable):
    """Persist the current page's checkpoint to the [progress] section.

    data: the page's rows after id rewriting; data[0][0] and data[-1][0]
    are the first and last rewritten ids. Also logs the progress to stdout.
    Reads the module globals sourcedatabseFlg, dataBaseNo and pageCount.
    """
    print("当前源数据库",sourcedatabseFlg[dataBaseNo])
    print("当前表",currentTable)
    print("总页数",pageCount)
    print("页码", currentPage)
    print("开始ID",data[0][0])
    print("结束ID",data[-1][0])
    conf = configparser.ConfigParser()
    conf.read("./dataBaseConfig.ini")
    conf.set("progress", "table", currentTable)
    conf.set("progress", "pageNo", str(currentPage))
    conf.set("progress", "beginId", str(data[0][0]))
    conf.set("progress", "endId", str(data[-1][0]))
    # Bug fix: the original handed a bare open() handle to conf.write() and
    # never closed it; use a context manager so the file is flushed/closed.
    with open("./dataBaseConfig.ini", "w") as f:
        conf.write(f)

def editResultID(result,pageIndex,currentPage,currentTable):
    """Rewrite the id (first element) of every row to a globally unique value.

    The new ids run sequentially from pageIndex + 1, offset by
    dataBaseInterval * dataBaseNo so different source databases cannot
    collide. Checkpoints the rewritten page via saveCurrentData and returns
    the rows as a tuple of tuples.
    """
    base = pageIndex + 1 + dataBaseInterval * dataBaseNo
    rows = [(base + offset,) + tuple(row)[1:] for offset, row in enumerate(result)]
    saveCurrentData(rows, currentPage, currentTable)
    return tuple(rows)

def getPageCount(sourceDBUrl,sourceDBUser, sourceDBKey,sourceDb,currentTable,pageSize):
    """Return the page count for currentTable at pageSize rows per page.

    Counts the rows via `select count(*)` and returns ceil(rows/pageSize) + 1
    (the +1 makes range(1, pageCount) cover every page). Also logs the total.
    """
    count_sql = "select count(*) from " + currentTable
    rows = getDataFromTarget(sourceDBUrl, sourceDBUser, sourceDBKey, count_sql, sourceDb)
    total = rows[0][0]
    pages = math.ceil(total / pageSize) + 1
    print("总页数为:" + str(pages))
    return pages

def saveCurrentProcess(currentTable,currentPage):
    """Reset the [progress] checkpoint for the next source database.

    Records the next database's flag (both in the module global and in the
    ini file) and restarts progress at the given table/page with an empty
    id range (beginId=1, endId=0).
    """
    global currentSourcedatabseFlg
    currentSourcedatabseFlg = sourcedatabseFlg[dataBaseNo]
    conf = configparser.ConfigParser()
    conf.read("./dataBaseConfig.ini")
    conf.set("progress", "table", currentTable)
    conf.set("progress", "pageNo", str(currentPage))
    conf.set("progress", "beginId", str(1))
    conf.set("progress", "endId", str(0))
    conf.set("progress", "currentsourcedatabseflg", str(sourcedatabseFlg[dataBaseNo]))
    # Bug fix: close the file handle instead of leaking it (the original
    # used conf.write(open(..., "w"))).
    with open("./dataBaseConfig.ini", "w") as f:
        conf.write(f)
def start(thread_num):
    """Run the full migration.

    For each configured source database (starting from the one whose flag is
    recorded as current), page through every non-filtered table and copy its
    rows to the sharded target databases. Resume state (table, page) is read
    from the [progress] section and updated as the migration advances.
    thread_num is only used in progress logging.
    """
    try:
        # Read resume information from the config file.
        beginPageNo = int(conf.get("progress", "pageNo"))
        beginTable = conf.get("progress", "table")
        # Data-transfer settings.
        pageSize = int(conf.get('config', 'pageSize'))
        print("开始页码:",beginPageNo)
        starttime = datetime.datetime.now()
        # Build the (select, insert) SQL pairs and the raw table list.
        sqlList,tablesList = nev_dataTransferSql.getDataTransferSql(sourceDBUrl[0], sourceDBUser[0], sourceDBKey[0],sourceDataBse[0], filterTables, [], " ORDER BY id desc ", " LIMIT ")
        tableList = []
        for table in  tablesList:
            if(table[0] not in filterTables):
                tableList.append(table[0])
        print("要迁移的表:%s"% tableList)
        for dataBaseIndex in range(len(sourcedatabseFlg)):
            # Skip source databases until the one flagged as current.
            if(currentSourcedatabseFlg != sourcedatabseFlg[dataBaseIndex]):
                continue
            # Number of tables completed so far for this database.
            amount = 0
            print("第%d个库"%dataBaseIndex)
            for value in sqlList:
                currentTable = str(tableList[amount])
                print(currentTable,"开始")
                # If this is not the table being resumed, start from page 1.
                if(beginTable != currentTable):
                    beginPageNo = 1
                global pageCount
                # Total number of pages for the current table.
                pageCount = getPageCount(sourceDBUrl[dataBaseIndex],sourceDBUser[dataBaseIndex], sourceDBKey[dataBaseIndex],sourceDataBse[dataBaseIndex],currentTable,pageSize)
                for i in range(beginPageNo,pageCount):
                    # LIMIT offset for page i.
                    index = (i-1)*pageSize
                    if(tables(sourceDBUrl[dataBaseIndex],sourceDBUser[dataBaseIndex], sourceDBKey[dataBaseIndex],sourceDataBse[dataBaseIndex],index,pageSize,value[0] ,value[1],i,currentTable)==0):
                        break
                    print("表 %s 已完成 %s" % (str(tableList[amount]), str(((i / (pageCount - 1)) * 100)) + "%"))
                # NOTE(review): when len(tableList) == 1 the divisor below is 0
                # (ZeroDivisionError), swallowed by the bare except at the end.
                print("%d-----整体已完成:%s" % (thread_num, str((amount / (len(tableList) - 1)) * 100) + "%"))
                if(amount == len(tableList)-1):
                    global dataBaseNo
                    if(dataBaseIndex +1 < len(sourcedatabseFlg)):
                        # All tables done: advance to the next source database
                        # and reset the checkpoint for it.
                        dataBaseNo = dataBaseIndex + 1
                        saveCurrentProcess(tableList[0],0)
                amount += 1
        endtime = datetime.datetime.now()
        print("总耗时:%s"% (endtime - starttime).seconds)
    except OSError as err:
        print("OS error: {0}".format(err))
    except ValueError:
        print("Could not convert data to an integer.")
    except:
        print("Unexpected error:", sys.exc_info()[0])
        raise
start(1)

Helper module used to generate the SQL statements (nev_dataTransferSql.py)

import pymysql
import configparser
# Load configuration parameters from the ini file next to the script.
conf = configparser.ConfigParser()
conf.read("./dataBaseConfig.ini")

def connectDB(url,user,key,table):
    """Open a MySQL connection and return a (connection, cursor) pair.

    Uses keyword arguments: PyMySQL 1.0 removed the old positional form of
    connect(). The commented-out localhost test call was dead code and has
    been removed. The caller is responsible for closing both objects.
    """
    db = pymysql.connect(host=url, user=user, password=key, database=table)
    cursor = db.cursor()
    return db, cursor


def getDataFromTarget(sourceDBUrl, sourceDBUser, sourceDBKey,sql,table):
    """Execute `sql` against the given database and return all rows.

    Returns the tuple-of-tuples produced by cursor.fetchall().
    The cursor and connection are closed even when execute() raises;
    the original version leaked both on error.
    """
    db, cursor = connectDB(sourceDBUrl, sourceDBUser, sourceDBKey, table)
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        cursor.close()
        db.close()
def editeCloumn(cloumn,reType,reCloumn):
    """Return the SELECT expression for one `desc <table>` column row.

    datetime columns are wrapped in date_format(...) so they serialize as
    'YYYY-MM-DD HH:MM:SS'; every other column passes through unchanged.
    reType and reCloumn are accepted but unused (kept for interface
    compatibility).
    """
    name, col_type = cloumn[0], cloumn[1]
    if col_type != "datetime":
        return name
    return "date_format(" + name + ",'%Y-%m-%d %H:%i:%S') AS " + name

def editeSql(table,cloumns,_cloumns,orderBy,limit):
    """Build the paged SELECT for a source table and the matching INSERT.

    table: table name; cloumns: SELECT column expressions; _cloumns: the
    plain column-name list for the INSERT; orderBy: ORDER BY clause text;
    limit: LIMIT clause text (the caller appends offset and size after it).
    Returns (select_sql, insert_sql).
    """
    sql_select = "select " + cloumns + " from " + table
    # The "user" table is deliberately excluded from ordering.
    if len(orderBy) > 0 and table != "user":
        sql_select += orderBy
    if len(limit) > 0:
        # Bug fix: the original appended a hard-coded " limit " and ignored
        # the limit argument's actual text; use the parameter instead.
        sql_select += limit
    placeholders = "%s," * (len(_cloumns.split(",")) - 1) + "%s"
    sql_insert = "insert into " + table + "(" + _cloumns + ")" + "values( " + placeholders + " )"
    return sql_select, sql_insert

# Read the table structure and build the column strings for SELECT/INSERT.
def getTableStructure(sourceDBUrl, sourceDBUser, sourceDBKey,table,filterCloumn,reType,reCloumn,dataBase="toyota"):
    """Return (select_columns, insert_columns) for one table.

    table: one row of the 'show tables' result (name at index 0);
    filterCloumn: column names to skip; reType/reCloumn: forwarded to
    editeCloumn (currently unused there).

    Bug fix: the database name was hard-coded to "toyota"; it is now a
    keyword parameter defaulting to the old value, so existing callers
    keep working while new callers can pass the real database.
    """
    structure = getDataFromTarget(sourceDBUrl, sourceDBUser, sourceDBKey, 'desc ' + table[0], dataBase)
    select_cols = []
    insert_cols = []
    for cloumn in structure:
        if cloumn[0] not in filterCloumn:
            select_cols.append(editeCloumn(cloumn, reType, reCloumn))
            insert_cols.append(cloumn[0])
    # join() replaces the original quadratic string concatenation.
    return ",".join(select_cols), ",".join(insert_cols)


def getDataTransferSql(sourceDBUrl, sourceDBUser, sourceDBKey,dataBase,filterTable,filterCloumn,orderBy,limit):
    """Build the (select, insert) SQL pair for every non-filtered table.

    Returns (sql_pairs, all_tables) where all_tables is the raw, unfiltered
    'show tables' result — callers filter it themselves.
    """
    all_tables = getDataFromTarget(sourceDBUrl, sourceDBUser, sourceDBKey, 'show tables', dataBase)
    # Renamed from `list`, which shadowed the builtin.
    sql_pairs = []
    for table in all_tables:
        if table[0] not in filterTable:
            cloumns, _cloumns = getTableStructure(sourceDBUrl, sourceDBUser, sourceDBKey, table, filterCloumn, "", "")
            sql_pairs.append(editeSql(table[0], cloumns, _cloumns, orderBy, limit))
    return sql_pairs, all_tables

Java-style hashCode helper (hascode.py)

def convert_n_bytes(n, b):
    """Wrap integer n to a signed b-byte two's-complement value.

    Mimics fixed-width integer overflow (e.g. Java's int wrap-around).
    """
    width = 8 * b
    half = 1 << (width - 1)
    return (n + half) % (1 << width) - half

def convert_4_bytes(n):
    """Wrap integer n to a signed 32-bit value (Java int semantics)."""
    return convert_n_bytes(n, b=4)

def getHashCode(s):
    """Return the Java String.hashCode() of s as a signed 32-bit int.

    Uses Horner's rule (h = 31*h + ord(c)) with 32-bit wrapping at each
    step instead of the original sum of ord(c) * 31**(n-1-i) terms, which
    built huge intermediate integers and did O(n) exponentiations.
    The 32-bit signed wrap is inlined (equivalent to convert_4_bytes).
    """
    h = 0
    for c in s:
        # Mask to 32 bits each step — same residue mod 2**32 as the full sum.
        h = (31 * h + ord(c)) & 0xFFFFFFFF
    # Reinterpret the unsigned 32-bit value as signed.
    return h - 0x100000000 if h >= 0x80000000 else h

Delete data from the target database  deleteDataFromTables.py

import pymysql
import configparser
# Load database parameters from the ini file.
conf = configparser.ConfigParser()
conf.read("./dataBaseConfig.ini")
# Target database parameters (comma-separated, one entry per target DB).
targetDBUrl =  (conf.get('targetDB', 'targetDBUrl')).split(',')
targetDBUser = (conf.get('targetDB', 'targetDBUser')).split(',')
targetDBKey =  (conf.get('targetDB', 'targetDBKey')).split(',')
targetDBList = (conf.get('targetDB', 'targetDBs')).split(',')
# Checkpoint of the last run: page number, the id range that was written,
# and the table it was written to — these rows get deleted before a retry.
beginPageNo = int(conf.get("progress", "pageNo"))
beginid = int(conf.get("progress", "beginid"))
endid = int(conf.get("progress", "endid"))
targetTables = conf.get("progress", "table")
def connectDB(url,user,key,table):
    """Open a MySQL connection and return a (connection, cursor) pair.

    Uses keyword arguments: PyMySQL 1.0 removed the old positional form
    of connect(). The caller is responsible for closing both objects.
    """
    db = pymysql.connect(host=url, user=user, password=key, database=table)
    cursor = db.cursor()
    return db, cursor
def deletData(beginId,endId):
    """Delete rows with beginId <= id <= endId from the checkpointed table
    in every target database (used to undo a partially-written page before
    re-running the migration).
    """
    for DBIndex in range(len(targetDBUrl)):
        db, cursor = connectDB(targetDBUrl[DBIndex], targetDBUser[DBIndex], targetDBKey[DBIndex], targetDBList[DBIndex])
        print("<-------------------------正在删除" + targetDBUser[DBIndex] + "库----------------------------->")
        print("正在删除" +  targetTables + "表")
        # The table name cannot be bound as a parameter; the id bounds are
        # parameterized instead of string-built into the SQL.
        sql = "delete from " + targetTables + " where id>= %s and id<= %s ;"
        print(sql)
        try:
            cursor.execute(sql, (beginId, endId))
            db.commit()
        except Exception as err:
            # The original bare except hid the reason; log it before rollback.
            print("操作异常", err)
            db.rollback()
        finally:
            cursor.close()
            db.close()
deletData(beginid,endid)

Compare the number of pieces of data in Table

import pymysql
import configparser
# Load parameters from the ini file.
conf = configparser.ConfigParser()
conf.read("./dataBaseConfig.ini")

# Source database parameters (comma-separated lists).
# NOTE(review): the section name "soureDB" (sic) matches the ini file's typo,
# and the original comment here mislabeled these as target parameters.
sourceDBUrl =  (conf.get('soureDB', 'sourceDBUrl')).split(',')
sourceDBUser = (conf.get('soureDB', 'sourceDBUser')).split(',')
sourceDBKey =  (conf.get('soureDB', 'sourceDBKey')).split(',')
sourceDataBase = (conf.get('soureDB', 'sourceDataBse')).split(',')
# Target database parameters.
targetDBUrl =  (conf.get('targetDB', 'targetDBUrl')).split(',')
targetDBUser = (conf.get('targetDB', 'targetDBUser')).split(',')
targetDBKey =  (conf.get('targetDB', 'targetDBKey')).split(',')
targetDBs = (conf.get('targetDB', 'targetDBs')).split(',')
# Tables excluded from the comparison.
filterTables = (conf.get('filter', 'tables')).split(',')
# Optional id range restricting the target-side count.
beginId = conf.get('compare', 'beginId')
enId = conf.get('compare', 'enId')
def connectDB(url,user,key,table):
    """Open a MySQL connection and return a (connection, cursor) pair.

    Uses keyword arguments: PyMySQL 1.0 removed the old positional form
    of connect(). The caller is responsible for closing both objects.
    """
    db = pymysql.connect(host=url, user=user, password=key, database=table)
    cursor = db.cursor()
    return db, cursor


def getDataFromTarget(sourceDBUrl,sourceDBUser,sourceDBKey,sql,dataBase):
    """Execute `sql` against the given database and return all rows.

    Returns the tuple-of-tuples produced by cursor.fetchall().
    The cursor and connection are closed even when execute() raises;
    the original version leaked both on error.
    """
    db, cursor = connectDB(sourceDBUrl, sourceDBUser, sourceDBKey, dataBase)
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        cursor.close()
        db.close()
def editeCloumn(cloumn,reType,reCloumn):
    """Build the SELECT fragment for a column from a `desc <table>` row.

    datetime columns go through date_format() so they serialize as
    'YYYY-MM-DD HH:MM:SS'; other columns are returned as-is. reType and
    reCloumn are unused (interface compatibility only).
    """
    name = cloumn[0]
    is_datetime = cloumn[1] == "datetime"
    return ("date_format(" + name + ",'%Y-%m-%d %H:%i:%S') AS " + name) if is_datetime else name

def editeSql(table):
    """Build the count(*) statements for one table.

    Returns (target_sql, source_sql): the target query is restricted to the
    [compare] beginId/enId range when those module globals are non-empty;
    the source query counts the whole table.
    """
    target_sql = "select count(*)" + " from " + table + " where 1=1"
    if beginId:
        target_sql += " and id >= " + str(beginId)
    if enId:
        target_sql += " and id <= " + str(enId)
    source_sql = "select count(*)" + " from " + table
    return target_sql, source_sql
def getTableRecodAmount(sourceDBUrl,sourceDBUser,sourceDBKey,sql,dataBase):
    """Run a count(*) query against the given database and return the result.

    Returns the raw fetchall() result (a 1x1 tuple-of-tuples for count
    queries). The cursor and connection are closed even when execute()
    raises; the original version leaked both on error.
    """
    db, cursor = connectDB(sourceDBUrl, sourceDBUser, sourceDBKey, dataBase)
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        cursor.close()
        db.close()

def getDataTransferSql(targetDBUrl,targetDBUser,targetDBKey,targetDBs):
    """Build the count-comparison SQL for every non-filtered table.

    Lists the tables in the given target database and returns
    (target_count_sqls, source_count_sqls, kept_tables), index-aligned.
    """
    all_tables = getDataFromTarget(targetDBUrl, targetDBUser, targetDBKey, 'show tables', targetDBs)
    # Renamed the locals `list`/`list2`, which shadowed the builtin `list`.
    target_sqls = []
    source_sqls = []
    kept_tables = []
    for table in all_tables:
        if table[0] not in filterTables:
            sql_selectTarget, sql_selectSource = editeSql(table[0])
            target_sqls.append(sql_selectTarget)
            source_sqls.append(sql_selectSource)
            kept_tables.append(table)
    return target_sqls, source_sqls, kept_tables

def start():
    """Compare per-table row counts between the target DBs (summed, id-range
    filtered) and the source DBs (summed, unfiltered), printing the totals
    and the difference for each table."""
    index = 0
    # The table list and SQL come from the first target database.
    sqlList, sqlList2,tables = getDataTransferSql(targetDBUrl[index],targetDBUser[index],targetDBKey[index],targetDBs[index])
    for index2 in range(len(sqlList)):
        amount = 0
        amount2 = 0
        # Sum the filtered counts over all target databases.
        for index3 in range(len(targetDBs)):
          amount = amount + ((getTableRecodAmount(targetDBUrl[index3],targetDBUser[index3],targetDBKey[index3],sqlList[index2],targetDBs[index3]) )[0])[0]
        print("目标库 " + "表:" + (tables[index2])[0] + "数据总数为:" + str(amount))

        # Sum the full counts over all source databases.
        for index4 in range(len(sourceDataBase)):
          amount2 = amount2 + ((getTableRecodAmount(sourceDBUrl[index4],sourceDBUser[index4],sourceDBKey[index4],sqlList2[index2],sourceDataBase[index4]) )[0])[0]
        print("源数据库 " + "表:" + (tables[index2])[0] + "数据总数为:" + str(amount2))
        print( "表:" + (tables[index2])[0] + "相差" + str(amount2  - amount))
start()

Profile dataBaseConfig.ini

[soureDB]
sourcedburl = 
sourcedbuser = 
sourcedbkey = 
sourcedatabse = 
sourcedatabseflg = 

[targetDB]
targetdburl = 
targetdbuser =
targetdbkey = 
targetdbs = 
[progress]
currentsourcedatabseflg = 
table = 
pageno = 1
beginid = 
endid = 

[config]
pagesize = 
databaseinterval = 10000000

[filter]
tables = 

[compare]
beginId =
enId =
Configuration parameters (dataBaseConfig.ini)                
                   
Source Database Configuration [soureDB]                
  sourcedburl =  Source database link The source database, used between a plurality of databases "," separated          
  sourcedbuser = username          
  sourcedbkey =  password          
  sourcedatabse =  data storage name          
  sourcedatabseflg =  Source database flag (required, a flag bit corresponding to each database, and is the only)            
                   
Target database configuration [targetDB]                
  targetdburl = Target database link, use multiple databases, "" interval              
  targetdbuser = username              
  targetdbkey =  password              
  targetdbs = data storage name              
                   
Configure schedule [progress]                
  currentsourcedatabseflg =  The current source database flag (required) Initially all set to be empty, the script will automatically update the data; setting a general re-start data transfer          
  table = alarm Current table          
  pageno = 1 current page          
  beginid = Id start          
  endid = End id (note: the key in the [progress] section is `endid`, not `entity`)
customize                  
  [config]                
  pagesize = 5 Each time the number of pieces of data extraction (optional)              
  databaseinterval = 10000000 Different source database, ID spacing between the same form. For example, starting from the first database ID 1, the second database with a table ID from 1 + 10 million start, avoid ID conflicts
                   
  [filter]                
  tables = user When you need to migrate data and migration statistics table              
                   
Access range data from the target database comparison [compare]                
  beginId = 0 Start ID              
  enId =40000000 Stop ID              
After configuring the parameters, run `python3 DataTanslate.py` from any directory in the prepared VM environment. If the run fails, stop the script, execute deleteDataFromTables.py to remove the partially-written rows, and then run DataTanslate.py again.

Guess you like

Origin blog.csdn.net/fz250052/article/details/91418416