Article directory

TANE algorithm

TANE algorithm

import package

from pandas import *
from collections import defaultdict#字典类
import numpy as NP
import sys
# import pdb
# pdb.set_trace()

Equivalence class division: list_duplicates

def list_duplicates(seq):
    """Group the positions of equal items of ``seq`` into equivalence classes.

    Args:
        seq: iterable of hashable values (e.g. one column of the table).

    Returns:
        A generator of ``(value, indices)`` 2-tuples, one per distinct value,
        where ``indices`` lists every position at which ``value`` occurs.
    """
    positions = defaultdict(list)
    for index, value in enumerate(seq):
        # Equal values share one key, so their indices end up in the same class.
        positions[value].append(index)
    # Every recorded class holds at least one index, so nothing is filtered out.
    return (entry for entry in positions.items() if len(entry[1]) > 0)

# Test: print the equivalence classes obtained by partitioning on attribute A
for element in list_duplicates(data2D['A'].tolist()):
    print(element)

Find the right set: findCplus

Question: Computing C+ for a level only uses the C+ sets of the level above, so how is the first level that satisfies the definition of C+ initialised? From which level does the computation start? How can the process be observed with the debugger?

def findCplus(x):
    """Return C+(x) as the intersection of C+(x - {a}) over every a in x.

    ``x`` is a string of single-character attribute names. The C+ set of each
    subset ``x - {a}`` is read from the global ``dictCplus`` when present and
    computed recursively otherwise. Recursively computed values are not
    written back to ``dictCplus`` (the cache write was left commented out in
    the original, and that behaviour is kept).

    Args:
        x: attribute set as a string; may be empty.

    Returns:
        A new list of attributes (order unspecified, as it comes from a set).

    Raises:
        KeyError: if ``x`` is empty and ``dictCplus`` has neither a ``''`` nor
            a ``'NULL'`` entry.
    """
    global dictCplus
    if x == '':
        # C+ of the empty set is the whole schema R. It is stored under ''
        # once level 1 has been processed, and under 'NULL' at initialisation.
        # Without this base case the recursion would reach
        # set.intersection() with no arguments and raise TypeError.
        if '' in dictCplus:
            return list(dictCplus[''])
        return list(dictCplus['NULL'])
    thesets = []
    for a in x:
        subset = x.replace(a, '')
        if subset in dictCplus:
            temp = dictCplus[subset]
        else:
            temp = findCplus(subset)  # compute C+(X\{A}) for each A at a time
        thesets.insert(0, set(temp))
    # Intersection from line 2 of the pseudocode, computed once.
    return list(set.intersection(*thesets))

Compute function dependencies: compute_dependencies

Initially, dictCplus contains only {'NULL': listofcolumns}. Level 1 is computed as an intersection over level 0, so the result is R; the real computation of C+ therefore starts from the second level.
Q: According to the logic of the code, C+(X) should not contain attributes outside X, so why does the level-1 output not satisfy this? A: Because no FD passes validation on that level: X has only one attribute, so X\{A} is empty and the check must fail. When the second level is processed, the output satisfies the property.
Q: When computing dependencies on L2, how are attribute combinations such as AB and BD represented in the code?

A: Through `level`, the list of attribute sets on the current level, where L[i+1] = generate_next_level(L[i]).
Q: Why can X\{A}->A be regarded as a minimal non-trivial functional dependency when A∈X, A∈C+(X) and X\{A}->A holds?
- A∈X and X\{A}->A holds: the dependency is valid and non-trivial.
- A∈C+(X): A does not depend on a proper subset of X\{A}. The explanation is as follows: if A does not belong to C+(X), then there is some B∈X such that X\{A,B}->B holds, so X\{A}->A is clearly not minimal, because the smaller dependency X\{A,B}->A also holds. If A belongs to C+(X), no such B exists, that is, A does not depend on a proper subset of X\{A}, so X\{A}->A is minimal.

def compute_dependencies(level, listofcols):
    """Compute C+(X) for every X in ``level`` and record the valid FDs found.

    Args:
        level: list of attribute sets (strings of single-character attribute
            names) that make up the current lattice level.
        listofcols: list of all attributes; it is only printed on entry.

    Side effects:
        Fills/updates the global ``dictCplus``, appends ``[lhs, rhs]`` pairs to
        the global ``finallistofFDs`` and prints progress information.
    """
    global dictCplus        # attribute set -> right-hand-side candidates C+
    global finallistofFDs   # list of discovered FDs
    global listofcolumns    # list of all attributes
    print(listofcols)

    # Step 1: C+(X) is the intersection of C+(X\{A}) over all A in X.
    for x in level:
        candidate_sets = []
        for a in x:
            reduced = x.replace(a, '')
            if reduced not in dictCplus.keys():
                # C+(X\{A}) is not cached yet: compute it and store it.
                dictCplus[reduced] = computeCplus(reduced)
            # Newest set goes to the front, as in the original ordering.
            candidate_sets.insert(0, set(dictCplus[reduced]))
        dictCplus[x] = list(set.intersection(*candidate_sets))

    # Step 2: output the minimal dependencies and prune C+(X) accordingly.
    for x in level:
        for a in x:
            if a not in dictCplus[x]:
                continue  # only A in the intersection of X and C+(X) is examined
            lhs = x.replace(a, '')
            if not validfd(lhs, a):
                continue  # X\{A} -> A does not hold
            finallistofFDs.append([lhs, a])  # line 6
            print ("compute_dependencies:level：%s adding key FD: %s"%(level,[lhs,a]))
            dictCplus[x].remove(a)  # line 7
            # Build R\X from a fresh copy of the attribute list.
            outside = listofcolumns[:]
            for member in x:
                if member in outside:
                    outside.remove(member)
            # line 8: drop from C+(X) every attribute that is not in X.
            for b in outside:
                if b in dictCplus[x]:
                    dictCplus[x].remove(b)

    # Report the resulting C+ set of every attribute set on this level.
    for x in level:
        print (x)
        print (dictCplus[x])

# Test: why do the level-1 right-hand-side sets differ from what one would expect? --- L1 is computed from the formula
compute_dependencies(L[2], listofcolumns[:])

['A', 'B', 'C', 'D']
AB
['A']
BD
['D']

Calculate the right set: computeCplus

Question: What is the difference between the key 'NULL' and the empty string ''?

def computeCplus(x):
    """Compute C+(x) from the first definition in section 3.2.2 of the TANE paper.

    C+(X) = {A in R | for all B in X: X - {A, B} -> B does not hold}

    The previous implementation appended A once for every B whose dependency
    failed, which (a) accepted A as soon as *any* B failed instead of
    requiring *all* of them to fail and (b) produced duplicate entries.

    Args:
        x: attribute set as a string of single-character attribute names.

    Returns:
        A list of single attributes without duplicates, in the order of the
        global ``listofcolumns``. For the empty set the result is a copy of
        ``listofcolumns``, because C+ of the empty set is R.
    """
    global listofcolumns
    listofcols = listofcolumns[:]  # work on a copy of R
    if x == '':
        return listofcols  # because C+{phi} = R
    cplus = []
    for a in listofcols:
        without_a = x.replace(a, '')
        # A qualifies only if X\{A,B} -> B fails for every B in X.
        if not any(validfd(without_a.replace(b, ''), b) for b in x):
            cplus.append(a)
    return cplus

# Test: show the attribute list, then evaluate computeCplus for the attribute set 'C'
print(listofcolumns)
computeCplus('C')

['A', 'B', 'C', 'D']
['A', 'B', 'C', 'D']

Validity test: validfd

def validfd(y,z):
    """Check whether the functional dependency ``y -> z`` holds.

    The dependency is accepted when e(y) equals e(y + z) (Lemma 3.5 in the
    original comments). An empty left- or right-hand side is never accepted.

    Returns:
        True if the dependency holds, False otherwise.
    """
    if '' in (y, z):
        return False
    error_lhs = computeE(y)       # e(X)
    error_both = computeE(y + z)  # e(X U {A})
    return True if error_lhs == error_both else False

e(X) calculation: computeE

The error e(X) is computed as follows:
$e(X)=\left(\left\|\widehat{\pi_{X}}\right\|-\left|\widehat{\pi_{X}}\right|\right)/|r|$

- $\left\|\widehat{\pi_{X}}\right\|$ is the sum of the sizes of all equivalence classes in the stripped partition $\widehat{\pi_{X}}$; in the code it is held in `doublenorm`.
- $\left|\widehat{\pi_{X}}\right|$ is the order of the stripped partition, i.e. the number of equivalence classes it contains; in the code it is `len(dictpartitions[''.join(sorted(x))])`.

def computeE(x):
    """Return the error e(x) of attribute set ``x``.

    e(x) = (sum of the sizes of the stripped equivalence classes of x
            - number of those classes) / total number of tuples

    ``x`` is normalised to its sorted form (e.g. 'BCA' -> 'ABC') before the
    stripped partition is looked up in the global ``dictpartitions``.
    """
    global totaltuples      # number of tuples in the table
    global dictpartitions   # attribute set -> stripped partition
    classes = dictpartitions[''.join(sorted(x))]
    # Total number of rows covered by the stripped equivalence classes.
    covered_rows = sum(len(eq_class) for eq_class in classes)
    return (covered_rows - len(classes)) / float(totaltuples)

# Test with testdataABCD.csv
print(computeE('A'))# (4 - 2) / 4 = 0.5
print(computeE('C'))# (0 - 0) / 4 = 0
print(dictpartitions)

Superkey check: check_superkey

Question: what is the difference between `dictpartitions[x] == [[]]` and `dictpartitions[x] == []`?

# Test: display the current contents of dictpartitions
dictpartitions

def check_superkey(x):
    """Return True when the stripped partition stored for ``x`` is empty.

    An empty stripped partition (``[]`` or ``[[]]``) means the partition of
    ``x`` consists of singleton equivalence classes only, so ``x`` is a
    (super)key. ``x`` is used as the lookup key as given (it is not sorted).
    """
    global dictpartitions   # attribute set -> stripped partition
    stripped = dictpartitions[x]
    return stripped == [] or stripped == [[]]

Prune attributes: prune

Remove the attribute sets X that do not meet the conditions. For some X, if X->A does not hold, then the supersets of X are not useful either. We delete X from the attribute set-containment lattice so that its supersets are no longer considered, which saves validation time.

Step-by-step walkthrough
- Lines 1, 2 and 3 implement the right-hand-side pruning strategy: if C+(X)=φ, delete X. X is useless, and the purpose of the pruning is that its supersets are no longer considered.
- Lines 4-8 implement the key pruning strategy: if X is a (super)key, delete X. X is useful, but it has already been used, and the pruning ensures that its supersets are no longer considered.
If X is a superkey, how can we determine whether X\{A}->A is a minimal functional dependency?
Let X be a superkey and let A∈X. The dependency X\{A}→A is valid and minimal if and only if X\{A} is a key and, for all B∈X, A∈C+(X\{B}).
More intuitively, this guarantees that:
- X\{A}->A holds (validity): X\{A} must be a key, in which case X\{A}->A is valid. Conversely, if X\{A} is not a key, then X\{A}->A cannot hold.
- A does not depend on a proper subset of X\{A} (minimality): for all B∈X, A∈C+(X\{B}) (by the definition of C+ or Lemma 3.1).
Questions about lines 4-7:
- Question: After compute_dependencies, C+(X) only retains elements of X, so isn't C+(X)\X always empty?
  
  A: C+(X)\X being non-empty can only mean that X\{A}->A does not hold for any A in the intersection of X and C+(X). Otherwise, as soon as one such dependency holds, line 8 of compute_dependencies leaves only elements of X in C+(X): the attributes outside X are hopeless, because they can no longer be the right-hand side of a minimal dependency. In short, pruning C+(X) removes the hopeless attributes B and the attribute A whose dependency has already been output.
- Question: During pruning, the dependency X→A is output on line 7 if and only if X is a superkey, A∈C+(X)\X and, for all B∈X, A∈C+((X+A)\{B}). How should this sentence be understood, and how does it relate to Lemma 4.2?
  
  Answer: By the time prune runs, all attributes of C+(X) that belong to X have already been considered (and none of them produced a valid dependency; if one had, C+(X)\X would be empty), so the attributes that do not belong to X, namely C+(X)\X, remain to be examined. Since X is a superkey, X+A is also a superkey. X+A plays the role of X in Lemma 4.2, X plays the role of X\{A}, and B is the same as B in Lemma 4.2 (clearly B≠A here). Hence (X+A)\{A}→A, that is X→A, is minimal.

def prune(level):
    """Prune ``level`` in place (PRUNE step of TANE) and output key FDs.

    Two strategies are applied to every attribute set X of the level:

    * right-hand-side pruning: X is removed when C+(X) is empty;
    * key pruning: when X is a (super)key, every A in C+(X) - X that also
      lies in the intersection of C+((X + A) - {B}) over all B in X is output
      as the dependency X -> A, and X is removed.

    The loop runs over a snapshot of ``level``. The previous version removed
    items from the list it was iterating over, so the element following each
    removed one was silently skipped and never pruned or checked for keys.

    Args:
        level: list of attribute sets (strings); modified in place.

    Side effects:
        Appends ``[X, A]`` pairs to the global ``finallistofFDs``, may add
        missing entries to the global ``dictCplus`` and prints each FD found.
    """
    global dictCplus        # attribute set -> right-hand-side candidates C+
    global finallistofFDs   # list of discovered FDs
    for x in list(level):  # line 1 (snapshot, see docstring)
        # Strategy 1: right-hand-side pruning.
        if dictCplus[x] == []:  # line 2
            level.remove(x)  # line 3: delete X when C+(X) is empty
        # Strategy 2: key pruning.
        if check_superkey(x):  # line 4   ### should this check for a key, instead of super key??? Not sure.
            # temp becomes C+(X) \ X.
            temp = dictCplus[x][:]
            for i in x:
                if i in temp:
                    temp.remove(i)
            for a in temp:  # line 5: for each A in C+(X) \ X
                thesets = []
                for b in x:
                    # Sorted string form of (X U {A}) \ {B}.
                    key = ''.join(sorted((x + a).replace(b, '')))
                    if key not in dictCplus:
                        dictCplus[key] = findCplus(key)
                    thesets.insert(0, set(dictCplus[key]))
                # line 6: is A in the intersection of all those C+ sets?
                if a in set.intersection(*thesets):
                    finallistofFDs.append([x, a])  # line 7
                    print ("pruning:level：%s adding key FD: %s"%(level,[x,a]))
            # line 8: a superkey is always removed (it may already be gone
            # if line 3 deleted it).
            if x in level:
                level.remove(x)

Generate the next level: generate_next_level

def generate_next_level(level):
    """Build level L[i+1] of the lattice from ``level`` (L[i]).

    Two distinct attribute sets of ``level`` are merged when they differ only
    in their last attribute (same prefix block). The merged set X is kept
    only if every subset of X obtained by dropping one attribute is itself in
    ``level``. For every set kept, ``stripped_product`` is called so that the
    stripped partition of X is computed from those of the two merged sets.

    Returns:
        The list of attribute sets of the next level.
    """
    nextlevel = []
    for i, y in enumerate(level):
        # Compare y with every attribute set that follows it.
        for z in level[i + 1:]:
            if y == z or y[0:-1] != z[0:-1]:  # lines 2 and 3: same prefix block required
                continue
            x = y + z[-1]  # line 4: X = Y U Z
            # line 5: every X \ {A} must be present on the current level.
            if all(x.replace(a, '') in level for a in x):
                nextlevel.append(x)
                # Partition of x is pi_y * pi_z.
                stripped_product(x, y, z)
    return nextlevel

# Test generate_next_level
Ltest=['A','B','C','D']# test 'ABCD' and 'ABD'
nextLtest = generate_next_level(Ltest)
# nextnextLtest = generate_next_level(nextLtest)
# nextnextnextLest = generate_next_level(nextnextLtest)
print(nextLtest)
# print(nextnextLtest)
# print(nextnextnextLest)

y:A partitionY:[[0, 1], [2, 3]],z:B partitionZ[[0, 1, 2]]
x=AB,partitionX=[[0, 1]]
y:A partitionY:[[0, 1], [2, 3]],z:C partitionZ[]
x=AC,partitionX=[]
y:A partitionY:[[0, 1], [2, 3]],z:D partitionZ[[0, 3], [1, 2]]
x=AD,partitionX=[]
y:B partitionY:[[0, 1, 2]],z:C partitionZ[]
x=BC,partitionX=[]
y:B partitionY:[[0, 1, 2]],z:D partitionZ[[0, 3], [1, 2]]
x=BD,partitionX=[[1, 2]]
y:C partitionY:[],z:D partitionZ[[0, 3], [1, 2]]
x=CD,partitionX=[]
['AB', 'AC', 'AD', 'BC', 'BD', 'CD']

# Test: index [-1] refers to the last position
ltest = 'ABCDEF'
ltest[-1]

'F'

Generate stripped partition: stripped_product

While generating the next level, the stripped partitions of the new attribute sets are computed. The stripped partitions of the attribute sets on level 1 are computed by `computeSingletonPartitions`; the stripped partitions of the attribute sets on all other levels are obtained with `stripped_product`.

def stripped_product(x,y,z):
    """Store the stripped partition of ``x`` as the product pi_y * pi_z.

    The stripped partitions of ``y`` and ``z`` must already be present in the
    global ``dictpartitions`` (keys are the sorted attribute strings). The
    global scratch table ``tableT`` is used while computing and is reset to
    'NULL' for every row it touched before returning.

    Side effects:
        Writes ``dictpartitions[sorted x]`` and prints the inputs and result.
    """
    global dictpartitions   # attribute set -> stripped partition
    global tableT
    buckets = ['']*len(tableT)  # table S of the pseudocode
    # Each partition is a list of lists; every inner list is an equivalence class.
    classes_y = dictpartitions[''.join(sorted(y))]
    classes_z = dictpartitions[''.join(sorted(z))]
    print("y:%s partitionY:%s,z:%s partitionZ%s"%(y,classes_y,z,classes_z))
    product = []  # line 1
    for class_id, members in enumerate(classes_y):  # line 2
        for row in members:  # line 3
            tableT[row] = class_id
        buckets[class_id] = ''  # line 4
    for members in classes_z:  # line 5
        for row in members:  # line 6
            owner = tableT[row]
            if owner != 'NULL':  # line 7
                buckets[owner] = sorted(set(buckets[owner]) | {row})
        for row in members:  # line 8
            owner = tableT[row]
            if owner == 'NULL':
                continue
            if len(buckets[owner]) >= 2:  # line 9
                product.append(buckets[owner])
            buckets[owner] = ''  # line 10
    for members in classes_y:  # line 11
        for row in members:  # line 12
            tableT[row] = 'NULL'
    # Register the stripped partition of attribute set x.
    dictpartitions[''.join(sorted(x))] = product
    print('x=%s,partitionX=%s'%(x,product))

#测试stripped_product

Calculate the stripped partition: computeSingletonPartitions

Equivalence class division:
list_duplicates(data2D[a].tolist()) groups the repeated elements of each column and returns 2-tuples of the form (element value, indices):

- element value: a value that occurs in the column
- indices: the positions at which that value occurs; for example, for column A the result is (1, [0, 1]), (5, [2, 3])

def computeSingletonPartitions(listofcols):
    """Compute the stripped partition of every single attribute.

    For each attribute ``a`` in ``listofcols`` the column ``data2D[a]`` is
    split into equivalence classes with ``list_duplicates``; classes with
    more than one row index are stored in ``dictpartitions[a]``.
    """
    global data2D
    global dictpartitions
    for a in listofcols:
        classes = []
        dictpartitions[a] = classes
        # list_duplicates yields (value, indices) pairs; singleton classes
        # are ignored because the partition is stripped.
        classes.extend(
            indices
            for _, indices in list_duplicates(data2D[a].tolist())
            if len(indices) > 1
        )

# Test computeSingletonPartitions
# Only the single attributes A, B, C, D are considered here; a stripped partition is built for each of them
# (the bare string below is a note about checking the return value of list_duplicates)
'''测试list_duplicates函数的返回值:返回的是每个属性列表中每个属性的剥离分区'''
dictpartitions = {
    
    }
dictCplus = {
    
    'NULL': listofcolumns}
datatest = read_csv('testdataABCD.csv')
print(datatest)
for a in listofcolumns:# a is a column label
    print ("a=",a)
    dictpartitions[a]=[]
    print (a,datatest[a].tolist())
    for element in list_duplicates(datatest[a].tolist()):
        print ("element=",element)
    for element in list_duplicates(datatest[a].tolist()): # list_duplicates returns 2-tuples, where 1st is a value, and 2nd is a list of indices where that value occurs
        if len(element[1])>1: # ignore singleton equivalence classes
            dictpartitions[a].append(element[1])
print(dictpartitions)# holds the stripped partition of each single attribute
print(dictCplus)

   A  B  C  D
0  1  1  5  5
1  1  1  1  3
2  5  1  2  3
3  5  2  3  5
a= A
A [1, 1, 5, 5]
element= (1, [0, 1])
element= (5, [2, 3])
a= B
B [1, 1, 1, 2]
element= (1, [0, 1, 2])
element= (2, [3])
a= C
C [5, 1, 2, 3]
element= (5, [0])
element= (1, [1])
element= (2, [2])
element= (3, [3])
a= D
D [5, 3, 3, 5]
element= (5, [0, 3])
element= (3, [1, 2])
{'A': [[0, 1], [2, 3]], 'B': [[0, 1, 2]], 'C': [], 'D': [[0, 3], [1, 2]]}
{'NULL': ['A', 'B', 'C', 'D']}

implement

Get data attributes

# Load the test table from CSV and display it
data2D = read_csv('testdataABCD.csv')
data2D

	A	B	C	D
0	1	1	5	5
1	1	1	1	3
2	5	1	2	3
3	5	2	3	5

#------------------------------------------------------- START ---------------------------------------------------
# The bare string below is a disabled snippet: how the input file would be
# read from sys.argv if this code were embedded in a project.
'''
如果嵌入到项目中，需要这么写代码
if len(sys.argv) > 1:
    infile=str(sys.argv[1]) # this would be e.g. "testdata.csv"
data2D = read_csv(infile)
'''

totaltuples = len(data2D.index) # number of tuples (4 for the test data)
# listofcolumns is the list of attributes: initially only A, B, C, D, ... The initialisation follows
listofcolumns = list(data2D.columns.values) # returns ['A', 'B', 'C', 'D', .....]
print(totaltuples)
print(listofcolumns)


tableT = ['NULL']*totaltuples # table T used by stripped_product

L0 = []
dictCplus = {
    
    'NULL': listofcolumns}# right-hand-side candidate sets; initially C+ of NULL is R, i.e. A, B, C, D, ...
# Stores the stripped partitions
dictpartitions = {
    
    } # maps 'stringslikethis' to a list of lists, each of which contains indices

tableT

4
['A', 'B', 'C', 'D']
['NULL', 'NULL', 'NULL', 'NULL']

Compute the stripped partition

# Compute the stripped partition of every single attribute, then display the results
'''计算每个属性集上的剥离分区'''
computeSingletonPartitions(listofcolumns)
dictpartitions,listofcolumns

({'A': [[0, 1], [2, 3]], 'B': [[0, 1, 2]], 'C': [], 'D': [[0, 3], [1, 2]]},
 ['A', 'B', 'C', 'D'])

TANE main algorithm

finallistofFDs=[]
#print dictCplus['NULL']
# Initially, level L1 contains the single attributes: A, B, C, D, ...

L1=listofcolumns[:]  # L1 is a copy of listofcolumns

i=1

L = [L0,L1]
while (not (L[i] == [])):# loop while level i still contains attribute sets
    compute_dependencies(L[i],listofcolumns[:])# compute the functional dependencies of this level
    prune(L[i])# prune: delete sets from L[i] to shrink the search space
    temp = generate_next_level(L[i])
    L.append(temp)# append the generated level to L
    i=i+1

print ("List of all FDs: " , finallistofFDs)
#  correct result
#  List of all FDs:  [['C', 'D'], ['C', 'A'], ['C', 'B'], ['AD', 'B'], ['AD', 'C']]
# Total number of FDs found:  5
print ("Total number of FDs found: ", len(finallistofFDs))
print(L)

['A', 'B', 'C', 'D']
A
['A', 'C', 'B', 'D']
B
['A', 'C', 'B', 'D']
C
['A', 'C', 'B', 'D']
D
['A', 'C', 'B', 'D']
pruning:level：['A', 'B', 'C', 'D'] adding key FD: ['C', 'A']
pruning:level：['A', 'B', 'C', 'D'] adding key FD: ['C', 'B']
pruning:level：['A', 'B', 'C', 'D'] adding key FD: ['C', 'D']
['A', 'B', 'C', 'D']
AB
['A', 'C', 'B', 'D']
AD
['A', 'C', 'B', 'D']
BD
['A', 'C', 'B', 'D']
pruning:level：['AB', 'AD', 'BD'] adding key FD: ['AD', 'C']
pruning:level：['AB', 'AD', 'BD'] adding key FD: ['AD', 'B']
List of all FDs:  [['C', 'A'], ['C', 'B'], ['C', 'D'], ['AD', 'C'], ['AD', 'B']]
Total number of FDs found:  5
[[], ['A', 'B', 'D'], ['AB', 'BD'], []]

It can be seen from the output results:

test

# Test (not executed)
# print(listofcolumns)
# L1=listofcolumns[:]
# print (L1)

['A', 'B', 'C', 'D']
['A', 'B', 'C', 'D']

# Test: display L, the list of levels
L

[[], ['A', 'B', 'D'], ['AB', 'BD'], []]

TANE algorithm code implementation