TensorFlow(七) 地址匹配

import tensorflow as tf
import numpy as np
import string,random

# Number of synthetic address records to generate.
n = 10

# Vocabulary the synthetic addresses are assembled from.
street_names = ['abbey', 'baker', 'canal', 'donner', 'elm']
street_types = ['rd', 'st', 'ln', 'pass', 'ave']

# Pool of five candidate zip codes, each drawn from 65000-65999.
rand_zips = [random.randint(65000, 65999) for _ in range(5)]

# Per-record random parts: house number, street name, street type, zip code.
numbers = [random.randint(1, 9999) for _ in range(n)]
streets = [random.choice(street_names) for _ in range(n)]
street_stuffs = [random.choice(street_types) for _ in range(n)]
zips = [random.choice(rand_zips) for _ in range(n)]

# Full street strings, e.g. '4905 baker rd', '6417 canal st'.
full_streets = [
    '{} {} {}'.format(number, name, kind)
    for number, name, kind in zip(numbers, streets, street_stuffs)
]

# Reference records as [address, zip] pairs, e.g. ['8758 elm st', 65469];
# n rows of two fields each.
ref_data = [
    [address, zip_code] for address, zip_code in zip(full_streets, zips)
]

# Generate corrupted (misspelled) data.
def create_type(s):
    """Return a copy of ``s`` with exactly one character replaced by a typo.

    A random position is chosen and its character is swapped for a random
    lowercase ASCII letter that is different from the character already
    there, so a non-empty input always comes back genuinely misspelled.

    Args:
        s: The string to corrupt.

    Returns:
        A string of the same length as ``s`` that differs from it in exactly
        one position. An empty ``s`` is returned unchanged.
    """
    if not s:
        # Nothing to corrupt; avoids choosing from an empty index range.
        return s
    rand_ind = random.randrange(len(s))
    # Exclude the current character so the replacement is never a no-op.
    candidates = [c for c in string.ascii_lowercase if c != s[rand_ind]]
    chars = list(s)
    chars[rand_ind] = random.choice(candidates)
    return ''.join(chars)
# Misspelled street names: one typo injected into each reference street name.
type_streets = list(map(create_type, streets))
# Rebuild the full address strings around the misspelled names.
type_full_streets = [
    '{} {} {}'.format(number, name, kind)
    for number, name, kind in zip(numbers, type_streets, street_stuffs)
]
# Test records as [address, zip] pairs, e.g. ['8758 efm st', 65469];
# one row per reference record.
test_data = [
    [address, zip_code] for address, zip_code in zip(type_full_streets, zips)
]
# Session and input placeholders.
sess = tf.Session()
test_address = tf.sparse_placeholder(dtype=tf.string)
test_zip = tf.placeholder(shape=[None, 1], dtype=tf.float32)
ref_address = tf.sparse_placeholder(dtype=tf.string)
ref_zip = tf.placeholder(shape=[None, n], dtype=tf.float32)

# Zip distance: squared difference between the test zip and every reference
# zip (test_zip broadcasts across the n reference columns). 0 means identical.
zip_dist = tf.square(tf.subtract(ref_zip, test_zip))
# Address distance: edit distance between the test and reference character
# sequences, normalized (normalize=True), one value per reference address.
address_dist = tf.edit_distance(test_address, ref_address, normalize=True)

# Turn the zip distances into a similarity in [0, 1] by min-max scaling each
# row. The extremes are kept as columns so they broadcast against zip_dist.
zip_max = tf.expand_dims(tf.reduce_max(zip_dist, 1), 1)
zip_min = tf.expand_dims(tf.reduce_min(zip_dist, 1), 1)
zip_range = tf.subtract(zip_max, zip_min)
# When every reference zip is equally far from the test zip the range is 0;
# dividing by it would yield NaN and poison the argmax below. Substitute 1 so
# the zip similarity is a uniform 0 and the address similarity decides.
safe_zip_range = tf.where(tf.equal(zip_range, 0.),
                          tf.ones_like(zip_range),
                          zip_range)
zip_sim = tf.truediv(tf.subtract(zip_max, zip_dist), safe_zip_range)
# Address similarity is the complement of the normalized edit distance.
address_sim = tf.subtract(1., address_dist)

# Weights of the two similarities in the combined score.
address_weight = 0.5
zip_weight = 1 - address_weight

# Weighted sum; the address similarities are transposed so they line up with
# the row layout of zip_sim.
weight_sim = tf.add(
    tf.transpose(tf.multiply(address_weight, address_sim)),
    tf.multiply(zip_weight, zip_sim),
)
# Index of the best-scoring reference record in each row.
top_match_index = tf.argmax(weight_sim, 1)

# Convert address strings into a sparse character tensor value.
def sparese_word_vec(word):
    """Encode a sequence of strings as a character-level SparseTensorValue.

    Each string becomes one entry along the first dimension; its characters
    are laid out along the last dimension (the middle dimension is always 0).

    Args:
        word: A sequence of strings, e.g. ['abbey', 'baker'].

    Returns:
        A ``tf.SparseTensorValue`` whose indices are ``[string_index, 0,
        char_index]``, whose values are the individual characters, and whose
        dense shape is ``[len(word), 1, longest_string_length]`` (the last
        dimension is at least 1).
    """
    num_words = len(word)
    indcies = [
        [row, 0, col]
        for row, text in enumerate(word)
        for col in range(len(text))
    ]
    chars = list(''.join(word))
    # The last dimension must cover the longest string so that every index
    # emitted above lies inside the declared dense shape.
    max_len = max([len(text) for text in word] + [1])
    return tf.SparseTensorValue(indcies, chars, [num_words, 1, max_len])

# Reference addresses as a 1-D array of strings.
referece_address = np.array([x[0] for x in ref_data])
# Reference zip codes as a single row, shape (1, n).
referece_zips = np.array([[x[1] for x in ref_data]])

# Sparse character encoding of the reference addresses; it is the same for
# every test record, so it is built once outside the loop.
sparse_ref_set = sparese_word_vec(referece_address)

for test_address_entry, test_zip_ in test_data:
    # The zip placeholder expects a (1, 1) nested list.
    test_zip_entry = [[test_zip_]]

    # Repeat the test address once per reference record so the edit distance
    # compares it against each reference address.
    test_address_repeat = [test_address_entry] * n
    sparse_test_set = sparese_word_vec(test_address_repeat)

    feeddict = {
        test_address: sparse_test_set,
        test_zip: test_zip_entry,
        ref_address: sparse_ref_set,
        ref_zip: referece_zips,
    }
    # Index of the reference record with the highest combined similarity.
    best_match = sess.run(top_match_index, feed_dict=feeddict)
    # best_match is a length-1 array; take the scalar so the lookups below
    # return a plain string/number rather than a one-element array.
    best_index = int(best_match[0])
    best_street = referece_address[best_index]
    best_zip = referece_zips[0][best_index]
    print("Error address : " + str(test_address_entry) + "   " + str(test_zip_))
    print("Match: " + str(best_street) + "  " + str(best_zip))

# Release the session's resources once all records have been matched.
sess.close()

猜你喜欢

转载自www.cnblogs.com/x0216u/p/9237761.html