# 吴恩达 深度学习 course5 week2: Operations on word vectors - v1

# -*- coding:utf-8 -*- 
#Author: shenying
#Date: 18-7-17 下午1:19

import numpy as np
from w2v_utils import *

# Load the pre-trained GloVe data once at import time.
# NOTE(review): read_glove_vecs comes from w2v_utils and is not visible here;
# presumably `words` is the vocabulary and `word_to_vec_map` maps each word to
# its embedding vector (the code below indexes it by word and subtracts the
# results) -- confirm against w2v_utils.
words,word_to_vec_map=read_glove_vecs('data2/glove.6B.50d.txt')

def cosine_similarity(u, v):
    """Return the cosine of the angle between the vectors ``u`` and ``v``.

    Computed as ``dot(u, v) / (||u|| * ||v||)`` using Euclidean norms.

    Args:
        u: Vector (1-D array-like of numbers).
        v: Vector with the same length as ``u``.

    Returns:
        A NumPy scalar: 1 for vectors pointing the same way, 0 for orthogonal
        vectors, -1 for opposite vectors. If either vector has zero norm the
        division is 0/0, so the result is ``nan`` (NumPy emits a
        RuntimeWarning) rather than an exception.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / norm_product

# father = word_to_vec_map["father"]
# mother = word_to_vec_map["mother"]
# ball = word_to_vec_map["it"]
# crocodile = word_to_vec_map["she"]
# france = word_to_vec_map["france"]
# italy = word_to_vec_map["italy"]
# paris = word_to_vec_map["paris"]
# rome = word_to_vec_map["rome"]
#
# print("cosine_similarity(father, mother) = ", cosine_similarity(father, mother))
# print("cosine_similarity(ball, crocodile) = ",cosine_similarity(ball, crocodile))
# print("cosine_similarity(france - paris, rome - italy) = ",cosine_similarity(france - paris, rome - italy))

def complete_analogy(word_a, word_b, word_c, word_to_vec_map):
    """Solve the analogy "word_a is to word_b as word_c is to ?".

    Every word ``w`` in ``word_to_vec_map`` (other than the three inputs) is
    scored by the cosine similarity between ``e_b - e_a`` and ``e_w - e_c``;
    the highest-scoring word is returned.

    Args:
        word_a: First word of the reference pair (lower-cased before lookup).
        word_b: Second word of the reference pair (lower-cased before lookup).
        word_c: Word whose counterpart is being searched for (lower-cased
            before lookup).
        word_to_vec_map: Mapping from word to its embedding vector.

    Returns:
        The best-matching word, or ``None`` when no candidate produced a
        comparable score (e.g. the map holds only the three input words).

    Raises:
        KeyError: If any of the lower-cased input words is missing from
            ``word_to_vec_map``.
    """
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()
    e_a = word_to_vec_map[word_a]
    e_b = word_to_vec_map[word_b]
    e_c = word_to_vec_map[word_c]

    # Both of these are constant for the whole scan, so build them once
    # instead of once per vocabulary entry.
    excluded = {word_a, word_b, word_c}
    target_offset = e_b - e_a

    best_word = None
    max_cosine_sim = float("-inf")

    for w, e_w in word_to_vec_map.items():
        # The input words would otherwise trivially win the search.
        if w in excluded:
            continue

        cosine_sim = cosine_similarity(target_offset, e_w - e_c)

        # Strict '>' keeps the first word on ties and skips nan scores.
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w

    return best_word

# triads_to_try = [('italy', 'italian', 'spain'), ('india', 'delhi', 'japan'), ('man', 'woman', 'boy'), ('small', 'smallest', 'large')]
# for triad in triads_to_try:
#     print ('{} -> {} :: {} -> {}'.format( *triad, complete_analogy(*triad,word_to_vec_map)))

# Difference between the "woman" and "man" embeddings; the commented-out demos
# below label it "gender" and pass it as the direction to compare against /
# debias along.
g=word_to_vec_map['woman']-word_to_vec_map['man']
# print('gender')
# print(g)

# print ('List of names and their similarities with constructed vector:')
# # girls and boys name
# name_list = ['john', 'marie', 'sophie', 'ronaldo', 'priya', 'rahul', 'danielle', 'reza', 'katy', 'yasmin']
# for w in name_list:
#     print (w, cosine_similarity(word_to_vec_map[w], g))
#
# print('Other words and their similarities:')
# word_list = ['lipstick', 'guns', 'science', 'arts', 'literature', 'warrior','doctor', 'tree', 'receptionist',
#              'technology',  'fashion', 'teacher', 'engineer', 'pilot', 'computer', 'singer']
# for w in word_list:
#     print (w, cosine_similarity(word_to_vec_map[w], g))

def neutralize(word, g, word_to_vec_map):
    """Remove from ``word``'s embedding its component along the direction ``g``.

    Args:
        word: Key to look up in ``word_to_vec_map``.
        g: Direction vector to project out (the bias axis).
        word_to_vec_map: Mapping from word to its embedding vector.

    Returns:
        The embedding minus its projection onto ``g``.

    Raises:
        KeyError: If ``word`` is not in ``word_to_vec_map``.
    """
    embedding = word_to_vec_map[word]

    # Coefficient of the projection of the embedding onto g:
    # dot(e, g) / ||g||^2.
    g_norm = np.linalg.norm(g)
    coefficient = np.dot(embedding, g) / (g_norm * g_norm)

    return embedding - coefficient * g
# e = "receptionist"
# print("cosine similarity between " + e + " and g, before neutralizing: ", cosine_similarity(word_to_vec_map["receptionist"], g))
#
# e_debiased = neutralize("receptionist", g, word_to_vec_map)
# print("cosine similarity between " + e + " and g, after neutralizing: ", cosine_similarity(e_debiased, g))

def equalize(pair, bias_axis, word_to_vec_map):
    """Rebuild the embeddings of a word pair around their shared midpoint.

    The midpoint of the two embeddings is split into its projection onto
    ``bias_axis`` and the remainder. Each word is then rebuilt as that shared
    remainder plus a rescaled version of its own offset from the midpoint
    along ``bias_axis``.

    Args:
        pair: Two-element iterable of words, e.g. ``("man", "woman")``.
        bias_axis: NumPy vector giving the bias direction.
        word_to_vec_map: Mapping from word to its embedding vector.

    Returns:
        Tuple ``(e1, e2)`` with the adjusted embeddings, in the order of
        ``pair``.

    Raises:
        KeyError: If either word is not in ``word_to_vec_map``.
    """
    first_word, second_word = pair
    vec_1 = word_to_vec_map[first_word]
    vec_2 = word_to_vec_map[second_word]

    midpoint = (vec_1 + vec_2) / 2

    def along_axis(vec):
        # Projection of vec onto bias_axis: dot(vec, axis) / ||axis||^2 * axis.
        return np.dot(vec, bias_axis) / np.sum(bias_axis * bias_axis) * bias_axis

    midpoint_bias = along_axis(midpoint)
    midpoint_orth = midpoint - midpoint_bias

    # Length given to the bias component: sqrt(|1 - ||midpoint_orth||^2|).
    scale = np.sqrt(np.abs(1 - np.sum(midpoint_orth * midpoint_orth)))

    def rebuilt(vec):
        # Offset of vec from the midpoint along the bias axis, rescaled, then
        # re-attached to the shared orthogonal part.
        offset = along_axis(vec) - midpoint_bias
        residual_norm = np.linalg.norm(vec - midpoint_orth - midpoint_bias)
        return scale * offset / residual_norm + midpoint_orth

    return rebuilt(vec_1), rebuilt(vec_2)

# print("cosine similarities before equalizing:")
# print("cosine_similarity(word_to_vec_map[\"man\"], gender) = ", cosine_similarity(word_to_vec_map["man"], g))
# print("cosine_similarity(word_to_vec_map[\"woman\"], gender) = ", cosine_similarity(word_to_vec_map["woman"], g))
# print()
# e1, e2 = equalize(("man", "woman"), g, word_to_vec_map)
# print("cosine similarities after equalizing:")
# print("cosine_similarity(e1, gender) = ", cosine_similarity(e1, g))
# print("cosine_similarity(e2, gender) = ", cosine_similarity(e2, g))






# 猜你喜欢

# 转载自blog.csdn.net/qq_31119155/article/details/81162853