会议:2021 interspeech
作者:Manh Luong
单位:VinAI Research, Hanoi, Vietnam
Abstract
基于VAE的说话人特征和内容的解耦,认为同一说话人两句话中的说话人信息是一样的,内容是不一样的。说话人特征+内容=一句话包含的信息。
Method
# NOTE(review): fragment of the VAE forward pass; `x1`/`x2` are two
# utterances assumed to be from the same speaker, and `train` toggles
# stochastic sampling — their definitions are outside this snippet.
# style_mu1/style_logvar1 are a dimension-split of one fc layer's output;
# content_mu1/content_logvar1 are likewise a split of another fc output.
style_mu1, style_logvar1, content_mu1, content_logvar1 = self.encode(x1)
z_content1 = self._reparameterize(content_mu1, content_logvar1, train)
# Encoded information of a second utterance by the same speaker.
style_mu2, style_logvar2, content_mu2, content_logvar2 = self.encode(x2)
z_content2 = self._reparameterize(content_mu2, content_logvar2, train)
# Average the speaker (style) mean/log-variance over the two utterances.
# The second utterance's stats are detached first, so gradients reach the
# style branch only through x1 — presumably intentional; confirm vs. paper.
style_mu2 = style_mu2.detach()
style_logvar2 = style_logvar2.detach()
z_style_mu = (style_mu1 + style_mu2)/2
z_style_logvar = (style_logvar1 + style_logvar2)/2
z_style = self._reparameterize(z_style_mu, z_style_logvar)
# Concatenate the shared speaker vector with each utterance's content.
z1 = torch.cat((z_style, z_content1), dim=-1)
z2 = torch.cat((z_style, z_content2), dim=-1)
## parameters of the approximate posterior q(z|x1) for sample 1
q_z1_mu = torch.cat((z_style_mu, content_mu1), dim=-1)
q_z1_logvar = torch.cat((z_style_logvar, content_logvar1), dim=-1)
## parameters of the approximate posterior q(z|x2) for sample 2
q_z2_mu = torch.cat((z_style_mu, content_mu2), dim=-1)
q_z2_logvar = torch.cat((z_style_logvar, content_logvar2), dim=-1)
recons_x1 = self.decode(z1)
recons_x2 = self.decode(z2)
其中
# 高斯采样
def _reparameterize(self, mu, logvar, train=True):
if train:
epsilon = Variable(torch.empty(logvar.size()).normal_()).cuda()
std = logvar.mul(0.5).exp_()
return epsilon.mul(std).add_(mu)
else:
return mu