Hung-yi Lee Machine Learning HW6: Generating Anime Faces with GANs

Theory reference: 李宏毅机器学习——对抗生成网络(GAN)_iwill323的博客-CSDN博客

Contents

Task and Dataset

Evaluation Methods

FID

AFD (Anime face detection) rate

Code

Imports

Building the Dataset

Displaying Some Images

Model Setup

Generator

Discriminator

Weight Initialization

Training

Workflow

Loss Functions

Binary Classification

discriminator

generator

WGAN

Training Function

Training

Loading the Data

Set config

Inference

GAN Results

Task and Dataset

1. Input: random noise with shape (batch size, feature dim)
2. Output: anime face images
3. Implementation requirement: DCGAN & WGAN & WGAN-GP
4. Target: generate 1000 anime faces

The data come from the Crypko website: 71,314 images in total. They can be obtained from 李宏毅2022机器学习HW6解析_机器学习手艺人的博客-CSDN博客.

Evaluation Methods

FID

Feed the real and generated images into another model to extract features, then compute the distance between the real and fake feature distributions.
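Concretely, FID compares the two feature sets through their Gaussian statistics: $\text{FID} = \|\mu_r-\mu_g\|^2 + \mathrm{Tr}(\Sigma_r+\Sigma_g-2(\Sigma_r\Sigma_g)^{1/2})$. Below is a minimal sketch of that computation, assuming the features have already been extracted into two numpy arrays by some embedding model (e.g. an Inception network); this is illustrative and not part of the homework's grading code.

import numpy as np
from scipy import linalg

def fid(real_feats, fake_feats):
    # real_feats, fake_feats: (N, D) feature matrices from the same embedding model
    mu_r, mu_g = real_feats.mean(axis=0), fake_feats.mean(axis=0)
    sigma_r = np.cov(real_feats, rowvar=False)
    sigma_g = np.cov(fake_feats, rowvar=False)
    covmean = linalg.sqrtm(sigma_r @ sigma_g)      # matrix square root
    if np.iscomplexobj(covmean):                   # drop tiny imaginary parts from numerical noise
        covmean = covmean.real
    return np.sum((mu_r - mu_g) ** 2) + np.trace(sigma_r + sigma_g - 2 * covmean)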

AFD (Anime face detection) rate

1. To detect how many anime faces are in your submission
2. The higher, the better
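A hedged sketch of how such a detection rate might be computed, assuming OpenCV plus nagadomi's lbpcascade_animeface.xml cascade file is available locally (neither is part of this assignment's starter code):

import glob
import cv2

cascade = cv2.CascadeClassifier('lbpcascade_animeface.xml')  # assumed to be downloaded beforehand
files = glob.glob('output/*.jpg')
detected = 0
for fname in files:
    gray = cv2.cvtColor(cv2.imread(fname), cv2.COLOR_BGR2GRAY)
    gray = cv2.equalizeHist(gray)
    faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(24, 24))
    detected += int(len(faces) > 0)  # count images with at least one detected face
print(f'AFD rate: {detected / len(files):.3f}')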

Code

Imports

# import module
import os
import glob
import random
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch import autograd
from torch.autograd import Variable

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import logging
from tqdm import tqdm

# seed setting
def same_seeds(seed):
    # Python built-in random module
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Torch
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

same_seeds(2022)
workspace_dir = '../input'

Building the Dataset

Note that fnames is a list of file paths. Unlike the original code, Image.open() is used here to read the images.

# prepare for CrypkoDataset

class CrypkoDataset(Dataset):
    def __init__(self, fnames, transform):
        self.transform = transform
        self.fnames = fnames
        self.num_samples = len(self.fnames)

    def __getitem__(self,idx):
        fname = self.fnames[idx]
        img = Image.open(fname)
        img = self.transform(img)
        return img

    def __len__(self):
        return self.num_samples

def get_dataset(root):
    # glob.glob returns the list of files matching the given pattern
    fnames = glob.glob(os.path.join(root, '*')) # list
    transform = transforms.Compose([        
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ])
    dataset = CrypkoDataset(fnames, transform)
    return dataset

Displaying Some Images

temp_dataset = get_dataset(os.path.join(workspace_dir, 'faces'))

images = [temp_dataset[i] for i in range(4)]
grid_img = torchvision.utils.make_grid(images, nrow=4)
plt.figure(figsize=(10,10))
# the transform normalized the images to [-1, 1]; map back to [0, 1] for display
plt.imshow(grid_img.permute(1, 2, 0) * 0.5 + 0.5)
plt.show()

Model Setup

Generator

The generator's goal is to map the input vector z into the real data space. Since our data are images, this means converting z into a 3x64x64 RGB image. In practice this is done with a series of 2D transposed convolutions, each followed by a 2D batch norm layer and a ReLU activation. The generator's output is fed through a tanh function so that it lies in [−1, 1]. Notably, placing a batch norm layer after each transposed convolution is one of the main contributions of the DCGAN paper; these layers help gradients flow during training.

For how transposed convolution upsamples, see: ConvTranspose2d原理,深度网络如何进行上采样?_月下花弄影的博客-CSDN博客
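With kernel_size=5, stride=2, padding=2, output_padding=1 (the settings used below), a transposed convolution exactly doubles the spatial size: H_out = (H−1)·2 − 2·2 + (5−1) + 1 + 1 = 2H. A quick check (a standalone sketch, not part of the model code):

import torch
import torch.nn as nn

x = torch.randn(1, 64, 8, 8)
deconv = nn.ConvTranspose2d(64, 32, kernel_size=5, stride=2, padding=2, output_padding=1)
print(deconv(x).shape)  # torch.Size([1, 32, 16, 16]) -- height and width doubled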

# Generator

class Generator(nn.Module):
    """
    Input shape: (batch, in_dim)
    Output shape: (batch, 3, 64, 64)
    """
    def __init__(self, in_dim, feature_dim=64):
        super().__init__()
    
        #input: (batch, 100)
        self.l1 = nn.Sequential(
            nn.Linear(in_dim, feature_dim * 8 * 4 * 4, bias=False),
            nn.BatchNorm1d(feature_dim * 8 * 4 * 4),
            nn.ReLU()
        )
        self.l2 = nn.Sequential(
            self.dconv_bn_relu(feature_dim * 8, feature_dim * 4),               #(batch, feature_dim * 4, 8, 8)
            self.dconv_bn_relu(feature_dim * 4, feature_dim * 2),               #(batch, feature_dim * 2, 16, 16)
            self.dconv_bn_relu(feature_dim * 2, feature_dim),                   #(batch, feature_dim, 32, 32)
        )
        self.l3 = nn.Sequential(
            nn.ConvTranspose2d(feature_dim, 3, kernel_size=5, stride=2,
                               padding=2, output_padding=1, bias=False),
            nn.Tanh()   
        )
        self.apply(weights_init)
    def dconv_bn_relu(self, in_dim, out_dim):
        return nn.Sequential(
            nn.ConvTranspose2d(in_dim, out_dim, kernel_size=5, stride=2,
                               padding=2, output_padding=1, bias=False),        #double height and width
            nn.BatchNorm2d(out_dim),
            nn.ReLU(True)
        )
    def forward(self, x):
        y = self.l1(x)
        y = y.view(y.size(0), -1, 4, 4)
        y = self.l2(y)
        y = self.l3(y)
        return y

Discriminator

The discriminator takes a 3x64x64 input and outputs a probability (score). The input passes through a stack of convolution, BN, and LeakyReLU layers, and a final sigmoid produces the score.

WGAN instead trains the discriminator as a distance function, so it drops the final sigmoid nonlinearity.

# Discriminator
class Discriminator(nn.Module):
    """
    Input shape: (batch, 3, 64, 64)
    Output shape: (batch)
    """
    def __init__(self, model_type, in_dim, feature_dim=64):
        super(Discriminator, self).__init__()
            
        #input: (batch, 3, 64, 64)
        """
        Remove last sigmoid layer for WGAN
        """
        
        self.model_type = model_type
        
        self.l1 = nn.Sequential(
            nn.Conv2d(in_dim, feature_dim, kernel_size=4, stride=2, padding=1), #(batch, feature_dim, 32, 32)
            nn.LeakyReLU(0.2),
            self.conv_bn_lrelu(feature_dim, feature_dim * 2),                   #(batch, feature_dim * 2, 16, 16)
            self.conv_bn_lrelu(feature_dim * 2, feature_dim * 4),               #(batch, feature_dim * 4, 8, 8)
            self.conv_bn_lrelu(feature_dim * 4, feature_dim * 8),               #(batch, feature_dim * 8, 4, 4)
            nn.Conv2d(feature_dim * 8, 1, kernel_size=4, stride=1, padding=0)            
        )        
        
        if self.model_type == 'GAN':
            self.l1.add_module(
                'sigmoid', nn.Sigmoid() 
            )
        
        self.apply(weights_init)
        
    def conv_bn_lrelu(self, in_dim, out_dim):
        layer = nn.Sequential(
            nn.Conv2d(in_dim, out_dim, 4, 2, 1),
            nn.BatchNorm2d(out_dim),
            nn.LeakyReLU(0.2),
        )
        
        if self.model_type == 'WGAN-GP':
            layer[1] = nn.InstanceNorm2d(out_dim)
        
        return layer
    
    def forward(self, x):
        y = self.l1(x)
        y = y.view(-1)
        return y

Weight Initialization

DCGAN specifies that all weights be randomly initialized from a normal distribution with mean 0 and standard deviation 0.02. The weights_init function takes an initialized model and reinitializes its convolution, transposed convolution, and batch normalization layers; it is applied right after model construction.

In the generator's and discriminator's __init__ functions: self.apply(weights_init)

# setting for weight init function
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

Training

Workflow

  1. prepare_environment: construct the models, create directories for the logs and checkpoints
    1. in_dim = z_dim = 100: z is drawn from a Gaussian distribution with 100 dimensions
    2. Since the inputs are images with 3 channels, the discriminator is built as Discriminator(3)
    3. If a model contains BN layers, call model.train() during training and model.eval() at test time: model.train() makes BN use each mini-batch's mean and variance, while model.eval() makes BN use the running statistics estimated over the training data (see the short demo after this list)
    4. Choose the optimizer according to the model type
  2. train: train the generator and the discriminator
    • When training the generator, the fake images it produces are fed into the discriminator again to obtain fresh scores. Because the discriminator has just been updated, the generator must fool this updated discriminator; this is what Prof. Lee described in class as the generator and discriminator spurring each other on.
  3. inference: after training, you can pass the generator ckpt path into it and the function will save the result for you
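A minimal sketch of the BatchNorm behavior mentioned in point 3: in train() mode the layer normalizes with the current batch statistics (and updates its running estimates), while in eval() mode it uses the stored running mean and variance, so the same input produces different outputs:

import torch
import torch.nn as nn

bn = nn.BatchNorm1d(4)
x = torch.randn(8, 4) * 3 + 5   # batch statistics differ from BN's initial running stats

bn.train()
out_train = bn(x)   # normalized with this batch's mean/var; running stats are updated
bn.eval()
out_eval = bn(x)    # normalized with the stored running mean/var
print(torch.allclose(out_train, out_eval))  # False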

Loss Functions

Binary Classification

As Prof. Lee explained in class, GAN training is a minimax game, but almost nobody actually trains with gradient ascent, so practice deviates from theory. The idea behind GANs is closely related to binary classification, so first consider the binary classification loss, which we want to be as small as possible:

$$L(\hat{y}, y) = -\big[y \log \hat{y} + (1-y)\log(1-\hat{y})\big]$$

When $y=1$, $L(\hat{y},y)=-\log \hat{y}$. The closer $\hat{y}$ is to 1, the closer $L(\hat{y},y)$ is to 0, meaning a better prediction; the closer $\hat{y}$ is to 0, $L(\hat{y},y)\to+\infty$, meaning a worse prediction.

When $y=0$, $L(\hat{y},y)=-\log(1-\hat{y})$. The closer $\hat{y}$ is to 0, the closer $L(\hat{y},y)$ is to 0, meaning a better prediction; the closer $\hat{y}$ is to 1, $L(\hat{y},y)\to+\infty$, meaning a worse prediction.
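A quick sanity check (a sketch, not part of the original code) that PyTorch's nn.BCELoss computes exactly this formula:

import torch
import torch.nn as nn

y_hat = torch.tensor([0.9, 0.2, 0.7])
y = torch.tensor([1.0, 0.0, 1.0])
manual = -(y * torch.log(y_hat) + (1 - y) * torch.log(1 - y_hat)).mean()
print(torch.allclose(nn.BCELoss()(y_hat, y), manual))  # True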

discriminator

Below is the discriminator objective from Prof. Lee's slides (the standard GAN value function):

$$V(G,D) = \mathbb{E}_{y\sim P_{data}}\big[\log D(y)\big] + \mathbb{E}_{y\sim P_G}\big[\log(1-D(y))\big],\qquad D^* = \arg\max_D V(G,D)$$

Plugging into the binary classification loss with $\hat{y}=D(y)$: when a sample is drawn from $P_{data}$, its label is $y=1$ and the loss is $-\log \hat{y}$; when it is drawn from $P_G$, its label is $y=0$ and the loss is $-\log(1-\hat{y})$. Their sum is exactly $-V(G,D)$. In other words, the discriminator can be trained directly with binary cross entropy (BCELoss), using label 1 for real images and label 0 for generated ones:

r_label = torch.ones((bs)).to(self.device)
f_label = torch.zeros((bs)).to(self.device)
r_loss = self.loss(r_logit, r_label)
f_loss = self.loss(f_logit, f_label)
loss_D = (r_loss + f_loss) / 2

generator

Below is the generator objective from Prof. Lee's slides:

$$G^* = \arg\min_G V(G,D)$$

Dropping the first term of $V(G,D)$, which does not depend on $G$, this becomes:

$$\min_G \mathbb{E}_{z\sim P_z}\big[\log(1-D(G(z)))\big]$$

$D(G(z))$ lies in (0, 1), so $\log(1-D(G(z)))$ has a minimum of $-\infty$. The problem is that the closer the loss gets to $-\infty$, the steeper its gradient becomes, and the gradient eventually explodes. So in practice the generator is not trained by gradient descent on this objective. Instead, the original generator loss is replaced by minimizing $-\log D(G(z))$ (see the CS231n notes CS231n 2022PPT笔记- 生成模型Generative Modeling_iwill323的博客-CSDN博客).

Plugging into the binary classification loss with $\hat{y}=D(G(z))$ and label $y=1$, the loss is $-\log \hat{y}$, so binary cross entropy (BCELoss) can again be used directly, with the labels simply set to 1:

loss_G = self.loss(f_logit, r_label)

WGAN

Loss function:

loss_D = -torch.mean(r_logit) + torch.mean(f_logit)

For WGAN-GP I followed the code from 李宏毅2022机器学习HW6解析_机器学习手艺人的博客-CSDN博客, but could not get it to work: after 30 epochs it was still generating noise.
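For reference, the WGAN-GP critic loss replaces WGAN's weight clipping with a gradient penalty evaluated at interpolates $\hat{x} = \alpha x_{real} + (1-\alpha)x_{fake}$ (this is the formulation from the WGAN-GP paper, not code from the homework):

$$L_D = \mathbb{E}_{\tilde{x}\sim P_G}\big[D(\tilde{x})\big] - \mathbb{E}_{x\sim P_{data}}\big[D(x)\big] + \lambda\,\mathbb{E}_{\hat{x}}\big[(\|\nabla_{\hat{x}} D(\hat{x})\|_2 - 1)^2\big]$$

with $\lambda$ typically set to 10. Note the penalty uses the L2 norm of the critic's gradient, which is what gp() below computes via gradients.norm(2, dim=1).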

Training Function

class TrainerGAN():
    def __init__(self, config, device):
        self.config = config        
        self.model_type = self.config["model_type"]
        self.device = device
        
        self.G = Generator(self.config["z_dim"])
        self.D = Discriminator(self.model_type, 3)  # 3 is the number of input channels
                
        self.loss = nn.BCELoss()        
 
        if self.model_type == 'GAN' or self.model_type == 'WGAN-GP':
            self.opt_D = torch.optim.Adam(self.D.parameters(), lr=self.config["lr"], betas=(0.5, 0.999))
            self.opt_G = torch.optim.Adam(self.G.parameters(), lr=self.config["lr"], betas=(0.5, 0.999))
        elif self.model_type == 'WGAN':
            self.opt_D = torch.optim.RMSprop(self.D.parameters(), lr=self.config["lr"])
            self.opt_G = torch.optim.RMSprop(self.G.parameters(), lr=self.config["lr"])    
 
        self.dataloader = None
        self.log_dir = os.path.join(self.config["save_dir"], 'logs')
        self.ckpt_dir = os.path.join(self.config["save_dir"], 'checkpoints')
        
        FORMAT = '%(asctime)s - %(levelname)s: %(message)s'
        logging.basicConfig(level=logging.INFO, 
                            format=FORMAT,
                            datefmt='%Y-%m-%d %H:%M')
        
        self.steps = 0
        self.z_samples = torch.randn(100, self.config["z_dim"]).to(self.device)  # a fixed batch of 100 z vectors for visualizing progress
        
    def prepare_environment(self):
        """
        Use this function to prepare the environment (directories and models)
        """
        os.makedirs(self.log_dir, exist_ok=True)
        os.makedirs(self.ckpt_dir, exist_ok=True)
        
        # update dir by time
        time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        self.log_dir = os.path.join(self.log_dir, time+f'_{self.config["model_type"]}')
        self.ckpt_dir = os.path.join(self.ckpt_dir, time+f'_{self.config["model_type"]}')
        os.makedirs(self.log_dir)
        os.makedirs(self.ckpt_dir)
        
        # model preparation
        self.G = self.G.to(self.device)
        self.D = self.D.to(self.device)
        self.G.train()
        self.D.train()
        
    def gp(self, r_imgs, f_imgs):
        """
        Implement gradient penalty function
        """
        # one random interpolation coefficient per sample
        alpha = torch.rand(r_imgs.size(0), 1, 1, 1, device=self.device)
        interpolates = (alpha * r_imgs + (1 - alpha) * f_imgs).requires_grad_(True)
        d_interpolates = self.D(interpolates)
        fake = torch.ones(r_imgs.shape[0], device=self.device)
        gradients = autograd.grad(
            outputs=d_interpolates,
            inputs=interpolates,
            grad_outputs=fake,
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
        )[0]
        
        gradients = gradients.view(gradients.size(0), -1)
        # WGAN-GP penalizes the L2 norm of the critic's gradient
        gradient_penalty = ((gradients.norm(2, dim=1) - 1)**2).mean()
        return gradient_penalty
        
    def train(self, dataloader):
        """
        Use this function to train generator and discriminator
        """
        self.prepare_environment()

        for e in range(self.config["n_epoch"]):
            progress_bar = tqdm(dataloader)
            progress_bar.set_description(f"Epoch {e+1}")
            for i, data in enumerate(progress_bar):
                bs = data.size(0)  # batch size
                
                # *********************
                # *    Train D        *
                # *********************
                z = torch.randn(bs, self.config["z_dim"]).to(self.device) # z could even be generated once before training and reused
                f_imgs = self.G(z)
                r_imgs = data.to(self.device)                
 
                # Discriminator forwarding
                r_logit = self.D(r_imgs)  # score the real images
                f_logit = self.D(f_imgs.detach())  # score the generated images; detach() avoids backprop into G
                
                # SETTING DISCRIMINATOR LOSS
                if self.model_type == 'GAN':
                    r_label = torch.ones((bs)).to(self.device)
                    f_label = torch.zeros((bs)).to(self.device)
                    r_loss = self.loss(r_logit, r_label)
                    f_loss = self.loss(f_logit, f_label)
                    loss_D = (r_loss + f_loss) / 2
                elif self.model_type == 'WGAN':
                    loss_D = -torch.mean(r_logit) + torch.mean(f_logit)
                elif self.model_type == 'WGAN-GP':
                    loss_wd = -torch.mean(r_logit) + torch.mean(f_logit)  # Wasserstein distance estimate
                    loss_gp = self.gp(r_imgs, f_imgs)                     # gradient penalty term
                    loss_D = loss_wd + loss_gp
 
                # Discriminator backwarding
                self.D.zero_grad()
                if self.model_type != 'WGAN-GP':
                    loss_D.backward()
                else:
                    loss_D.backward(retain_graph=True)
                self.opt_D.step()                
                
                # SETTING WEIGHT CLIP:
                if self.model_type == 'WGAN':
                    for p in self.D.parameters():
                         p.data.clamp_(-self.config["clip_value"], self.config["clip_value"])
 
                # *********************
                # *    Train G        *
                # *********************
                if self.steps % self.config["n_critic"] == 0:
                    # Generator forwarding      
                    f_logit = self.D(f_imgs)  # no need to generate f_imgs again
                    if self.model_type == 'GAN':                        
                        loss_G = self.loss(f_logit, r_label)
                    elif self.model_type == 'WGAN' or self.model_type == 'WGAN-GP':
                        loss_G = -torch.mean(f_logit)                        
 
                    # Generator backwarding
                    self.G.zero_grad()
                    loss_G.backward(retain_graph=True)
                    self.opt_G.step()               
                    
                if self.steps % 10 == 0:
                    progress_bar.set_postfix(loss_G=loss_G.item(), loss_D=loss_D.item())
                    if self.model_type == 'WGAN-GP':
                        print(loss_wd.detach(), loss_gp.detach())
                self.steps += 1       
 
            self.G.eval()
            # G's last layer is tanh(), so its outputs are in [-1, 1]; map them to [0, 1] to get images
            f_imgs_sample = (self.G(self.z_samples).data + 1) / 2.0 
            filename = os.path.join(self.log_dir, f'Epoch_{e+1:03d}.jpg')
            torchvision.utils.save_image(f_imgs_sample, filename, nrow=10)
            logging.info(f'Save some samples to {filename}.')
 
            # Show some images during training.
            grid_img = torchvision.utils.make_grid(f_imgs_sample.cpu(), nrow=10)
            plt.figure(figsize=(10,10))
            plt.imshow(grid_img.permute(1, 2, 0))
            plt.show()
 
            self.G.train()
 
            if (e+1) % 5 == 0 or e == 0:
                # Save the checkpoints.
                torch.save(self.G.state_dict(), os.path.join(self.ckpt_dir, f'G_{e}.pth'))
                torch.save(self.D.state_dict(), os.path.join(self.ckpt_dir, f'D_{e}.pth'))
 
        logging.info('Finish training')
 
    def inference(self, G_path, n_generate=1000, n_output=30, show=False):
        """
        1. G_path is the path for Generator ckpt
        2. You can use this function to generate final answer
        """
 
        self.G.load_state_dict(torch.load(G_path))
        self.G.to(self.device)
        self.G.eval()
        z = torch.randn(n_generate, self.config["z_dim"]).to(self.device)
        imgs = (self.G(z).data + 1) / 2.0
        
        os.makedirs('output', exist_ok=True)
        for i in range(n_generate):
            torchvision.utils.save_image(imgs[i], f'output/{i+1}.jpg')
        
        if show:
            row, col = n_output//10 + 1, 10
            grid_img = torchvision.utils.make_grid(imgs[:n_output].cpu(), nrow=col)  # 10 images per row
            plt.figure(figsize=(col, row))
            plt.imshow(grid_img.permute(1, 2, 0))
            plt.show()

Training

Loading the Data

# create dataset by the above function
batch_size = 512
num_workers = 2
dataset = get_dataset(os.path.join(workspace_dir, 'faces'))
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last = True)
print('Training set size: {:d}, number of batches: {:.2f}'.format(len(dataset), len(dataset)/batch_size))

Set config

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'DEVICE: {device}')

config = {
    "model_type": "WGAN",    
    "lr": 1e-4,
    "n_epoch": 60,
    "n_critic": 5,  # 训练一次generator,多训练几次discriminator,效果更好 n_critic=5意味着训练比是1:5
    "z_dim": 100,
    "workspace_dir": workspace_dir, # define in the environment setting
    "save_dir": workspace_dir,
    'clip_value': 1
}

trainer = TrainerGAN(config, device)
trainer.train(dataloader)

Inference

# save the 1000 images into ./output folder
trainer.inference(f'{workspace_dir}/checkpoints/2022-03-31_15-59-17_GAN/G_0.pth') # you have to modify the path when running this line

GAN Results

Below are images produced by the GAN; the results are fairly mediocre. This was just a rough run, and some tuning would improve them considerably.

Besides the mediocre quality, training shows that at epoch 22 the images suddenly degrade: the previous epoch still produces normal faces (the frame where the gif below pauses, with the red-haired face in the top-left corner), and the next epoch suddenly collapses. According to 李宏毅2022机器学习HW6解析_机器学习手艺人的博客-CSDN博客, loss_G suddenly increases while loss_D approaches 0, which means that from then on the discriminator performs too well relative to the generator. This runs against how GAN training should go: ideally loss_G is small and loss_D is large, i.e. the discriminator cannot tell the generator's outputs apart from real images.

Another problem is that late in training the diversity of the generated images drops (mode collapse); the reason was covered in class.

Below are images generated by WGAN; they remain stable all the way to epoch 50.

On compute speed, I noticed something interesting. With the same hyperparameters:

config = {
    "model_type": "GAN",
    "batch_size": 64,
    "lr": 1e-4,
    "n_epoch": 10,
    "n_critic": 1,
    "z_dim": 100,
    "workspace_dir": workspace_dir,
}

an NVIDIA 3090 took 428 seconds, while a 3080 was faster, needing only 327 seconds. I do not know why.

Theory references: 李宏毅机器学习——对抗生成网络(GAN)_iwill323的博客-CSDN博客; 理解GAN网络基本原理_ifreewolf99的博客-CSDN博客

Code references: 生成对抗网络GAN和DCGAN的理解(pytorch+李宏毅老师作业6) - 富士山上 - 博客园

李宏毅2022机器学习HW6解析_机器学习手艺人的博客-CSDN博客

Reprinted from blog.csdn.net/iwill323/article/details/127904332