DCGAN实现手写数字识别demo

论文解读和原理介绍，在网上已经有大量文章，这里就不在赘述。
论文地址：Unsupervised Representations Learning With Deep Convolutional Generative Adversarial Networks
论文解读：深度卷积对抗生成网络(DCGAN)（个人感觉最好的一篇）
各种框架的代码：
【theano】 https://github.com/Newmu/dcgan_code
【tensorflow】 https://github.com/carpedm20/DCGAN-tensorflow
【keras】 https://github.com/jacobgil/keras-dcgan
【torch】 https://github.com/soumith/dcgan.torch
下面是用minist数据集和tensorflow框架实现的一个demo：
# -*- coding:utf-8 -*-

'''
逻辑框架
    1. 获取数据
        a. 图像数据
        b. 随机向量
    2. 构建计算图
        a. 生成器
        b. 判别器
        c. DCGAN
            连接 生成器 和 判别器
            定义损失函数
            define train_op
    3. 实施训练流程

'''

import os
import tensorflow as tf
from tensorflow import gfile
import numpy as np
from PIL import Image
from tensorflow.examples.tutorials.mnist import input_data

# 获取 数据
mnist = input_data.read_data_sets('MINIST_data/', one_hot = True)
# 设置图片保存路径
output_dir = './local_run'
if not gfile.Exists(output_dir):
    gfile.MakeDirs(output_dir)

def get_default_params():
    '''
    设置默认参数
    :return:
    '''
    return tf.contrib.training.HParams(
        z_dim = 100, # 随机向量的长度
        init_conv_size = 4, # 初始化随机向量，特征图的大小
        g_channels = [128, 64, 32, 1], # 生成器中每一个反卷积层的通道数目
        d_channels = [32, 64, 128, 256], # 判别器中每一个卷积层的通道数目
        # 需要注意的是：判别器每一层步长都是2，这样的话，每一层特征图减半，通道数×2
        batch_size = 128,
        learning_rate = 0.002, # 学习率
        beta1 = 0.5, # adamoptimizer 的 参数
        img_size = 32, # 生成目标图像的大小
        # 从最初的大小4的特征图，到最后的32的图像，等于 × 2^3
    )

hps = get_default_params()

# print(hps)
# print(hps.img_size) # 32
# print(hps.g_channels) # [128, 64, 32, 1]
# print(minist.train.images.shape) # (55000, 784)

class MnistData(object):
    '''
    图像数据获取类
    '''
    def __init__(self, mnist_train, z_dim, img_size):
        '''
        初始化对象
        :param minist_train: 数据集
        :param z_dim: 初始特征图大小
        :param img_size:  目标图像大小
        '''

        self._data = mnist_train
        self._example_num = len(mnist_train) # 数据集 大小
        # z_data 为随机向量，形状为 [self._example_num, z_dim]
        self._z_data = np.random.standard_normal((self._example_num, z_dim))
        # np.random.standard_normal() 生成正态分布样本
        self._indicator = 0 # 起始点
        self._resize_mnist_img(img_size) # 因为 原图像是 28*28，所以需要 resize成 32 × 32
        self._random_shuffle() # 将数据集打乱

    def _random_shuffle(self):
        '''
        数据集随机打乱
        :return:
        '''
        p = np.random.permutation(self._example_num)
        self._z_data = self._z_data[p] # 将随机向量打乱
        self._data = self._data[p] # 将 数据集打乱

    def _resize_mnist_img(self, img_size):
        '''
        resize mnist image to goal img_size
        what shall we do ?
        1. numpy -> PIL img
        2. PIL img -> resize
        3. PIL img -> numpy
        '''
        # 此刻的数据集是0到1之间，tf自动做了归一化
        # 因为需要映射成0到255的
        data = np.asarray(self._data * 255, np.uint8)
        # reshape一下 [example_num, 784] -> [example_num, 28, 28]
        data = data.reshape((self._example_num, 28, 28))
        new_data = []

        for i in range(self._example_num):
            img = data[i]
            img = Image.fromarray(img) # 将 np 转化为 Image 对象
            img = img.resize((img_size, img_size)) # 使用对象的resize方法
            img = np.asarray(img) # 将对象 再转化为 np
            img = img.reshape((img_size, img_size, 1)) # 将 np 的shape，设置成 图像格式（灰度1通道）
            new_data.append(img) # 单张图像的 np，存入一个列表
        new_data = np.asarray(new_data, dtype=np.float32) # 将 该列表 转化为 一个 np。（真实繁琐）
        new_data = new_data / 127.5 - 1  # 进行一个归一化，归一化到-1 到 1 之间，与 tanh 一致
        # [new_example, img_size, img_size, 1]
        self._data = new_data # 当前的值为： (55000, 32, 32, 1)


    def next_batch(self, batch_size):
        '''
        获得一个batch
        :param batch_size: batch 大小
        :return:
        '''
        end_indicator = self._indicator + batch_size
        if end_indicator > self._example_num:
            self._random_shuffle()
            self._indicator = 0
            end_indicator = self._indicator + batch_size
        assert end_indicator < self._example_num

        batch_data = self._data[self._indicator:end_indicator]
        batch_z = self._z_data[self._indicator:end_indicator]
        self._indicator = end_indicator

        return batch_data, batch_z


mnist_data = MnistData(mnist.train.images, hps.z_dim, hps.img_size)
batch_data, batch_z = mnist_data.next_batch(5)


def conv2d_transpose(inputs, out_channel, name, training, with_bn_relu = True):
    '''
    生成器 反卷积 封装
    :param inputs: 输入 tensor
    :param out_channel: 输出通道数
    :param name: 命名空间
    :param training: 使用于 bn
    :param with_bn_relu: 因为最后一层是不需要做bn 和 relu的，所以立一个flag
    :return: 输出 tensor
    '''
    with tf.variable_scope(name):
        conv2d_trans = tf.layers.conv2d_transpose(inputs,
                                                  out_channel,
                                                  [5, 5],
                                                  strides=(2, 2),
                                                  padding='SAME',
                                                  )
        # 对 tf.layers.conv2d_transpose 的解释，参考：
        # https://www.w3cschool.cn/tensorflow_python/tensorflow_python-wfg62t8h.html

        # 判断是否需要 bn
        if with_bn_relu:
            bn = tf.layers.batch_normalization(conv2d_trans,
                                               training = training
                                               )
            relu = tf.nn.relu(bn)
            return relu
        else:
            return conv2d_trans

def conv2d(inputs, out_channel, name, training):
    '''
    判别器 卷积 封装
    :param inputs: 输入tensor
    :param out_channel: 输出通道数目
    :param name:  空间命名
    :param training: bn 参数
    :return:
    '''
    # 在本卷积网络中，使用leaky_relu
    def leaky_relu(x, leak=0.2, name=''):
        # 下句做一个解释，如果x大于0，就是x；如果小于0，再乘上一个小数，就比较大了
        return tf.maximum(x, x * leak, name = name)

    with tf.variable_scope(name):
        conv2d_output = tf.layers.conv2d(
            inputs,
            out_channel,
            [5, 5],
            strides = (2, 2),
            padding = 'SAME'
        )
        bn = tf.layers.batch_normalization(
            conv2d_output,
            training=training
        )
        return leaky_relu(bn, name='outputs')


class Generator():
    '''生成器 构建'''

    def __init__(self, channels, init_conv_size):
        '''
        构造函数
        :param channels: 输出通道数量
        :param init_conv_size: 初始化的特征图大小
        '''
        self._channels = channels
        self._init_conv_size = init_conv_size
        self._reuse = False
        # 这里的生成器或者判别器在构建完图之后可能需要多次使用，所有需要重用它

    def __call__(self, inputs, training):
        '''
        call 魔术方法，将对象当成函数使用
        :param inputs:
        :param training:
        :return:
        '''
        inputs = tf.convert_to_tensor(inputs) # 将随机向量转变成tensor
        with tf.variable_scope('generator', reuse = self._reuse):
            # 第一次构建的时候，reuse为Fales，构建完成之后，设置为true
            '''
            random_vactor -> fc -> self._channels[0] * init_conv_size**2(还是个一维向量)
            -> reshape -> [init_conv_size, init_conv_size, channels[0]]
            '''
            with tf.variable_scope('input_conv'):
                # 1. 将 随机向量 进行 映射
                fc = tf.layers.dense(
                    inputs,
                    self._channels[0] * self._init_conv_size * self._init_conv_size,
                )
                # 2. 将 得到的 向量 reshape
                conv0 = tf.reshape(fc, [-1, self._init_conv_size, self._init_conv_size, self._channels[0]])

                # 3. 进行 bn
                bn0 = tf.layers.batch_normalization(conv0, training = training)
                relu0 = tf.nn.relu(bn0)
            # 到此，第一个反卷积结束，还有三个
            deconv_inputs = relu0
            # 因为 第一个通道在上面已经倍使用， 所以这里从1 开始
            for i in range(1, len(self._channels)):
                # 首先，先判断是否是最后一层，最后一层不能使用relu
                with_bn_relu = (i != len(self._channels) - 1)
                deconv_inputs = conv2d_transpose(
                    deconv_inputs,
                    self._channels[i],
                    'deconv-%d'%i,
                    training,
                    with_bn_relu,
                )
            img_inputs = deconv_inputs
            with tf.variable_scope('generate_img'):
                # imgs value range:[-1 , 1]
                imgs = tf.tanh(img_inputs, name = 'imgs')

        self._reuse = True
        # 关于 reuse 的说明，参考：https://blog.csdn.net/zSean/article/details/75057806

        # 保存生成器所有的参数，因为在gan中生成器和判别器是分开训练的，所有需要保存之前的参数
        self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'generator')
        # tf.get_collection 获得 某个 scope下的 所有的参数
        return  imgs # 将最后 生成的图像 返回

# 定义 判别器
class Discriminator(object):

    def __init__(self, channels):
        self._channels = channels
        self._reuse = False

    def __call__(self, inputs, training):
        '''
        本方法流程：
            首先进行四个卷积操作
            其次，展平进行全连接操作
            最后，输出一个二分类的logits
        :param inputs: 图像
        :param training:  用在bn中
        :return: logits
        '''
        inputs = tf.convert_to_tensor(inputs, dtype=tf.float32)
        conv_inputs = inputs

        with tf.variable_scope('discriminator', reuse = self._reuse):
            for i in range(len(self._channels)):
                conv_inputs = conv2d(
                    conv_inputs,
                    self._channels[i],
                    'conv-%d' % i,
                    training
                )
            fc_inputs = conv_inputs
            with tf.variable_scope('fc'):
                # 展平
                flatten = tf.contrib.layers.flatten(fc_inputs)
                # 获得 logits
                logits = tf.layers.dense(flatten, 2, name = 'logits')

        self._reuse = True
        self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='discriminator')

        return logits

class DCGAN(object):
    '''DCGAN 主逻辑程序'''

    def __init__(self, hps):
        '''

        :param hps:  所有超参数
        '''
        g_channels = hps.g_channels
        d_channels = hps.d_channels

        self._batch_size = hps.batch_size
        self._init_conv_size = hps.init_conv_size
        self._z_dim = hps.z_dim
        self._img_size = hps.img_size

        self._generator = Generator(g_channels, self._init_conv_size)
        self._discriminator = Discriminator(d_channels)

    def build(self):
        '''创建计算图'''

        # 定义 随机向量
        self._z_placeholder = tf.placeholder(tf.float32, (self._batch_size, self._z_dim))

        # 定义真实图像
        self._img_placeholder = tf.placeholder(tf.float32, (self._batch_size, self._img_size, self._img_size, 1))

        # 执行过下面一行之后，就有了生成的图像
        # 也就是说，结合上面一句代码，现在就有了 真实图像 和 生成图像
        generated_imgs = self._generator(self._z_placeholder, training=True)

        # 先把假的图像输入到判别器中，得到假图像的logits
        fake_img_logits = self._discriminator(generated_imgs, training=True)

        # 然后把真实的图像输入到判别器中，得到真实图像的logits
        real_img_logits = self._discriminator(self._img_placeholder, training=True)


        # 有了两个logits之后，就可以定义损失函数了

        # 生成器loss损失函数
        # 真图像 使用 1 来代替
        loss_on_fake_to_real = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = tf.ones([self._batch_size], dtype = tf.int64), # 标签都是1
                logits = fake_img_logits
            )
        )

        # 判别器的损失函数 有两个
        #  1.将假的判断为假的
        #  2.将真的判断为真的
        loss_on_fake_to_fake = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = tf.zeros([self._batch_size], dtype = tf.int64),
                logits = fake_img_logits
            )
        )

        loss_on_real_to_real = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = tf.ones([self._batch_size], dtype = tf.int64),
                logits = real_img_logits
            )
        )


        # 收集变量的方法：
        # |--关于 tf.add_to_collection()、tf.get_collection()和 tf.add_n() 参考链接：
        # |--https://blog.csdn.net/uestc_c2_403/article/details/72415791
        tf.add_to_collection('g_losses', loss_on_fake_to_real)
        tf.add_to_collection('d_losses', loss_on_fake_to_fake)
        tf.add_to_collection('d_losses', loss_on_real_to_real)

        loss = {
            'g': tf.add_n(tf.get_collection('g_losses'), name = 'total_g_loss'),
            'd': tf.add_n(tf.get_collection('d_losses'), name = 'total_d_loss')
        }

        return self._z_placeholder, self._img_placeholder, generated_imgs, loss


    def build_train_op(self, losses, learning_rate, beta1):
        '''

        :param losses: 损失函数
        :param learning_rate: 学习率
        :param beta1: op 参数
        :return:
        '''
        # 判别器 和 生成器 分别 op

        # 得到 两个 optimizer
        g_opt = tf.train.AdamOptimizer(learning_rate = learning_rate, beta1 = beta1)
        d_opt = tf.train.AdamOptimizer(learning_rate = learning_rate, beta1 = beta1)

        g_opt_op = g_opt.minimize(losses['g'], var_list = self._generator.variables)
        d_opt_op = d_opt.minimize(losses['d'], var_list = self._discriminator.variables )

        # gan 是交叉执行的，需要用到下面的控制依赖方法：tf.control_dependencies([g_opt_op, d_opt_op])
        # 这样达到的效果是：先训练生成器，在训练判别器，交替执行
        # 参考链接：https://blog.csdn.net/PKU_Jade/article/details/73498753
        with tf.control_dependencies([g_opt_op, d_opt_op]):
            return tf.no_op(name = 'train') # tf.no_op() 什么都不做， 返回创建的操作


def combine_imgs(batch_imgs, img_size, rows = 8, cols = 16):
    '''
    将小图合并为一张大图
    :param batch_imgs: 类型 [batch_size, img_size, img_size, 1]
    :param img_size: 图像大小
    :param rows: 行数量
    :param cols: 列数量
    :return:
    '''
    result_big_img = []
    for i in range(rows):
        row_imgs = []
        for j in range(cols):
            # [img_size, img_size, 1]
            img = batch_imgs[cols * i + j]
            img = img.reshape((img_size, img_size))
            img = (img + 1) * 127.5 # 将归一化还原
            row_imgs.append(img) # 现在列表中保存的是16张图像
        row_imgs = np.hstack(row_imgs)
        result_big_img.append(row_imgs)

    # 8 × 32， 16 × 32
    result_big_img = np.vstack(result_big_img)
    result_big_img = np.asarray(result_big_img, np.uint8)

    # 使用 PIL 将矩阵变为图像
    result_big_img = Image.fromarray(result_big_img)
    # 将图像返回
    return result_big_img


#########################################
# 实验流程 开始
#########################################

dcgan = DCGAN(hps) # 创建 dcgan对象
z_placeholder, img_placeholder, generated_imgs, losses = dcgan.build()
train_op = dcgan.build_train_op(losses, hps.learning_rate, hps.beta1)


init_op = tf.global_variables_initializer()
train_steps = 10000

with tf.Session() as sess:
    sess.run(init_op)
    for step in range(train_steps):
        batch_imgs, batch_z = mnist_data.next_batch(hps.batch_size)
        fetches = [train_op, losses['g'], losses['d']]
        should_sample = (step + 1) % 50 == 0

        if should_sample:
            fetches += [generated_imgs]

        output_values = sess.run(
            fetches,
            feed_dict={
                z_placeholder: batch_z,
                img_placeholder:batch_imgs
            }
        )

        _, g_loss_val, d_loss_val = output_values[0: 3]

        print('step:%4d, g_loss:%4.3f, d_loss: %4.3f' % (step, g_loss_val, d_loss_val))

        if should_sample:
            gen_imgs_val = output_values[3]

            # 这是生成的图像
            gen_img_path = os.path.join(output_dir, '%05d-gen.jpg'%(step + 1))

            # 将真实图像也输出
            gt_img_path = os.path.join(output_dir, '%05d-gt.jpg' % (step + 1))

            # 组装 生成图片 和 真实图片
            gen_img = combine_imgs(gen_imgs_val, hps.img_size)
            gt_img = combine_imgs(batch_imgs, hps.img_size)

            # 将 生成图片 和 真实图片 保存到本地
            gen_img.save(gen_img_path)
            gt_img.save(gt_img_path)
迭代500次的效果图：

迭代3000次效果图：
在这里插入图片描述
迭代5000次的效果图：
就酱，感觉还是学到了不少东西
O(∩_∩)O哈哈~
DCGAN实现手写数字识别demo

猜你喜欢