用DCGAN生成手写体数字图像代码解析

该项目源码位于：
https://github.com/jazzsaxmafia/dcgan_tensorflow/tree/master/mnist 项目共有四个python文件，分别是util.py，load.py，model.py和train.py

util.py文件把项目中用到的工具函数整理到一起，包括onehot，图像剪裁，图像拼接

# !/usr/bin/python
# -*- coding: utf-8 -*-
import cv2
import scipy.misc
import ipdb
import numpy as np

def OneHot(X, n=None, negative_class=0.):
    """Encode class indices as a one-hot matrix.

    Args:
        X: Array-like of class indices. It is flattened to 1-D first, and
            the values are cast to integers so that float arrays holding
            whole numbers (and narrow integer dtypes such as uint8) work.
        n: Number of classes, i.e. number of columns in the result.
            Defaults to ``max(X) + 1``.
        negative_class: Fill value used for every non-hot position.

    Returns:
        A float64 array of shape ``(X.size, n)`` where row ``i`` holds 1.0
        at column ``X[i]`` and ``negative_class`` everywhere else.

    Raises:
        ValueError: If ``X`` is empty and ``n`` is not given (there is no
            maximum to derive the class count from).
        IndexError: If a label is not a valid column index for ``n``.
    """
    # Cast to a wide integer type: float labels cannot be used as indices,
    # and ``max + 1`` on a narrow dtype such as uint8 could wrap around.
    labels = np.asarray(X).flatten().astype(np.int64)
    if n is None:
        n = int(labels.max()) + 1
    encoded = np.full((len(labels), int(n)), negative_class, dtype=np.float64)
    # One (row, column) pair per label: set the hot position of each row.
    encoded[np.arange(len(labels)), labels] = 1.
    return encoded
# 例：生成如下的矩阵
# [[ 0.  0.  0. ...,  0.  0.  0.]
#  [ 0.  0.  0. ...,  0.  0.  1.]
#  [ 1.  0.  0. ...,  0.  0.  0.]
#  ...,
#  [ 0.  0.  0. ...,  0.  0.  1.]
#  [ 0.  0.  0. ...,  0.  0.  0.]
#  [ 0.  0.  0. ...,  0.  0.  0.]]

def crop_resize(image_path, resize_shape=(64,64)):
    """Load an image, centre-crop it to a square and scale it to [-1, 1].

    The shorter side of the image is scaled to the target size and the
    longer side is then cropped symmetrically around the centre.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        resize_shape: Target size. The cropping arithmetic mixes both
            entries, so it is only correct for a square target.

    Returns:
        The cropped image divided by 127.5 and shifted by -1, which maps
        8-bit pixel values from [0, 255] to [-1, 1].

    Raises:
        IOError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising;
    # fail here with a clear message rather than on the attribute access.
    if image is None:
        raise IOError('could not read image: %s' % image_path)
    height, width = image.shape[:2]

    if width == height:
        resized_image = cv2.resize(image, resize_shape)
    elif width > height:
        # Landscape: scale so the height matches, then trim the columns.
        resized_image = cv2.resize(image, (int(width * float(resize_shape[0])/height), resize_shape[1]))
        cropping_length = int( (resized_image.shape[1] - resize_shape[0]) / 2)
        resized_image = resized_image[:,cropping_length:cropping_length+resize_shape[1]]
    else:
        # Portrait: scale so the width matches, then trim the rows.
        resized_image = cv2.resize(image, (resize_shape[0], int(height * float(resize_shape[1])/width)))
        cropping_length = int( (resized_image.shape[0] - resize_shape[1]) / 2)
        resized_image = resized_image[cropping_length:cropping_length+resize_shape[0], :]

    return resized_image/127.5 - 1

def save_visualization(X, grid_shape, save_path='./vis/sample.jpg'):
    """Tile a batch of images into one grid image and write it to disk.

    For example, 196 images of 28x28 pixels on a 14x14 grid give a single
    392x392 image.

    Args:
        X: Array of images. Axis 0 indexes the images; axes 1 and 2 are the
            image height and width. Each image must broadcast against a
            ``(height, width, 3)`` canvas cell.
        grid_shape: ``(nh, nw)`` pair, the number of rows and columns of
            the grid. Images are placed row by row.
        save_path: Destination file path.
    """
    # The pair is unpacked here rather than in the signature: tuple
    # parameters are Python-2-only syntax, callers still pass one tuple.
    nh, nw = grid_shape
    h, w = X.shape[1], X.shape[2]
    img = np.zeros((h * nh, w * nw, 3))

    for n, x in enumerate(X):
        # divmod keeps both indices integral so they are valid slice bounds.
        row, col = divmod(n, nw)
        img[row*h:row*h+h, col*w:col*w+w, :] = x

    # NOTE(review): scipy.misc.imsave is unavailable in recent SciPy
    # releases; this call needs an older SciPy or a replacement writer.
    scipy.misc.imsave(save_path, img)

load.py文件的作用是将MNIST数据导入并进行数据集的划分

# !/usr/bin/python
# -*- coding: utf-8 -*-
import sys
sys.path.append('..')

import numpy as np
import os
from time import time
from collections import Counter
import random
from matplotlib import pyplot as plt

data_dir = '/Users/Chris/MNIST_data'


def _read_idx_payload(filename, header_bytes):
    """Return the uint8 contents of a file in ``data_dir`` minus its header."""
    # Passing the path lets numpy open the file in binary mode and close
    # it again, so no file handle is left open.
    raw = np.fromfile(os.path.join(data_dir, filename), dtype=np.uint8)
    return raw[header_bytes:]


def mnist():
    """Load the MNIST training and test sets from ``data_dir``.

    The first 16 bytes of each image file and the first 8 bytes of each
    label file are skipped as header.

    Returns:
        Tuple ``(trX, teX, trY, teY)``:
        trX: float array of shape (60000, 784), raw pixel values.
        teX: float array of shape (10000, 784), raw pixel values.
        trY: uint8 array of shape (60000,), training labels.
        teY: uint8 array of shape (10000,), test labels.
    """
    # Training set: 60000 samples.
    trX = _read_idx_payload('train-images-idx3-ubyte', 16).reshape((60000, 28*28)).astype(float)
    trY = _read_idx_payload('train-labels-idx1-ubyte', 8).reshape((60000))

    # Test set: 10000 samples.
    teX = _read_idx_payload('t10k-images-idx3-ubyte', 16).reshape((10000, 28*28)).astype(float)
    teY = _read_idx_payload('t10k-labels-idx1-ubyte', 8).reshape((10000))

    return trX, teX, trY, teY

def mnist_with_valid_set():
    """Load MNIST and carve a validation set out of the training data.

    The training samples are shuffled, then the first 50000 are kept as
    the new training set and the remaining ones become the validation set.

    Returns:
        Tuple ``(trX, vaX, teX, trY, vaY, teY)`` of training, validation
        and test images followed by the matching labels.
    """
    trX, teX, trY, teY = mnist()

    # Draw a random index order instead of shuffling range(...) in place,
    # which only works when range returns a list.
    order = np.random.permutation(len(trX))
    trX = trX[order]
    trY = trY[order]

    # Rows from 50000 onwards form the validation set; the first 50000
    # rows stay as the training set.
    vaX, vaY = trX[50000:], trY[50000:]
    trX, trY = trX[:50000], trY[:50000]

    return trX, vaX, teX, trY, vaY, teY

model.py定义了模型的网络结构，该文件还包括模型会用到的激活函数（lrelu）、批归一化（batch normalization）函数和交叉熵损失函数等

# !/usr/bin/python
#-*- coding: utf-8 -*-
import tensorflow as tf
import ipdb

def batchnormalize(X, eps=1e-8, g=None, b=None):
    """Normalize a tensor with statistics of the current batch.

    Rank-4 input is normalized over axes 0, 1 and 2 (one mean and variance
    per entry of the last axis); rank-2 input is normalized over axis 0.

    Args:
        X: Tensor of rank 2 or 4.
        eps: Constant added to the variance before the square root.
        g: Optional scale; applied only when ``b`` is given as well.
        b: Optional shift; applied only when ``g`` is given as well.

    Returns:
        The normalized tensor, scaled and shifted when both ``g`` and
        ``b`` are provided.

    Raises:
        NotImplementedError: If the rank of ``X`` is neither 2 nor 4.
    """
    rank = X.get_shape().ndims
    if rank == 4:
        reduce_axes = [0, 1, 2]
        param_shape = [1, 1, 1, -1]
    elif rank == 2:
        reduce_axes = 0
        param_shape = [1, -1]
    else:
        raise NotImplementedError

    mean = tf.reduce_mean(X, reduce_axes)
    variance = tf.reduce_mean(tf.square(X - mean), reduce_axes)
    normalized = (X - mean) / tf.sqrt(variance + eps)

    if g is not None and b is not None:
        # Reshape so the parameters broadcast along the last axis.
        scale = tf.reshape(g, param_shape)
        shift = tf.reshape(b, param_shape)
        normalized = normalized * scale + shift

    return normalized

def lrelu(X, leak=0.2):
    """Leaky ReLU activation used by this model.

    Shaped like ReLU, except that the negative half-axis keeps a slope of
    ``leak`` instead of being flat. Written as a sum of a linear and an
    absolute-value term: for X >= 0 the result is X, for X < 0 it is
    ``leak * X``.
    """
    linear_coeff = (1 + leak) * 0.5
    abs_coeff = (1 - leak) * 0.5
    return linear_coeff * X + abs_coeff * tf.abs(X)

def bce(o, t):
    """Element-wise binary cross-entropy between output ``o`` and target ``t``."""
    # Clip the output into [1e-7, 1 - 1e-7] so neither logarithm sees zero.
    clipped = tf.clip_by_value(o, 1e-7, 1. - 1e-7)
    positive_term = t * tf.log(clipped)
    negative_term = (1. - t) * tf.log(1. - clipped)
    return -(positive_term + negative_term)

class DCGAN(object):
    """Label-conditioned DCGAN for 28x28 images.

    Both networks receive the one-hot label ``Y`` and it is concatenated to
    the input of every layer: as extra feature columns for fully connected
    layers and as constant feature maps for (de)convolution layers.

    All weights are drawn from a normal distribution with stddev 0.02.
    Variable names start with ``gen`` or ``discrim`` so that a training
    script can select each network's parameters by name prefix.
    """

    def __init__(
            self,
            batch_size=100,
            image_shape=(28, 28, 1),
            dim_z=100,  # dimension of the noise vector z
            dim_y=10,  # number of classes (digits 0-9)
            dim_W1=1024,
            dim_W2=128,
            dim_W3=64,
            dim_channel=1,  # number of channels of a grayscale image
            ):
        """Store the hyper-parameters and create all weight variables.

        Args:
            batch_size: Fixed batch size of the training graph.
            image_shape: ``[height, width, channels]`` of a real image.
            dim_z: Dimension of the noise vector.
            dim_y: Number of classes of the one-hot label.
            dim_W1: Width of the fully connected hidden layer.
            dim_W2: Number of feature maps at 7x7 resolution.
            dim_W3: Number of feature maps at 14x14 resolution.
            dim_channel: Number of image channels.
        """
        self.batch_size = batch_size
        # Copy into a list: build_model concatenates it with [batch_size],
        # and a copy avoids sharing one mutable default between instances.
        self.image_shape = list(image_shape)
        self.dim_z = dim_z
        self.dim_y = dim_y

        self.dim_W1 = dim_W1
        self.dim_W2 = dim_W2
        self.dim_W3 = dim_W3
        self.dim_channel = dim_channel

        # Generator weights. The names are set so the training code can
        # collect the parameter list of each network.
        self.gen_W1 = tf.Variable(tf.random_normal([dim_z+dim_y, dim_W1], stddev=0.02), name='gen_W1')
        self.gen_W2 = tf.Variable(tf.random_normal([dim_W1+dim_y, dim_W2*7*7], stddev=0.02), name='gen_W2')
        self.gen_W3 = tf.Variable(tf.random_normal([5,5,dim_W3,dim_W2+dim_y], stddev=0.02), name='gen_W3')
        self.gen_W4 = tf.Variable(tf.random_normal([5,5,dim_channel,dim_W3+dim_y], stddev=0.02), name='gen_W4')

        # Discriminator weights.
        # dim_W3 (64) filters of size 5x5 over dim_channel+dim_y (11) input channels.
        self.discrim_W1 = tf.Variable(tf.random_normal([5,5,dim_channel+dim_y,dim_W3], stddev=0.02), name='discrim_W1')
        # dim_W2 (128) filters of size 5x5 over dim_W3+dim_y (74) input channels.
        self.discrim_W2 = tf.Variable(tf.random_normal([5,5,dim_W3+dim_y,dim_W2], stddev=0.02), name='discrim_W2')
        self.discrim_W3 = tf.Variable(tf.random_normal([dim_W2*7*7+dim_y,dim_W1], stddev=0.02), name='discrim_W3')
        self.discrim_W4 = tf.Variable(tf.random_normal([dim_W1+dim_y,1], stddev=0.02), name='discrim_W4')

    def build_model(self):
        """Build the training graph.

        Returns:
            Tuple ``(Z, Y, image_real, discrim_cost, gen_cost, p_real,
            p_gen)``: the noise, label and real-image placeholders, the two
            scalar losses, and the discriminator outputs for the real and
            the generated batch.
        """
        Z = tf.placeholder(tf.float32, [self.batch_size, self.dim_z])
        Y = tf.placeholder(tf.float32, [self.batch_size, self.dim_y])

        image_real = tf.placeholder(tf.float32, [self.batch_size]+self.image_shape)
        image_gen = self.generate(Z,Y)

        p_real = self.discriminate(image_real, Y)  # probability for real images
        p_gen = self.discriminate(image_gen, Y)  # probability for generated images

        # The discriminator wants p_real close to 1 and p_gen close to 0.
        discrim_cost_real = bce(p_real, tf.ones_like(p_real))
        discrim_cost_gen = bce(p_gen, tf.zeros_like(p_gen))
        discrim_cost = tf.reduce_mean(discrim_cost_real) + tf.reduce_mean(discrim_cost_gen)

        # The generator wants p_gen close to 1.
        gen_cost = tf.reduce_mean(bce( p_gen, tf.ones_like(p_gen) ))

        return Z, Y, image_real, discrim_cost, gen_cost, p_real, p_gen

    def discriminate(self, image, Y):
        """Return the probability that each image is real, given its label.

        Shapes below use B for ``batch_size`` and the default dimensions.

        Args:
            image: Image batch of shape (B, 28, 28, dim_channel).
            Y: One-hot labels of shape (B, dim_y).

        Returns:
            Tensor of shape (B, 1) with values in (0, 1).
        """
        yb = tf.reshape(Y, [self.batch_size, 1, 1, self.dim_y])  # (B, 1, 1, 10)
        X = tf.concat([image, yb*tf.ones([self.batch_size, 28, 28, self.dim_y])],3)  # (B, 28, 28, 11)

        # Layer 1: strided convolution giving 64 feature maps of 14x14.
        h1 = lrelu( tf.nn.conv2d( X, self.discrim_W1, strides=[1,2,2,1], padding='SAME' ))  # (B, 14, 14, 64)
        h1 = tf.concat([h1, yb*tf.ones([self.batch_size, 14, 14, self.dim_y])],3)  # (B, 14, 14, 74)

        # Layer 2: strided convolution giving 128 feature maps of 7x7.
        h2 = lrelu( batchnormalize( tf.nn.conv2d( h1, self.discrim_W2, strides=[1,2,2,1], padding='SAME')) )  # (B, 7, 7, 128)
        h2 = tf.reshape(h2, [self.batch_size, -1])  # (B, 6272)
        h2 = tf.concat([h2, Y],1)  # (B, 6282)

        # Layer 3: fully connected layer with 1024 output units.
        h3 = lrelu( batchnormalize( tf.matmul(h2, self.discrim_W3 ) ))  # (B, 1024)
        h3 = tf.concat([h3, Y],1)  # (B, 1034)

        # Layer 4: project onto one logit with discrim_W4. Without this
        # projection the sigmoid would be taken over all 1034 hidden
        # columns, raw label entries included, and discrim_W4 would never
        # be used.
        y = tf.nn.sigmoid(tf.matmul(h3, self.discrim_W4))  # (B, 1)
        return y

    def _generator_net(self, Z, Y, batch_size):
        """Map noise Z and labels Y to images for the given batch size."""
        yb = tf.reshape(Y, [batch_size, 1, 1, self.dim_y])  # (B, 1, 1, 10)
        zy = tf.concat([Z,Y],1)  # (B, 110)

        # Layer 1: fully connected.
        h1 = tf.nn.relu(batchnormalize(tf.matmul(zy, self.gen_W1)))  # (B, 1024)
        h1 = tf.concat([h1, Y],1)  # (B, 1034)

        # Layer 2: fully connected, reshaped into 7x7 feature maps.
        h2 = tf.nn.relu(batchnormalize(tf.matmul(h1, self.gen_W2)))  # (B, 6272)
        h2 = tf.reshape(h2, [batch_size,7,7,self.dim_W2])  # (B, 7, 7, 128)
        h2 = tf.concat([h2, yb*tf.ones([batch_size, 7, 7, self.dim_y])],3)  # (B, 7, 7, 138)

        # Layer 3: transposed convolution up to 14x14.
        output_shape_l3 = [batch_size,14,14,self.dim_W3]
        h3 = tf.nn.conv2d_transpose(h2, self.gen_W3, output_shape=output_shape_l3, strides=[1,2,2,1])  # (B, 14, 14, 64)
        h3 = tf.nn.relu( batchnormalize(h3) )
        h3 = tf.concat([h3, yb*tf.ones([batch_size, 14,14,self.dim_y])],3 )  # (B, 14, 14, 74)

        # Layer 4: transposed convolution up to the 28x28 image.
        output_shape_l4 = [batch_size,28,28,self.dim_channel]
        h4 = tf.nn.conv2d_transpose(h3, self.gen_W4, output_shape=output_shape_l4, strides=[1,2,2,1])  # (B, 28, 28, 1)
        return tf.nn.sigmoid(h4)

    def generate(self, Z, Y):
        """Generate images for the training graph.

        Mirrors ``discriminate`` in reverse: two fully connected layers
        followed by two transposed convolutions.

        Args:
            Z: Noise of shape (batch_size, dim_z).
            Y: One-hot labels of shape (batch_size, dim_y).

        Returns:
            Images of shape (batch_size, 28, 28, dim_channel) in (0, 1).
        """
        return self._generator_net(Z, Y, self.batch_size)

    def samples_generator(self, batch_size):
        """Build a second generator graph with its own batch size.

        It shares the generator weights with ``generate`` but creates new
        placeholders, so that a batch of a different size (e.g. one that
        fills a sample grid image) can be generated.

        Returns:
            Tuple ``(Z, Y, x)``: the noise placeholder, the label
            placeholder and the generated image tensor.
        """
        Z = tf.placeholder(tf.float32, [batch_size, self.dim_z])
        Y = tf.placeholder(tf.float32, [batch_size, self.dim_y])
        x = self._generator_net(Z, Y, batch_size)
        return Z,Y,x

train.py文件用于训练模型并生成样例图像

# !/usr/bin/python
#-*- coding: utf-8 -*-
import ipdb
import os
import pandas as pd
import numpy as np
from model import *
from util import *
from load import mnist_with_valid_set

# Hyper-parameters.
n_epochs = 100
learning_rate = 0.0002
batch_size = 128
image_shape = [28,28,1]
dim_z = 100
dim_W1 = 1024
dim_W2 = 128
dim_W3 = 64
dim_channel = 1

# 196 generated samples are tiled into one 392x392 preview image.
visualize_dim=196

# trX/trY: training set, vaX/vaY: validation set, teX/teY: test set.
trX, vaX, teX, trY, vaY, teY = mnist_with_valid_set()

dcgan_model = DCGAN(
        batch_size=batch_size,
        image_shape=image_shape,
        dim_z=dim_z,
        dim_W1=dim_W1,
        dim_W2=dim_W2,
        dim_W3=dim_W3,
        dim_channel=dim_channel,
        )

# Z_tf: noise input            Y_tf: one-hot class labels (digits 0-9)
# image_tf: batch of real training images
# d_cost_tf / g_cost_tf: discriminator / generator objective
# p_real / p_gen: discriminator output for real / generated images
Z_tf, Y_tf, image_tf, d_cost_tf, g_cost_tf, p_real, p_gen = dcgan_model.build_model()

sess = tf.InteractiveSession()
saver = tf.train.Saver(max_to_keep=10)

# Split the trainable variables by name prefix. Real lists are built
# because var_list must be a list or tuple, which filter() does not
# return on every Python version.
discrim_vars = [v for v in tf.trainable_variables() if v.name.startswith('discrim')]
gen_vars = [v for v in tf.trainable_variables() if v.name.startswith('gen')]

# Each network has its own Adam optimizer that updates only its own variables.
train_op_discrim = tf.train.AdamOptimizer(learning_rate, beta1=0.5).minimize(d_cost_tf, var_list=discrim_vars)
train_op_gen = tf.train.AdamOptimizer(learning_rate, beta1=0.5).minimize(g_cost_tf, var_list=gen_vars)

Z_tf_sample, Y_tf_sample, image_tf_sample = dcgan_model.samples_generator(batch_size=visualize_dim)

# Replaces the deprecated tf.initialize_all_variables().
tf.global_variables_initializer().run()

# Fixed noise and labels, reused for every preview image.
Z_np_sample = np.random.uniform(-1, 1, size=(visualize_dim,dim_z))  # 196x100, uniform in [-1, 1)
# Pass the class count explicitly so the matrix always has dim_y columns,
# even if the highest label happens not to be drawn.
Y_np_sample = OneHot(np.random.randint(dcgan_model.dim_y, size=[visualize_dim]), n=dcgan_model.dim_y)
iterations = 0
k = 2  # the generator is updated when iterations % k != 0, else the discriminator

# Preview images are written to ./vis; create it so saving cannot fail on
# a missing directory.
if not os.path.isdir('./vis'):
    os.makedirs('./vis')

for epoch in range(n_epochs):
    # Reshuffle the training set at the start of every epoch.
    index = np.random.permutation(len(trY))
    trX = trX[index]
    trY = trY[index]

    # Only full batches are used because the placeholders have a fixed
    # batch dimension. The "+ 1" keeps the final batch when the set size
    # is an exact multiple of batch_size.
    for start in range(0, len(trY) - batch_size + 1, batch_size):
        end = start + batch_size

        Xs = trX[start:end].reshape( [-1, 28, 28, 1]) / 255.  # scale pixels to [0, 1]
        # Fix the class count: a batch that lacks the highest label would
        # otherwise yield fewer columns than the Y placeholder expects.
        Ys = OneHot(trY[start:end], n=dcgan_model.dim_y)
        Zs = np.random.uniform(-1, 1, size=[batch_size, dim_z]).astype(np.float32)

        gen_feed = {Z_tf: Zs, Y_tf: Ys}
        full_feed = {Z_tf: Zs, Y_tf: Ys, image_tf: Xs}

        # Alternate between the two networks: the discriminator is
        # updated when iterations % k == 0, the generator otherwise.
        if np.mod( iterations, k ) != 0:
            # Update the generator, then evaluate the discriminator
            # objective and probabilities without updating it.
            _, gen_loss_val = sess.run([train_op_gen, g_cost_tf], feed_dict=gen_feed)
            discrim_loss_val, p_real_val, p_gen_val = sess.run([d_cost_tf,p_real,p_gen], feed_dict=full_feed)
            print("=========== updating G ==========")
        else:
            # Update the discriminator, then evaluate the generator
            # objective and probabilities without updating it.
            _, discrim_loss_val = sess.run([train_op_discrim, d_cost_tf], feed_dict=full_feed)
            gen_loss_val, p_real_val, p_gen_val = sess.run([g_cost_tf, p_real, p_gen], feed_dict=full_feed)
            print("=========== updating D ==========")

        # A single %-formatted argument prints the same text whether
        # print is a statement or a function.
        print("iteration: %s" % (iterations,))
        print("gen loss: %s" % (gen_loss_val,))
        print("discrim loss: %s" % (discrim_loss_val,))
        print("Average P(real)= %s" % (p_real_val.mean(),))
        print("Average P(gen)= %s" % (p_gen_val.mean(),))

        # Every 200 iterations save a preview image. The sample graph
        # shares the generator weights, so it reflects the training done
        # so far.
        if np.mod(iterations, 200) == 0:
            generated_samples = sess.run(
                    image_tf_sample,
                    feed_dict={
                        Z_tf_sample:Z_np_sample,
                        Y_tf_sample:Y_np_sample
                        })
            # The generator ends in a sigmoid, so the samples are already
            # in [0, 1]; no rescaling from [-1, 1] is applied.
            save_visualization(generated_samples, (14,14), save_path='./vis/sample_'+str(iterations//200)+'.jpg')

        iterations += 1

对于此项目，我还存在一些疑惑，比如说网络结构中Y（标签）的数量10为什么总被用于改变网络的结构：

h2 = tf.reshape(h2, [self.batch_size, 7, 7, self.dim_W2])  # shape=(128, 7, 7, 128)

h2 = tf.concat([h2, yb * tf.ones([self.batch_size, 7, 7, self.dim_y])], 3)  # shape=(128, 7, 7, 138)

在这里，h2增加了10个通道。它的作用是什么呢？

用DCGAN生成手写体数字图像代码解析

猜你喜欢