Solving the XOR (exclusive-or) Problem with a Neural Network

Copyright notice: This article is the author's original work; reproduction without permission is prohibited. https://blog.csdn.net/qq_27261889/article/details/85014746

Introduction

Getting a neural network to solve the XOR problem was an important early milestone for neural networks.
First, we need to know what the XOR problem is; I will only sketch it briefly here.
The XOR problem has 4 input samples and 1 output. Each input is two-dimensional, with each dimension either 0 or 1; the output is one-dimensional, either 0 or 1.
When the input is (0, 0) or (1, 1) the output is 0; when the input is (1, 0) or (0, 1) the output is 1.
If you plot the four points in the Cartesian plane, you can see that no linear classifier can separate the two classes.
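To make the "not linearly separable" claim concrete, here is a minimal sketch (my own illustration, not part of the original post) that trains a plain perceptron on the four XOR points. Because no line separates the two classes, its accuracy never reaches 4/4 no matter how long it runs.

import numpy as np

# A single linear unit (perceptron) cannot fit XOR: accuracy stays below 1.0.
X = np.array([[0, 0], [0, 1], [1, 1], [1, 0]])
y = np.array([0, 1, 0, 1])

w = np.zeros(2)
b = 0.0
for epoch in range(1000):
    for xi, yi in zip(X, y):
        pred = 1 if xi.dot(w) + b > 0 else 0
        # classic perceptron update rule
        w += (yi - pred) * xi
        b += (yi - pred)

pred = (X.dot(w) + b > 0).astype(int)
print('perceptron accuracy on XOR:', np.mean(pred == y))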

Goal

The goal of this article is to build a small neural network framework and use it to train the parameters.
The framework must be configurable: the number of hidden layers, the learning rate, and so on can all be adjusted.
After training we take the learned parameters and use them for prediction.
The predictions are drawn as a 3D plot to show the classification result.
"3D" here means that although each dimension of the training inputs is either 0 or 1, each dimension of a test point may be any fraction between 0 and 1.
For example, the test point (0.2, 0.2) is closest to (0, 0), so we regard it as approximately (0, 0) and expect it to get the same output as (0, 0).
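As a quick illustration of this nearest-corner intuition (my own snippet, not from the original post): rounding a fractional test point gives the corner whose XOR value we expect the trained network to predict.

import numpy as np

# Illustration only: the expected label for a fractional point is the XOR of
# the nearest corner's coordinates.
point = np.array([0.2, 0.2])
corner = np.round(point).astype(int)        # -> array([0, 0])
expected = int(corner[0]) ^ int(corner[1])  # XOR of the corner -> 0
print(corner, expected)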

Notes

I treat this as a classification problem, so the final layer uses a Softmax classifier (a softmax cross-entropy loss).
The activation function is the plain sigmoid.
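The backward pass in the code below relies on the identity that the sigmoid's derivative can be written in terms of its own output, sigmoid'(x) = out * (1 - out). A small check of this identity against a finite difference (my own sketch, not part of the original post):

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

x = 0.7
out = sigmoid(x)
analytic = out * (1 - out)                                # derivative via the output
numeric = (sigmoid(x + 1e-5) - sigmoid(x - 1e-5)) / 2e-5  # finite difference
print(analytic, numeric)                                  # should agree closely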

Code

The code is split into two parts: one builds the framework, and the other feeds in the data and runs training.
The code can also be found by searching in CSDN Downloads.

Framework code:
(adapted largely from the cs231n course assignments; the training script below imports this file as layers)

# coding=utf-8

import numpy as np


def basic_forward(x, w, b):
    # Affine (fully connected) forward pass: out = x.W + b
    x = x.reshape(x.shape[0], -1)
    out = np.dot(x, w) + b
    cache = (x, w, b)

    return out, cache


def basic_backward(dout, cache):
    # Affine backward pass: dx = dout.W^T, dW = x^T.dout, db sums dout over the batch
    x, w, b = cache
    dx = np.dot(dout, w.T)
    # dx = np.reshape(dx, x.shape)
    # x = x.reshape(x.shape[0], -1)
    dw = np.dot(x.T, dout)
    db = np.reshape(np.sum(dout, axis=0), b.shape)

    return dx, dw, db


def sigmoid_forward(x):
    x = x.reshape(x.shape[0], -1)
    out = 1 / (1 + np.exp(-1 * x))
    cache = out

    return out, cache


def sigmoid_backward(dout, cache):
    out = cache
    # sigmoid gradient expressed via the cached output: out * (1 - out)
    dx = out * (1 - out)
    dx *= dout

    return dx


def basic_sigmoid_forward(x, w, b):
    basic_out, basic_cache = basic_forward(x, w, b)
    sigmoid_out, sigmoid_cache = sigmoid_forward(basic_out)
    cache = (basic_cache, sigmoid_cache)

    return sigmoid_out, cache


def basic_sigmoid_backward(dout, cache):
    basic_cache, sigmoid_cache = cache
    dx_sigmoid = sigmoid_backward(dout, sigmoid_cache)
    dx, dw, db = basic_backward(dx_sigmoid, basic_cache)

    return dx, dw, db


def softmax_loss(x, y):
    # Numerically stable softmax + cross-entropy loss; also returns dL/dx (the scores gradient)
    shifted_logits = x - np.max(x, axis=1, keepdims=True)
    Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True)
    log_probs = shifted_logits - np.log(Z)
    probs = np.exp(log_probs)
    N = x.shape[0]
    loss = -np.sum(log_probs[np.arange(N), y]) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    # print(x.shape)
    # print(y.shape)
    # print(dx.shape)
    return loss, dx


class multi_layer_net(object):
    # Fully connected net: [affine -> sigmoid] x num_layers, with a softmax loss on top
    def __init__(self, hidden_dim, input_dim=2, num_classes=2, dtype=np.float32, seed=None, reg=0.0):
        self.num_layers = 1 + len(hidden_dim)
        self.dtype = dtype
        self.reg = reg
        self.params = {}

        # init all parameters
        layers_dims = [input_dim] + hidden_dim + [num_classes]

        for i in range(self.num_layers):
            self.params['W' + str(i + 1)] = np.random.randn(layers_dims[i], layers_dims[i + 1])
            self.params['b' + str(i + 1)] = np.zeros((1, layers_dims[i + 1]))


    def loss(self, X, y=None):
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # compute the forward data and cache
        basic_sigmoid_cache = {}

        layer_input = X

        for lay in range(self.num_layers):
            layer_input, basic_sigmoid_cache[lay] = basic_sigmoid_forward(layer_input,
                                                                          self.params['W' + str(lay + 1)],
                                                                          self.params['b' + str(lay + 1)])

        score = layer_input
        # print(score.shape)

        if mode == 'test':
            return score

        # compute the gradient
        loss, dscore = softmax_loss(score, y)
        dx = dscore
        grads = {}

        for index in range(self.num_layers):
            lay = self.num_layers - index - 1
            # L2 regularization: add 0.5 * reg * sum(W^2) for this layer's weights
            loss += 0.5 * self.reg * np.sum(self.params['W' + str(lay + 1)] ** 2)
            dx, dw, db = basic_sigmoid_backward(dx, basic_sigmoid_cache[lay])

            grads['W' + str(lay + 1)] = dw + self.reg * self.params['W' + str(lay + 1)]
            grads['b' + str(lay + 1)] = db

        return loss, grads


def sgd_momentum(w, dw, config=None):
    # SGD with momentum: v = momentum * v - learning_rate * dw; w = w + v
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)

    v = config.get('velocity', np.zeros_like(w))
    v = config['momentum'] * v - config['learning_rate'] * dw
    next_w = w + v

    config['velocity'] = v

    return next_w, config


class Solver(object):

    def __init__(self, model, data, **kwargs):
        self.model = model
        self.X_train = data['X_train']
        self.y_train = data['y_train']
        self.X_val = data['X_val']
        self.y_val = data['y_val']

        self.update_rule = kwargs.pop('update_rule', 'sgd_momentum')
        self.optim_config = kwargs.pop('optim_config', {})
        self.lr_decay = kwargs.pop('lr_decay', 1.0)
        self.batch_size = kwargs.pop('batch_size', 100)
        self.num_epochs = kwargs.pop('num_epochs', 10)

        self.print_every = kwargs.pop('print_every', 10)
        self.verbose = kwargs.pop('verbose', True)

        if len(kwargs) > 0:
            extra = ', '.join('"%s"' % k for k in kwargs.keys())
            raise ValueError('Unrecognized arguments %s' % extra)

        # if not hasattr(optim, self.update_rule):
        #     raise ValueError('Invalid update_rule "%s"' % self.update_rule)
        # self.update_rule = getattr(optim, self.update_rule)

        self._reset()

    def _reset(self):
        """
        Set up some book-keeping variables for optimization. Don't call this
        manually.
        """
        # Set up some variables for book-keeping
        self.epoch = 0
        self.best_val_acc = 0
        self.best_params = {}
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

        self.optim_configs = {}
        for p in self.model.params:
            d = {k: v for k, v in self.optim_config.items()}
            self.optim_configs[p] = d

    def _step(self):
        num_train = self.X_train.shape[0]
        batch_mask = np.random.choice(num_train, self.batch_size)
        X_batch = self.X_train[batch_mask]
        y_batch = self.y_train[batch_mask]

        loss, grads = self.model.loss(X_batch, y_batch)
        self.loss_history.append(loss)

        for p, w in self.model.params.items():
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = sgd_momentum(w, dw, config)
            self.model.params[p] = next_w
            self.optim_configs[p] = next_config

    def check_accuracy(self, X, y, num_samples=None, batch_size=100):
        N = X.shape[0]
        if num_samples is not None and N > num_samples:
            mask = np.random.choice(N, num_samples)
            N = num_samples
            X = X[mask]
            y = y[mask]

        num_batches = N // batch_size
        if N % batch_size != 0:
            num_batches += 1

        y_pred = []

        for i in range(int(num_batches)):
            start = i * batch_size
            end = (i + 1) * batch_size
            scores = self.model.loss(X[start:end])
            y_pred.append(np.argmax(scores, axis=1))
        y_pred = np.hstack(y_pred)
        acc = np.mean(y_pred == y)

        return acc


    def train(self):
        num_train = self.X_train.shape[0]
        iterations_per_epoch = max(num_train // self.batch_size, 1)
        num_iterations = self.num_epochs * iterations_per_epoch

        for t in range(int(num_iterations)):
            self._step()

            if self.verbose and (t + 1) % self.print_every == 0:
                print('Iteration {:d} / {:d}, loss: {:f}'.format(t + 1, num_iterations, self.loss_history[-1]))

            epoch_end = (t + 1) % iterations_per_epoch == 0
            if epoch_end:
                self.epoch += 1
                for k in self.optim_configs:
                    self.optim_configs[k]['learning_rate'] *= self.lr_decay

            first_it = (t == 0)
            last_it = (t == num_iterations - 1)
            if first_it or last_it or epoch_end:
                train_acc = self.check_accuracy(self.X_train, self.y_train, num_samples=10)
                val_acc = self.check_accuracy(self.X_val, self.y_val)

                self.train_acc_history.append(train_acc)
                self.val_acc_history.append(val_acc)

                if self.verbose:
                    print('Epoch {:d} / {:d}, train_acc: {:f}, val_acc: {:f}'.format(self.epoch, self.num_epochs, train_acc, val_acc))

                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    self.best_params = {}
                    for k,v in self.model.params.items():
                        self.best_params[k] = v.copy()

        self.model.params = self.best_params

Training and test code (it assumes the framework above is saved as layers.py):

import layers
import numpy as np
import matplotlib.pyplot as plt

small_data = {
  'X_train': np.array([[0, 0], [0, 1], [1, 1], [1, 0]]),
  'y_train': np.array([0, 1, 0, 1]),
  'X_val': np.array([[0, 0], [0, 1], [1, 1], [1, 0]]),
  'y_val': np.array([0, 1, 0, 1]),
}

learning_rate = 0.2
reg = 0.0
model = layers.multi_layer_net(hidden_dim=[2,2], input_dim=2, num_classes=2, reg=reg, dtype=np.float64)
solver = layers.Solver(model, small_data,
                       print_every=1, num_epochs=5000, batch_size=4,
                       update_rule='sgd_momentum',
                       optim_config={'learning_rate': learning_rate})
solver.train()
print(model.params)
best_model = model

# plt.plot(solver.loss_history, 'o')
# plt.title('Training loss history')
# plt.xlabel('Iteration')
# plt.ylabel('Training loss')
# plt.show()


# x_ = [x_1, x_2]
# x_ = np.array(x_)
# x_ = x_.T
# print(x_.shape)
# # print(x_[20])
# test_pred = np.argmax(best_model.loss(x_), axis=1)
# print(test_pred)
x_1 = np.arange(0, 1, 0.01)
x_2 = np.arange(0, 1, 0.01)
x_test = np.zeros((len(x_1)*len(x_2), 2))
print(x_test.shape)
index = 0
for i in range(len(x_1)):
    for j in range(len(x_2)):
        x_test[int(index), 0] = x_1[int(i)]
        x_test[int(index), 1] = x_2[int(j)]
        index += 1
print(x_test[0])
print(x_test[903])
print(x_test[5203])

test_pred = np.argmax(best_model.loss(x_test), axis=1)
print(test_pred)

from mpl_toolkits.mplot3d import Axes3D

x_1, x_2 = np.meshgrid(x_1, x_2)
figure = plt.figure()
ax = figure.add_subplot(projection='3d')
test_pred = test_pred.reshape(len(x_1), len(x_2))
ax.plot_surface(x_1, x_2, test_pred, rstride=1, cstride=1, cmap='rainbow')
plt.show()

Prediction result plot

Some additional notes
1. The code does not always reach 100% accuracy.
2. If the number of iterations is too small, the accuracy will be low; it usually takes at least 3000 epochs to reach 100% accuracy.
3. Some of the hyperparameters also affect the accuracy (see the sketch below).
4. This experiment is a useful exercise for understanding basic neural networks.
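As a rough illustration of point 3 (my own sketch, not part of the original post), one way to probe the effect of the hidden-layer sizes and the learning rate is to re-run training over a small grid and compare the final validation accuracy; the names below reuse layers, small_data and the Solver class defined above.

# Hypothetical sweep over hidden-layer sizes and learning rates (illustration only).
for hidden_dim in ([2, 2], [4], [8, 8]):
    for lr in (0.05, 0.2, 0.5):
        m = layers.multi_layer_net(hidden_dim=list(hidden_dim), input_dim=2,
                                   num_classes=2, reg=0.0, dtype=np.float64)
        s = layers.Solver(m, small_data, num_epochs=3000, batch_size=4,
                          update_rule='sgd_momentum', verbose=False,
                          optim_config={'learning_rate': lr})
        s.train()
        acc = s.check_accuracy(small_data['X_val'], small_data['y_val'])
        print('hidden_dim={}, lr={}, val_acc={}'.format(hidden_dim, lr, acc))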
