基于softmax的多分类模型

对于多分类问题，可以使用softmax来做，但是效果不是那么好，当做一个算法的练手吧

首先是数据集处理的代码：

文件名：data_loader.py

# coding: utf-8

import sys
from collections import Counter
import pdb
import numpy as np
import tensorflow.contrib.keras as kr

# Detect the interpreter's major version once, at import time. Under
# Python 2 the process-wide default string encoding is switched to UTF-8
# (reload(sys) is needed there to get setdefaultencoding back).
is_py3 = sys.version_info[0] > 2
if not is_py3:
    reload(sys)
    sys.setdefaultencoding("utf-8")


def native_word(word, encoding='utf-8'):
    """Return *word* in the running interpreter's native string type.

    Under Python 2 the word is encoded with *encoding* (useful when a model
    trained under Python 3 is consumed from Python 2); under Python 3 it is
    returned unchanged.
    """
    return word if is_py3 else word.encode(encoding)


def native_content(content):
    """Return *content* as text.

    Under Python 2 the value is decoded from UTF-8; under Python 3 it is
    returned unchanged.
    """
    if is_py3:
        return content
    return content.decode('utf-8')


def open_file(filename, mode='r'):
    """Open *filename* in a way that works on both Python 2 and Python 3.

    Args:
        filename: path of the file to open.
        mode: mode string handed to ``open`` ('r' to read, 'w' to write).

    Returns:
        The opened file object. Under Python 3 it is opened with
        ``encoding='utf-8'`` and ``errors='ignore'``; under Python 2 it is
        the plain result of ``open(filename, mode)``.
    """
    if not is_py3:
        return open(filename, mode)
    return open(filename, mode, encoding='utf-8', errors='ignore')


def read_file(filename):
    """Read a tab-separated ``label<TAB>content`` file.

    Each line is stripped and split on tabs. Lines that do not split into
    exactly two fields are skipped, as are lines whose content is empty.

    Args:
        filename: path of the data file.

    Returns:
        A ``(contents, labels)`` tuple of equally long lists: ``contents[i]``
        is the list of characters of sample ``i`` and ``labels[i]`` is its
        label string.
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if not content:
                    continue
                # Convert both fields before appending either one, so a
                # failure can never leave the two lists misaligned.
                chars = list(native_content(content))
                label = native_content(label)
            except ValueError:
                # Wrong number of fields, or (Python 2) bytes that are not
                # valid UTF-8 -- UnicodeDecodeError is a ValueError subclass.
                continue
            contents.append(chars)
            labels.append(label)
    return contents, labels

# Build the character-level vocabulary and store it on disk so that it does
# not have to be recomputed on every run.
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from a data file and write it to disk.

    Args:
        train_dir: path of the ``label<TAB>content`` data file to count
            characters in (a file path, despite the name).
        vocab_dir: path of the vocabulary file to write, one entry per line.
        vocab_size: total number of entries including ``<PAD>``; the
            ``vocab_size - 1`` most frequent characters are kept.
    """
    data_train, _ = read_file(train_dir)

    # Count how often each character occurs across all documents,
    # e.g. Counter({'c': 3, 'a': 1, 'b': 1}).
    counter = Counter()
    for content in data_train:
        counter.update(content)

    # most_common returns [(char, count), ...]; keep only the characters.
    # A comprehension (rather than zip(*pairs) unpacking) also works when
    # the corpus is empty or vocab_size is 1.
    words = [word for word, _ in counter.most_common(vocab_size - 1)]
    # <PAD> takes id 0 and is used to pad all texts to the same length.
    words = ['<PAD>'] + words
    # The context manager guarantees the vocabulary is flushed and closed.
    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')

# Load the vocabulary stored by build_vocab and derive a {word: id} mapping.
def read_vocab(vocab_dir):
    """Read the vocabulary file.

    Args:
        vocab_dir: path of the vocabulary file, one entry per line.

    Returns:
        A ``(words, word_to_id)`` tuple: the list of stripped entries in file
        order, and a dict mapping each entry to its position in that list.
    """
    words = []
    with open_file(vocab_dir) as fp:
        for raw in fp.readlines():
            # Under Python 2 every entry is converted to unicode.
            words.append(native_content(raw.strip()))
    word_to_id = {word: idx for idx, word in enumerate(words)}
    return words, word_to_id

# Fix the set of categories and derive a {category: id} mapping.
def read_category(file):
    """Collect the distinct labels of a data file and number them.

    The label is the first tab-separated field of each stripped line; lines
    with an empty label (e.g. blank lines) are ignored.

    Args:
        file: path of the ``label<TAB>content`` data file.

    Returns:
        A ``(categories, cat_to_id)`` tuple: the sorted list of distinct
        labels, and a dict mapping each label to its index in that list.
    """
    labels = set()
    # open_file reads with the same encoding handling as read_file, so the
    # labels collected here compare equal to the ones read_file returns.
    with open_file(file) as f:
        for line in f:
            label = line.strip().split('\t')[0]
            if label:
                labels.add(native_content(label))

    # Sort so that ids are identical on every run; plain set iteration order
    # is not stable across interpreter runs.
    categories = sorted(labels)
    cat_to_id = {cat: idx for idx, cat in enumerate(categories)}

    return categories, cat_to_id

# Turn one id-encoded sample back into text.
def to_words(content, words):
    """Map every id in *content* to ``words[id]`` and concatenate the results."""
    pieces = [words[idx] for idx in content]
    return ''.join(pieces)

# Convert a text data set into fixed-length id sequences.
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a data file into padded id sequences and one-hot labels.

    Args:
        filename: path of the ``label<TAB>content`` data file.
        word_to_id: dict mapping a character to its vocabulary id.
        cat_to_id: dict mapping a label to its category id.
        max_length: length every id sequence is padded/truncated to.

    Returns:
        ``(x_pad, y_pad)``: the output of ``pad_sequences`` for the documents
        and the output of ``to_categorical`` for their labels.
    """
    contents, labels = read_file(filename)

    data_id = []   # one list of vocabulary ids per document
    label_id = []  # the category id of the document at the same index
    for i, chars in enumerate(contents):
        # Characters missing from the vocabulary are dropped.
        data_id.append([word_to_id[ch] for ch in chars if ch in word_to_id])
        label_id.append(cat_to_id[labels[i]])

    # Documents differ in length, so their id lists do too; pad_sequences
    # brings them all to max_length (by default Keras pads with 0 at the
    # front and truncates from the front).
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    # One-hot encode the labels. (parse_vector builds a similar matrix with
    # 1e-8 in place of the zeros.)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))

    return x_pad, y_pad

def parse_vector(label_id, size):
    """Build one row per label: 1 at the label's index, 1e-8 everywhere else.

    Args:
        label_id: iterable of integer class indices.
        size: length of each row (number of classes).

    Returns:
        ``np.array`` built from the list of rows.
    """
    def _row(label):
        # Near-one-hot vector: 1e-8 instead of exact zeros.
        vec = np.full(size, 1e-8)
        vec[label] = 1
        return vec

    return np.array([_row(label) for label in label_id])

# Prepare shuffled mini-batches for training.
# x holds the id vectors of all documents and y the matching label vectors;
# both must support indexing with an integer array (e.g. np.ndarray).
# Note: batch_iter is a generator function.
def batch_iter(x, y, batch_size=64):
    """Yield shuffled ``(x_batch, y_batch)`` mini-batches covering the data once.

    Args:
        x: array of samples, indexed along its first axis.
        y: array of labels aligned with *x*.
        batch_size: maximum number of samples per batch; the last batch may
            be smaller.

    Yields:
        Tuples ``(x_batch, y_batch)`` taken from the same shuffled positions
        of *x* and *y*. Nothing is yielded when *x* is empty.
    """
    data_len = len(x)
    if data_len == 0:
        # No data, no batches (avoids handing an empty batch to the model).
        return

    # Random permutation of the indices 0 .. data_len - 1, e.g. [3, 1, 0, 2].
    indices = np.random.permutation(np.arange(data_len))
    # Index both arrays with the same permutation so samples stay paired
    # with their labels.
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    # Stepping by batch_size gives ceil(data_len / batch_size) batches using
    # integer arithmetic only; slicing clamps the final, shorter batch.
    for start_id in range(0, data_len, batch_size):
        end_id = start_id + batch_size
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

文件名：run_softmax.py

#-*- coding:utf8 -*-

import pdb
import os
import tensorflow as tf
import numpy as np
from data_loader import read_vocab, read_category, batch_iter, process_file, build_vocab

# Hyper-parameters.
max_vocab_size = 5000  # vocabulary size passed to build_vocab
seq_length = 600       # padded document length, i.e. the dimension of input x
num_epochs = 10        # number of passes over the training set
batch_size = 64        # samples per mini-batch

# Data files (one sample per line: label<TAB>content).
all_file = 'data/baike_levelone_category_all.lst'      # source of vocabulary and categories
train_file = 'data/baike_levelone_category_train.lst'  # training split
test_file = 'data/baike_levelone_category_test.lst'    # test split
vocab_file = 'data/baike_category_vocab.lst'           # cached vocabulary

def feed_data(x_batch, y_batch):
    """Return the feed dict binding the module-level placeholders ``x`` and
    ``y_`` to the given batch of inputs and labels."""
    return {x: x_batch, y_: y_batch}

# Build the vocabulary once; later runs reuse the cached file.
if not os.path.exists(vocab_file):
    build_vocab(all_file, vocab_file, max_vocab_size)
print('build vocab over')
# All categories and the id assigned to each of them.
categorys, cat_to_id = read_category(all_file)
print('read category over')
words, word_to_id = read_vocab(vocab_file)
print('read vocab over')
x_train, y_train = process_file(train_file, word_to_id, cat_to_id, seq_length)

print('process file over')
num_classes = len(cat_to_id)

# Model definition: one linear layer over the padded id sequence, followed
# by softmax.
# NOTE: tf.device only controls where ops are placed; TensorFlow may still
# reserve GPU memory when the session is created. Hide the GPUs (for example
# with CUDA_VISIBLE_DEVICES="") if the process must not touch them.
with tf.device('/cpu:0'):
    x = tf.placeholder(tf.float32, [None, seq_length], name='input_x')
    y_ = tf.placeholder(tf.float32, [None, num_classes], name='input_y')

    w = tf.Variable(tf.truncated_normal(shape=[seq_length, num_classes], mean=0, stddev=1))
    b = tf.Variable(tf.truncated_normal(shape=[num_classes], mean=0, stddev=1))

    y_mat = tf.matmul(x, w) + b  # unscaled logits
    y = tf.nn.softmax(y_mat)     # class probabilities, used for prediction only

    # Softmax cross-entropy must be computed from the raw logits: the op
    # applies softmax internally in a numerically stable way. Feeding it the
    # probabilities, or using the sigmoid (multi-label) variant, optimises
    # the wrong objective. The per-example losses are averaged to a scalar.
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_mat))
    train_step = tf.train.GradientDescentOptimizer(0.005).minimize(cost)

    # Create the evaluation ops once. Building tf.argmax inside the training
    # loop would add new nodes to the graph on every step.
    predicted = tf.argmax(y, 1)  # index of the largest probability
    actual = tf.argmax(y_, 1)    # index of the 1 in the one-hot label
    correct_prediction = tf.equal(predicted, actual)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    print('initial')
    init = tf.global_variables_initializer()
    print('session')
    sess = tf.Session()
    sess.run(init)
    print('accuracy')

    for epoch in range(num_epochs):
        print('epoch: %d' % (epoch + 1))
        batch_train = batch_iter(x_train, y_train, batch_size)

        # x_batch has shape (batch_size, seq_length); batch_size is the
        # number of documents taken per step.
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch)
            sess.run(train_step, feed_dict=feed_dict)

            # Evaluate on the same batch after the update, in a single run.
            batch_acc, batch_pred, batch_true = sess.run(
                [accuracy, predicted, actual], feed_dict=feed_dict)
            print('accuracy %s' % batch_acc)
            print('y %s' % (batch_pred,))
            print('y_ %s' % (batch_true,))

# Accuracy over the whole training set.
feed_dict = feed_data(x_train, y_train)
print('accuracy %s' % sess.run(accuracy, feed_dict=feed_dict))

说明：数据集文件的格式为每行一个样本，以制表符分隔，第一列为文本的分类，第二列为文本的内容

训练打印日志如下：

accuracy 0.578125
y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
y_ [ 5  5  6  1  5  2  5  9  5  5  1  5  7  5  9  1  1  5  1  5  1  1 12  5
  5  5  5  4  5  5  7  5  5  7  5 11  5  4  5  5  5  5  5  5  1  5  6  5
 12  5 10  5  4  5  5  5  4 11  0  5  6  5  5  5]
accuracy 0.5625
y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
y_ [ 1  5  5  5  4  4  7  8  5  5  2  5  5  5  5  4  5  1  5  1 10  4  6  1
  5 12  1  5  7  5  5  5  5 11  5  5  5 11  5  5  7  1  5  7  5  5 10  4
  5  5  5 12  5  5  5  4  5  5 12  5  5  5  1  1]
accuracy 0.59375
y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
y_ [9 1 5 1 5 7 5 5 8 5 1 5 5 5 1 5 1 5 5 1 5 5 5 4 5 5 5 1 5 5 5 5 1 6 5 5 5
 5 7 5 1 7 5 5 5 5 1 6 5 1 5 1 5 5 0 7 5 5 4 7 5 5 1 7]
accuracy 0.625
y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
y_ [11  4  5  5  5  5  5  5  5  5 11  5  5  1  5  5 11  5  5  8  5  5 11  1
  5  2  9  5  5  1  7  5  5  5  1  1  1  1  5  5  5  5  5  5  5  5  5  5
  5  5 12  1  1  1  5  8  5  5  5  5  1  1  1  5]
accuracy 0.71875
y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
y_ [ 5  6  5  5  1  5  5  8  5  5  5  5  5  5  5  5  5  5  5  1  7 12  5  5
  5  5  5  1  5  5  5  7  5  5  4  6  4  5  5  5  5  5  5  5  5  4  5  6
  5  1  5  9  5  1 11  5  5  1  5  5  5  5  5  5]
accuracy 0.59375
y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
y_ [ 5  0  5  5  5  1  0  1  5 11  5  5  5 11  5  5  5  5  5  5  9 11  1  5
  6  1  4  5  5  5  5  5  7  5  6  5  5  5 11 11  5  1  5 10  5  1  1  5
  5  5  0  1  1  5  5  5  5  9  1  5  5 11  5  5]
accuracy 1.0
y [5]
y_ [5]
accuracy 0.62927836

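可以看到在训练集上的准确率只有约63%（0.629），这个效果很差：从上面的日志可以看出，模型的预测（y）全部是类别5，而类别5正好是真实标签（y_）中出现最多的类别，所以这个准确率基本上只反映了类别5在数据中的占比，模型并没有学到区分各个类别的能力。当然效果差的原因也可能是其他方面我没有做好。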

这个项目实战的学习心得：

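1、我们的机器上当别人在使用GPU的时候，我如果要执行我的训练，会抛出显存不足（OUT_OF_MEMORY）的异常。本想通过 with tf.device('/cpu:0') 来指定让代码不使用GPU，但是这种方式不行：tf.device 只决定算子放在哪个设备上计算，TensorFlow 在创建 Session 时默认仍会尝试占用所有可见GPU的显存。要真正不使用GPU，可以在启动程序前设置环境变量 CUDA_VISIBLE_DEVICES=""，或者用 tf.ConfigProto(device_count={'GPU': 0}) 来创建 Session。我最终是将代码移到另外一台机器上执行的。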

2、梯度下降算法中学习率的选择问题

学习率不要选择太大，否则很难收敛，在学习的过程中可能总是跳过最佳点位

学习率也不要选择太小，否则，学习消耗的时间太长

3、损失函数的选择

我选择的损失函数是交叉熵，我一开始是自己写的交叉熵算法，如下：

cost = -tf.reduce_sum(y_*tf.log(y))

这导致的一个问题是，在优化的过程中，w和b的值出现[nan, nan, nan, nan]的问题。

原因应该是公式中用到了log函数：它的参数必须大于0，而当softmax输出的某个概率在浮点数下变成0时，log(0) 并不会抛异常，而是得到 -inf，再与one-hot标签中的0相乘就得到 nan（0 × -inf = nan）。nan 通过梯度传到参数上之后，w和b就全部变成了nan。

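解决办法是，用tensorflow自带的计算交叉熵的函数（它们内部做了数值稳定的处理）。tensorflow提供了四种计算交叉熵的函数，可以选择其中一种；对于每个样本只属于一个类别的多分类问题，应该选择 tf.nn.softmax_cross_entropy_with_logits（标签为one-hot向量）或 tf.nn.sparse_softmax_cross_entropy_with_logits（标签为类别id），并且传给 logits 参数的应该是没有经过softmax的原始输出（即 tf.matmul(x, w) + b），而不是softmax之后的概率。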

基于softmax的多分类模型

猜你喜欢