CNNs are widely used in image processing. By analogy with biological vision, recognizing an image usually requires attending only to local regions of the data rather than every pixel at once.
A CNN also relies on weight sharing: a small patch is sampled from a large image, a feature detector is learned from that patch, and the same detector is then applied at every position of the image. This sliding application of shared weights is the convolution operation.
Pooling summarizes a region of the image by a statistic of a particular feature, typically its mean or maximum; these summary features reduce dimensionality and tend to improve results.
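To make these two ideas concrete, here is a toy NumPy sketch (not part of the TFLearn examples below; the 3x3 vertical-edge kernel is a hypothetical choice for illustration). One shared kernel is slid over every position of an image, and the resulting feature map is then summarized by 2x2 max pooling:

import numpy as np

def conv2d(image, kernel):
    # Weight sharing: the same kernel is applied at every location.
    kh, kw = kernel.shape
    oh, ow = image.shape[0] - kh + 1, image.shape[1] - kw + 1
    out = np.zeros((oh, ow))
    for i in range(oh):
        for j in range(ow):
            out[i, j] = np.sum(image[i:i + kh, j:j + kw] * kernel)
    return out

def max_pool2d(fmap, size=2):
    # Pooling: summarize each size x size region by its maximum.
    oh, ow = fmap.shape[0] // size, fmap.shape[1] // size
    out = np.zeros((oh, ow))
    for i in range(oh):
        for j in range(ow):
            out[i, j] = fmap[i * size:(i + 1) * size, j * size:(j + 1) * size].max()
    return out

image = np.random.rand(28, 28)                  # toy 28x28 "image"
kernel = np.array([[1., 0., -1.]] * 3)          # hypothetical vertical-edge detector
print(max_pool2d(conv2d(image, kernel)).shape)  # (13, 13)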
1. Practicing with the MNIST dataset
from __future__ import division, print_function, absolute_import
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import tflearn
import tflearn.data_utils as du
# Data loading and preprocessing
import tflearn.datasets.mnist as mnist
# Load the dataset
X, Y, testX, testY = mnist.load_data(one_hot=True)
# mnist.load_data returns each image as a flattened vector; reshape it back to 28x28x1 for the convolutional layers
X = X.reshape([-1, 28, 28, 1])
testX = testX.reshape([-1, 28, 28, 1])
X, mean = du.featurewise_zero_center(X)
testX = du.featurewise_zero_center(testX, mean)
# Building the residual network (a CNN with shortcut connections)
net = tflearn.input_data(shape=[None, 28, 28, 1])
# 2-D convolution layer
net = tflearn.conv_2d(net, 64, 3, activation='relu', bias=False)
# Residual bottleneck blocks
net = tflearn.residual_bottleneck(net, 3, 16, 64)
net = tflearn.residual_bottleneck(net, 1, 32, 128, downsample=True)
net = tflearn.residual_bottleneck(net, 2, 32, 128)
net = tflearn.residual_bottleneck(net, 1, 64, 256, downsample=True)
net = tflearn.residual_bottleneck(net, 2, 64, 256)
net = tflearn.batch_normalization(net)
net = tflearn.activation(net, 'relu')
net = tflearn.global_avg_pool(net)
# Regression
net = tflearn.fully_connected(net, 10, activation='softmax')
net = tflearn.regression(net, optimizer='momentum',
loss='categorical_crossentropy',
learning_rate=0.1)
# Train, validating on the held-out test set
model = tflearn.DNN(net, checkpoint_path='model_resnet_mnist',
max_checkpoints=10, tensorboard_verbose=0)
model.fit(X, Y, n_epoch=100, validation_set=(testX, testY),
show_metric=True, batch_size=256, run_id='resnet_mnist')
---------------------------------
Run id: resnet_mnist
Log directory: /tmp/tflearn_logs/
---------------------------------
Training samples: 55000
Validation samples: 10000
--
Training Step: 215 | total loss: 0.07232 | time: 886.217s
| Momentum | epoch: 001 | loss: 0.07232 - acc: 0.9777 | val_loss: 0.09552 - val_acc: 0.9711 -- iter: 55000/55000
--
Training Step: 430 | total loss: 0.13791 | time: 930.724s
| Momentum | epoch: 002 | loss: 0.13791 - acc: 0.9728 | val_loss: 0.11432 - val_acc: 0.9663 -- iter: 55000/55000
--
Training Step: 645 | total loss: 0.13539 | time: 909.413s
| Momentum | epoch: 003 | loss: 0.13539 - acc: 0.9781 | val_loss: 0.07588 - val_acc: 0.9794 -- iter: 55000/55000
--
Training Step: 860 | total loss: 0.02486 | time: 880.677s
| Momentum | epoch: 004 | loss: 0.02486 - acc: 0.9927 | val_loss: 0.03608 - val_acc: 0.9901 -- iter: 55000/55000
--
Training Step: 1075 | total loss: 0.02277 | time: 872.714s
| Momentum | epoch: 005 | loss: 0.02277 - acc: 0.9936 | val_loss: 0.04405 - val_acc: 0.9863 -- iter: 55000/55000
--
Training Step: 1290 | total loss: 0.15620 | time: 878.157s
| Momentum | epoch: 006 | loss: 0.15620 - acc: 0.9766 | val_loss: 0.07888 - val_acc: 0.9790 -- iter: 55000/55000
--
Training Step: 1505 | total loss: 0.13995 | time: 914.958s
| Momentum | epoch: 007 | loss: 0.13995 - acc: 0.9828 | val_loss: 0.05722 - val_acc: 0.9852 -- iter: 55000/55000
--
Training Step: 1720 | total loss: 0.00957 | time: 770.717s
| Momentum | epoch: 008 | loss: 0.00957 - acc: 0.9984 | val_loss: 0.02719 - val_acc: 0.9921 -- iter: 55000/55000
--
Training Step: 1935 | total loss: 0.17275 | time: 781.146s
| Momentum | epoch: 009 | loss: 0.17275 - acc: 0.9761 | val_loss: 0.09713 - val_acc: 0.9732 -- iter: 55000/55000
--
Training Step: 2150 | total loss: 0.16494 | time: 764.513s
| Momentum | epoch: 010 | loss: 0.16494 - acc: 0.9791 | val_loss: 0.08176 - val_acc: 0.9792 -- iter: 55000/55000
--
Training Step: 2365 | total loss: 0.16337 | time: 768.006s
| Momentum | epoch: 011 | loss: 0.16337 - acc: 0.9813 | val_loss: 0.07136 - val_acc: 0.9808 -- iter: 55000/55000
--
Training Step: 2580 | total loss: 0.16413 | time: 769.976s
| Momentum | epoch: 012 | loss: 0.16413 - acc: 0.9846 | val_loss: 0.07357 - val_acc: 0.9809 -- iter: 55000/55000
--
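Once the run above finishes, the trained tflearn.DNN can be scored on the held-out set and its weights saved for reuse. A minimal sketch appended to the script above (the file name model_resnet_mnist.tfl is an arbitrary choice):

# Evaluate accuracy on the test set and persist the trained weights.
print(model.evaluate(testX, testY))
model.save('model_resnet_mnist.tfl')    # arbitrary file name
# A fresh process can later call model.load('model_resnet_mnist.tfl')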
2. Detecting malicious comments
from __future__ import division, print_function, absolute_import
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import tensorflow as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from tensorflow.contrib.learn.python import learn
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
MAX_DOCUMENT_LENGTH = 200
EMBEDDING_SIZE = 50
n_words=0
# Read a file and return its contents as a single string
def load_one_file(filename):
    x = ""
    with open(filename) as f:
        for line in f:
            x += line
    return x
# Walk a directory and load every regular file under it
def load_files(rootdir, label):
    x = []
    y = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            # print("Load file %s" % path)
            y.append(label)
            x.append(load_one_file(path))
    return x, y
# The two folders carry the labels: positive reviews are 0, negative reviews are 1
def load_data():
    x1, y1 = load_files("/Users/zhanglipeng/Data/movie-review-data/review_polarity/txt_sentoken/pos/", 0)
    x2, y2 = load_files("/Users/zhanglipeng/Data/movie-review-data/review_polarity/txt_sentoken/neg/", 1)
    x = x1 + x2
    y = y1 + y2
    return x, y
def do_cnn(trainX, trainY, testX, testY):
    # Data preprocessing: pad every sequence to MAX_DOCUMENT_LENGTH
    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    # Building the convolutional network: three parallel 1-D convolutions
    # with kernel sizes 3, 4 and 5, merged before the classifier
    network = input_data(shape=[None, MAX_DOCUMENT_LENGTH], name='input')
    network = tflearn.embedding(network, input_dim=n_words + 1, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.5)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training: fit() takes the training and validation sets directly
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY, n_epoch=20, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=32)
if __name__ == '__main__':
    # Load the movie-review polarity corpus and vectorize it with a vocabulary processor
    x, y = load_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    vp = learn.preprocessing.VocabularyProcessor(max_document_length=MAX_DOCUMENT_LENGTH,
                                                 min_frequency=1)
    vp.fit(x)
    x_train = np.array(list(vp.transform(x_train)))
    x_test = np.array(list(vp.transform(x_test)))
    n_words = len(vp.vocabulary_)
    print('Total words: %d' % n_words)
    do_cnn(x_train, y_train, x_test, y_test)
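To classify a review the model has never seen, the new text must pass through the same VocabularyProcessor before prediction. A minimal sketch continuing the __main__ block above, assuming do_cnn is modified to end with return model (the sample sentence is made up):

# Hypothetical continuation: assumes do_cnn() ends with `return model`.
model = do_cnn(x_train, y_train, x_test, y_test)
new_review = ["this movie was a complete waste of time"]
new_x = np.array(list(vp.transform(new_review)))
pred = model.predict(new_x)
print("negative" if np.argmax(pred[0]) == 1 else "positive")  # label 1 = negative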
3. Detecting spam email
from sklearn.feature_extraction.text import CountVectorizer
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfTransformer
import tensorflow as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from sklearn.neural_network import MLPClassifier
from tflearn.layers.normalization import local_response_normalization
from tensorflow.contrib import learn
max_features=500
max_document_length=1024
def load_one_file(filename):
    # Read a file into one string, stripping line breaks
    x = ""
    with open(filename) as f:
        for line in f:
            x += line.strip('\n').strip('\r')
    return x
def load_files_from_dir(rootdir):
    # Load every regular file under rootdir
    x = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_file(path))
    return x
def load_all_files():
    # Load the Enron ham/spam corpora (subsets enron1 through enron4)
    ham = []
    spam = []
    for i in range(1, 5):
        path = "/Users/zhanglipeng/Data/mail/enron%d/ham/" % i
        print("Load %s" % path)
        ham += load_files_from_dir(path)
        path = "/Users/zhanglipeng/Data/mail/enron%d/spam/" % i
        print("Load %s" % path)
        spam += load_files_from_dir(path)
    return ham, spam
def get_features_by_wordbag():
    # Bag-of-words features; ham is labeled 0, spam is labeled 1
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    print(vectorizer)
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    return x, y
def show_different_max_features():
    # Sweep max_features and plot Naive Bayes accuracy against it
    global max_features
    a = []
    b = []
    for i in range(1000, 20000, 2000):
        max_features = i
        print("max_features=%d" % i)
        x, y = get_features_by_wordbag()
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
        gnb = GaussianNB()
        gnb.fit(x_train, y_train)
        y_pred = gnb.predict(x_test)
        score = metrics.accuracy_score(y_test, y_pred)
        a.append(max_features)
        b.append(score)
    plt.plot(a, b, 'r', label='accuracy')
    plt.xlabel("max_features")
    plt.ylabel("metrics.accuracy_score")
    plt.title("metrics.accuracy_score VS max_features")
    plt.legend()
    plt.show()
def do_nb_wordbag(x_train, x_test, y_train, y_test):
    print("NB and wordbag")
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
def do_svm_wordbag(x_train, x_test, y_train, y_test):
    print("SVM and wordbag")
    clf = svm.SVC()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
def get_features_by_wordbag_tfidf():
    # Binary bag-of-words re-weighted with TF-IDF
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(binary=True,
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    print(vectorizer)
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    transformer = TfidfTransformer(smooth_idf=False)
    print(transformer)
    tfidf = transformer.fit_transform(x)
    x = tfidf.toarray()
    return x, y
def do_cnn_wordbag(trainX, testX, trainY, testY):
    print("CNN and tf")
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    # Building convolutional network
    network = input_data(shape=[None, max_document_length], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100, run_id="spam")
def do_rnn_wordbag(trainX, testX, trainY, testY):
    print("RNN and wordbag")
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')
    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10, run_id="spm-run", n_epoch=5)
def do_dnn_wordbag(x_train, x_test, y_train, y_test):
    print("DNN and wordbag")
    # Building a small multi-layer perceptron
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes=(5, 2),
                        random_state=1)
    print(clf)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
def get_features_by_tf():
    # Map each mail to a fixed-length sequence of word ids
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vp = tflearn.data_utils.VocabularyProcessor(max_document_length=max_document_length,
                                                min_frequency=0,
                                                vocabulary=None,
                                                tokenizer_fn=None)
    x = vp.fit_transform(x, unused_y=None)
    x = np.array(list(x))
    return x, y
if __name__ == "__main__":
print "Hello spam-mail"
#print "get_features_by_wordbag"
#x,y=get_features_by_wordbag()
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
#print "get_features_by_wordbag_tfidf"
#x,y=get_features_by_wordbag_tfidf()
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
#NB
#do_nb_wordbag(x_train, x_test, y_train, y_test)
#show_diffrent_max_features()
#SVM
#do_svm_wordbag(x_train, x_test, y_train, y_test)
#DNN
#do_dnn_wordbag(x_train, x_test, y_train, y_test)
print "get_features_by_tf"
x,y=get_features_by_tf()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
#CNN
#do_cnn_wordbag(x_train, x_test, y_train, y_test)
#RNN
do_rnn_wordbag(x_train, x_test, y_train, y_test)
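fit() only reports loss and accuracy; to get the same confusion matrix the bag-of-words models print, the trained network's predictions can be scored with sklearn. A minimal sketch, assuming do_rnn_wordbag is modified to end with return model, testX, testY (returning the padded, one-hot-encoded test arrays) and the final call in __main__ is replaced with:

# Hypothetical follow-up: assumes do_rnn_wordbag() ends with
# `return model, testX, testY`.
model, testX, testY = do_rnn_wordbag(x_train, x_test, y_train, y_test)
y_pred = model.predict(testX)
print(metrics.confusion_matrix(np.argmax(testY, axis=1),
                               np.argmax(np.array(y_pred), axis=1)))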