CNNs are widely used in image processing. By analogy with biological vision, recognizing an image usually requires attending only to local regions of the data rather than every pixel at once.
A CNN also relies on weight sharing: a small patch is sampled from a large image, a feature detector is learned from that patch, and the same detector is then applied at every position of the image. This sliding application of shared weights is the convolution operation.
Pooling summarizes a region of the image by a statistic of a particular feature, typically its mean or maximum; these summary features reduce dimensionality and tend to improve results.
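To make these two ideas concrete, here is a toy NumPy sketch (not part of the TFLearn examples below; the 3x3 vertical-edge kernel is a hypothetical choice for illustration). One shared kernel is slid over every position of an image, and the resulting feature map is then summarized by 2x2 max pooling:

import numpy as np

def conv2d(image, kernel):
    # Weight sharing: the same kernel is applied at every location.
    kh, kw = kernel.shape
    oh, ow = image.shape[0] - kh + 1, image.shape[1] - kw + 1
    out = np.zeros((oh, ow))
    for i in range(oh):
        for j in range(ow):
            out[i, j] = np.sum(image[i:i + kh, j:j + kw] * kernel)
    return out

def max_pool2d(fmap, size=2):
    # Pooling: summarize each size x size region by its maximum.
    oh, ow = fmap.shape[0] // size, fmap.shape[1] // size
    out = np.zeros((oh, ow))
    for i in range(oh):
        for j in range(ow):
            out[i, j] = fmap[i * size:(i + 1) * size, j * size:(j + 1) * size].max()
    return out

image = np.random.rand(28, 28)                  # toy 28x28 "image"
kernel = np.array([[1., 0., -1.]] * 3)          # hypothetical vertical-edge detector
print(max_pool2d(conv2d(image, kernel)).shape)  # (13, 13)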
1. Practicing with the MNIST dataset
from __future__ import division, print_function, absolute_import
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import tflearn
import tflearn.data_utils as du
# Data loading and preprocessing
import tflearn.datasets.mnist as mnist
# Load the dataset
X, Y, testX, testY = mnist.load_data(one_hot=True)
# mnist.load_data returns each image as a flattened vector; reshape it back to 28x28x1 for the convolutional layers
X = X.reshape([-1, 28, 28, 1])
testX = testX.reshape([-1, 28, 28, 1])
X, mean = du.featurewise_zero_center(X)
testX = du.featurewise_zero_center(testX, mean)
# Building the residual network (a CNN with shortcut connections)
net = tflearn.input_data(shape=[None, 28, 28, 1])
# 2-D convolution layer
net = tflearn.conv_2d(net, 64, 3, activation='relu', bias=False)
# Residual bottleneck blocks
net = tflearn.residual_bottleneck(net, 3, 16, 64)
net = tflearn.residual_bottleneck(net, 1, 32, 128, downsample=True)
net = tflearn.residual_bottleneck(net, 2, 32, 128)
net = tflearn.residual_bottleneck(net, 1, 64, 256, downsample=True)
net = tflearn.residual_bottleneck(net, 2, 64, 256)
net = tflearn.batch_normalization(net)
net = tflearn.activation(net, 'relu')
net = tflearn.global_avg_pool(net)
# Regression
net = tflearn.fully_connected(net, 10, activation='softmax')
net = tflearn.regression(net, optimizer='momentum',
loss='categorical_crossentropy',
learning_rate=0.1)
# Train, validating on the held-out test set
model = tflearn.DNN(net, checkpoint_path='model_resnet_mnist',
max_checkpoints=10, tensorboard_verbose=0)
model.fit(X, Y, n_epoch=100, validation_set=(testX, testY),
show_metric=True, batch_size=256, run_id='resnet_mnist')
---------------------------------
Run id: resnet_mnist
Log directory: /tmp/tflearn_logs/
---------------------------------
Training samples: 55000
Validation samples: 10000
--
Training Step: 215 | total loss: 0.07232 | time: 886.217s
| Momentum | epoch: 001 | loss: 0.07232 - acc: 0.9777 | val_loss: 0.09552 - val_acc: 0.9711 -- iter: 55000/55000
--
Training Step: 430 | total loss: 0.13791 | time: 930.724s
| Momentum | epoch: 002 | loss: 0.13791 - acc: 0.9728 | val_loss: 0.11432 - val_acc: 0.9663 -- iter: 55000/55000
--
Training Step: 645 | total loss: 0.13539 | time: 909.413s
| Momentum | epoch: 003 | loss: 0.13539 - acc: 0.9781 | val_loss: 0.07588 - val_acc: 0.9794 -- iter: 55000/55000
--
Training Step: 860 | total loss: 0.02486 | time: 880.677s
| Momentum | epoch: 004 | loss: 0.02486 - acc: 0.9927 | val_loss: 0.03608 - val_acc: 0.9901 -- iter: 55000/55000
--
Training Step: 1075 | total loss: 0.02277 | time: 872.714s
| Momentum | epoch: 005 | loss: 0.02277 - acc: 0.9936 | val_loss: 0.04405 - val_acc: 0.9863 -- iter: 55000/55000
--
Training Step: 1290 | total loss: 0.15620 | time: 878.157s
| Momentum | epoch: 006 | loss: 0.15620 - acc: 0.9766 | val_loss: 0.07888 - val_acc: 0.9790 -- iter: 55000/55000
--
Training Step: 1505 | total loss: 0.13995 | time: 914.958s
| Momentum | epoch: 007 | loss: 0.13995 - acc: 0.9828 | val_loss: 0.05722 - val_acc: 0.9852 -- iter: 55000/55000
--
Training Step: 1720 | total loss: 0.00957 | time: 770.717s
| Momentum | epoch: 008 | loss: 0.00957 - acc: 0.9984 | val_loss: 0.02719 - val_acc: 0.9921 -- iter: 55000/55000
--
Training Step: 1935 | total loss: 0.17275 | time: 781.146s
| Momentum | epoch: 009 | loss: 0.17275 - acc: 0.9761 | val_loss: 0.09713 - val_acc: 0.9732 -- iter: 55000/55000
--
Training Step: 2150 | total loss: 0.16494 | time: 764.513s
| Momentum | epoch: 010 | loss: 0.16494 - acc: 0.9791 | val_loss: 0.08176 - val_acc: 0.9792 -- iter: 55000/55000
--
Training Step: 2365 | total loss: 0.16337 | time: 768.006s
| Momentum | epoch: 011 | loss: 0.16337 - acc: 0.9813 | val_loss: 0.07136 - val_acc: 0.9808 -- iter: 55000/55000
--
Training Step: 2580 | total loss: 0.16413 | time: 769.976s
| Momentum | epoch: 012 | loss: 0.16413 - acc: 0.9846 | val_loss: 0.07357 - val_acc: 0.9809 -- iter: 55000/55000
--
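Once the run above finishes, the trained tflearn.DNN can be scored on the held-out set and its weights saved for reuse. A minimal sketch appended to the script above (the file name model_resnet_mnist.tfl is an arbitrary choice):

# Evaluate accuracy on the test set and persist the trained weights.
print(model.evaluate(testX, testY))
model.save('model_resnet_mnist.tfl')    # arbitrary file name
# A fresh process can later call model.load('model_resnet_mnist.tfl')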
2. Detecting malicious comments
from __future__ import division, print_function, absolute_import
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import tensorflow as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from tensorflow.contrib.learn.python import learn
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
MAX_DOCUMENT_LENGTH = 200
EMBEDDING_SIZE = 50
n_words=0
# Read a file and return its contents as a single string
def load_one_file(filename):
    x = ""
    with open(filename) as f:
        for line in f:
            x += line
    return x
# Walk a directory and load every regular file under it
def load_files(rootdir, label):
    x = []
    y = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            # print("Load file %s" % path)
            y.append(label)
            x.append(load_one_file(path))
    return x, y
# The two folders carry the labels: positive reviews are 0, negative reviews are 1
def load_data():
    x1, y1 = load_files("/Users/zhanglipeng/Data/movie-review-data/review_polarity/txt_sentoken/pos/", 0)
    x2, y2 = load_files("/Users/zhanglipeng/Data/movie-review-data/review_polarity/txt_sentoken/neg/", 1)
    x = x1 + x2
    y = y1 + y2
    return x, y
def do_cnn(trainX, trainY, testX, testY):
    # Data preprocessing: pad every sequence to MAX_DOCUMENT_LENGTH
    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    # Building the convolutional network: three parallel 1-D convolutions
    # with kernel sizes 3, 4 and 5, merged before the classifier
    network = input_data(shape=[None, MAX_DOCUMENT_LENGTH], name='input')
    network = tflearn.embedding(network, input_dim=n_words + 1, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.5)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training: fit() takes the training and validation sets directly
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY, n_epoch=20, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=32)
if __name__ == '__main__':
    # Load the movie-review polarity corpus and vectorize it with a vocabulary processor
    x, y = load_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    vp = learn.preprocessing.VocabularyProcessor(max_document_length=MAX_DOCUMENT_LENGTH,
                                                 min_frequency=1)
    vp.fit(x)
    x_train = np.array(list(vp.transform(x_train)))
    x_test = np.array(list(vp.transform(x_test)))
    n_words = len(vp.vocabulary_)
    print('Total words: %d' % n_words)
    do_cnn(x_train, y_train, x_test, y_test)
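To classify a review the model has never seen, the new text must pass through the same VocabularyProcessor before prediction. A minimal sketch continuing the __main__ block above, assuming do_cnn is modified to end with return model (the sample sentence is made up):

# Hypothetical continuation: assumes do_cnn() ends with `return model`.
model = do_cnn(x_train, y_train, x_test, y_test)
new_review = ["this movie was a complete waste of time"]
new_x = np.array(list(vp.transform(new_review)))
pred = model.predict(new_x)
print("negative" if np.argmax(pred[0]) == 1 else "positive")  # label 1 = negative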
3. Detecting spam email
from sklearn.feature_extraction.text import CountVectorizer
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfTransformer
import tensorflow as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from sklearn.neural_network import MLPClassifier
from tflearn.layers.normalization import local_response_normalization
from tensorflow.contrib import learn
max_features=500
max_document_length=1024
def load_one_file(filename):
    # Read a file into one string, stripping line breaks
    x = ""
    with open(filename) as f:
        for line in f:
            x += line.strip('\n').strip('\r')
    return x
def load_files_from_dir(rootdir):
    # Load every regular file under rootdir
    x = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_file(path))
    return x
def load_all_files():
    # Load the Enron ham/spam corpora (subsets enron1 through enron4)
    ham = []
    spam = []
    for i in range(1, 5):
        path = "/Users/zhanglipeng/Data/mail/enron%d/ham/" % i
        print("Load %s" % path)
        ham += load_files_from_dir(path)
        path = "/Users/zhanglipeng/Data/mail/enron%d/spam/" % i
        print("Load %s" % path)
        spam += load_files_from_dir(path)
    return ham, spam
def get_features_by_wordbag():
    # Bag-of-words features; ham is labeled 0, spam is labeled 1
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    print(vectorizer)
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    return x, y
def show_different_max_features():
    # Sweep max_features and plot Naive Bayes accuracy against it
    global max_features
    a = []
    b = []
    for i in range(1000, 20000, 2000):
        max_features = i
        print("max_features=%d" % i)
        x, y = get_features_by_wordbag()
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
        gnb = GaussianNB()
        gnb.fit(x_train, y_train)
        y_pred = gnb.predict(x_test)
        score = metrics.accuracy_score(y_test, y_pred)
        a.append(max_features)
        b.append(score)
    plt.plot(a, b, 'r', label='accuracy')
    plt.xlabel("max_features")
    plt.ylabel("metrics.accuracy_score")
    plt.title("metrics.accuracy_score VS max_features")
    plt.legend()
    plt.show()
def do_nb_wordbag(x_train, x_test, y_train, y_test):
    print("NB and wordbag")
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
def do_svm_wordbag(x_train, x_test, y_train, y_test):
    print("SVM and wordbag")
    clf = svm.SVC()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
def get_features_by_wordbag_tfidf():
    # Binary bag-of-words re-weighted with TF-IDF
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(binary=True,
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    print(vectorizer)
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    transformer = TfidfTransformer(smooth_idf=False)
    print(transformer)
    tfidf = transformer.fit_transform(x)
    x = tfidf.toarray()
    return x, y
def do_cnn_wordbag(trainX, testX, trainY, testY):
    print("CNN and tf")
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    # Building convolutional network
    network = input_data(shape=[None, max_document_length], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100, run_id="spam")
def do_rnn_wordbag(trainX, testX, trainY, testY):
    print("RNN and wordbag")
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')
    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10, run_id="spm-run", n_epoch=5)
def do_dnn_wordbag(x_train, x_test, y_train, y_test):
    print("DNN and wordbag")
    # Building a small multi-layer perceptron
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes=(5, 2),
                        random_state=1)
    print(clf)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
def get_features_by_tf():
    # Map each mail to a fixed-length sequence of word ids
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vp = tflearn.data_utils.VocabularyProcessor(max_document_length=max_document_length,
                                                min_frequency=0,
                                                vocabulary=None,
                                                tokenizer_fn=None)
    x = vp.fit_transform(x, unused_y=None)
    x = np.array(list(x))
    return x, y
if __name__ == "__main__":
print "Hello spam-mail"
#print "get_features_by_wordbag"
#x,y=get_features_by_wordbag()
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
#print "get_features_by_wordbag_tfidf"
#x,y=get_features_by_wordbag_tfidf()
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
#NB
#do_nb_wordbag(x_train, x_test, y_train, y_test)
#show_diffrent_max_features()
#SVM
#do_svm_wordbag(x_train, x_test, y_train, y_test)
#DNN
#do_dnn_wordbag(x_train, x_test, y_train, y_test)
print "get_features_by_tf"
x,y=get_features_by_tf()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
#CNN
#do_cnn_wordbag(x_train, x_test, y_train, y_test)
#RNN
do_rnn_wordbag(x_train, x_test, y_train, y_test)
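fit() only reports loss and accuracy; to get the same confusion matrix the bag-of-words models print, the trained network's predictions can be scored with sklearn. A minimal sketch, assuming do_rnn_wordbag is modified to end with return model, testX, testY (returning the padded, one-hot-encoded test arrays) and the final call in __main__ is replaced with:

# Hypothetical follow-up: assumes do_rnn_wordbag() ends with
# `return model, testX, testY`.
model, testX, testY = do_rnn_wordbag(x_train, x_test, y_train, y_test)
y_pred = model.predict(testX)
print(metrics.confusion_matrix(np.argmax(testY, axis=1),
                               np.argmax(np.array(y_pred), axis=1)))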