TensorFlow Learning (2): Text Classification

How does TensorFlow work? What is a machine learning model, and what is a neural network? How does a neural network learn? How do we prepare data and feed it to the network's input, and how do we run the model to obtain predictions? The example below works through these questions by building a small text classifier.

Text classification with a neural network and TensorFlow

# -*- coding:utf-8 -*-
# Text classification with a neural network and TensorFlow

import numpy as np
import tensorflow as tf
from collections import Counter
from sklearn.datasets import fetch_20newsgroups

categories = ["comp.graphics", "sci.space", "rec.sport.baseball"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

print('total texts in train:', len(newsgroups_train.data))
print('total texts in test:', len(newsgroups_test.data))

vocab = Counter()

for text in newsgroups_train.data:
    for word in text.split(' '):
        vocab[word.lower()] += 1
for text in newsgroups_test.data:
    for word in text.split(' '):
        vocab[word.lower()] += 1
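# vocab is a Counter mapping each lower-cased token to its total frequency
# across both the training and test texts (tokens are split on single spaces)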
print("Total words:", len(vocab))

total_words = len(vocab)


def get_word_2_index(vocab):
    word2index = {}
    for i, word in enumerate(vocab):
        word2index[word.lower()] = i
    return word2index


word2index = get_word_2_index(vocab)
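# word2index maps every vocabulary word to a fixed column of the bag-of-words
# vector, e.g. word2index["space"] gives the index whose entry counts how often
# "space" occurs in a text (illustrative; the actual index depends on vocab order)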


def get_batch(df, i, batch_size):
    batches = []
    results = []
    texts = df.data[i*batch_size:i*batch_size+batch_size]
    categories = df.target[i*batch_size:i*batch_size+batch_size]
    for text in texts:
        layer = np.zeros(total_words, dtype=float)
        for word in text.split(' '):
            layer[word2index[word.lower()]] += 1
        batches.append(layer)
    for category in categories:
        y = np.zeros((3), dtype=float)
        if category == 0:
            y[0] = 1
        elif category == 1:
            y[1] = 1
        else:
            y[2] = 1
        results.append(y)
    return np.array(batches), np.array(results)
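
# Illustrative example (hypothetical 5-word vocabulary where "space" has index 0
# and "shuttle" has index 1): the text "space space shuttle" becomes the count
# vector [2., 1., 0., 0., 0.], and a target of 1 becomes the one-hot label [0., 1., 0.]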


# parameters
learning_rate = 0.01
training_epochs = 10
batch_size = 150
display_step = 1

# Network Parameters
n_hidden_1 = 100      # 1st layer number of features
n_hidden_2 = 100       # 2nd layer number of features
n_input = total_words  # Words in vocab
n_classes = 3         # Categories: graphics, sci.space and baseball

input_tensor = tf.placeholder(tf.float32, [None, n_input], name='input')
output_tensor = tf.placeholder(tf.float32, [None, n_classes], name='output')


def multilayer_perceptron(input_tensor, weights, biases):
    layer_1_multiplication = tf.matmul(input_tensor, weights['h1'])
    layer_1_addition = tf.add(layer_1_multiplication, biases['b1'])
    layer_1 = tf.nn.relu(layer_1_addition)

    # Hidden layer with RELU activation
    layer_2_multiplication = tf.matmul(layer_1, weights['h2'])
    layer_2_addition = tf.add(layer_2_multiplication, biases['b2'])
    layer_2 = tf.nn.relu(layer_2_addition)

    # output layer
    out_layer_multiplication = tf.matmul(layer_2, weights['out'])
    out_layer_addition = tf.add(out_layer_multiplication, biases['out'])
    return out_layer_addition
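
# Note: the output layer returns raw scores (logits); softmax is applied later
# inside the loss function, so no activation is needed here.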


# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}
# construct model
prediction = multilayer_perceptron(input_tensor, weights, biases)

# Define loss & optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=output_tensor))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
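# softmax_cross_entropy_with_logits applies softmax to the raw logits and then
# measures cross-entropy against the one-hot labels in a single numerically
# stable op; AdamOptimizer updates all weights and biases to minimize its mean.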

init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)  # initialize all weights and biases

    # training cycle
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(len(newsgroups_train.data)/batch_size)
        # loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = get_batch(newsgroups_train, i, batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            c, _ = sess.run([loss, optimizer], feed_dict={input_tensor: batch_x, output_tensor: batch_y})
            # accumulate the running average loss for this epoch
            avg_cost += c/total_batch
            # Display the running average loss after each batch
            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch + 1), "loss=", "{:.9f}".format(avg_cost))
    print("Optimization Finished!")
    # Test model
    correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(output_tensor, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    total_test_data = len(newsgroups_test.target)
    batch_x_test, batch_y_test = get_batch(newsgroups_test, 0, total_test_data)
    print("Accuracy:", accuracy.eval({input_tensor: batch_x_test, output_tensor: batch_y_test}))
Run output:
total texts in train: 1774
total texts in test: 1180
Total words: 119930
Epoch: 0001 loss= 164.599542791
Epoch: 0001 loss= 229.308005593
Epoch: 0001 loss= 312.437832919
Epoch: 0001 loss= 419.234974254
Epoch: 0001 loss= 491.679138184
Epoch: 0001 loss= 628.423256614
Epoch: 0001 loss= 716.383273038
Epoch: 0001 loss= 767.755315607
Epoch: 0001 loss= 804.416395708
Epoch: 0001 loss= 839.817649148
Epoch: 0001 loss= 873.678663774
Epoch: 0002 loss= 12.155790849
Epoch: 0002 loss= 37.855947321
Epoch: 0002 loss= 61.012716120
Epoch: 0002 loss= 89.293117176
Epoch: 0002 loss= 95.758963845
Epoch: 0002 loss= 106.525867115
Epoch: 0002 loss= 140.357190912
Epoch: 0002 loss= 156.909991871
Epoch: 0002 loss= 188.811081626
Epoch: 0002 loss= 201.752990029
Epoch: 0002 loss= 223.565703652
Epoch: 0003 loss= 22.479526867
Epoch: 0003 loss= 36.388277921
Epoch: 0003 loss= 40.809483615
Epoch: 0003 loss= 45.898767298
Epoch: 0003 loss= 54.971788927
Epoch: 0003 loss= 92.455045873
Epoch: 0003 loss= 105.125879114
Epoch: 0003 loss= 107.778049989
Epoch: 0003 loss= 109.214097977
Epoch: 0003 loss= 109.673732237
Epoch: 0003 loss= 114.287493446
Epoch: 0004 loss= 20.967838634
Epoch: 0004 loss= 36.625368985
Epoch: 0004 loss= 56.412753018
Epoch: 0004 loss= 61.769968900
Epoch: 0004 loss= 69.876602173
Epoch: 0004 loss= 71.778213328
Epoch: 0004 loss= 74.627652255
Epoch: 0004 loss= 74.787704804
Epoch: 0004 loss= 75.873412468
Epoch: 0004 loss= 80.153322816
Epoch: 0004 loss= 95.818399332
Epoch: 0005 loss= 1.251875010
Epoch: 0005 loss= 4.656712185
Epoch: 0005 loss= 5.162945314
Epoch: 0005 loss= 5.383128795
Epoch: 0005 loss= 6.082911080
Epoch: 0005 loss= 7.967948935
Epoch: 0005 loss= 7.992210087
Epoch: 0005 loss= 7.992210087
Epoch: 0005 loss= 8.931116670
Epoch: 0005 loss= 9.138003655
Epoch: 0005 loss= 9.726748903
Epoch: 0006 loss= 10.962064570
Epoch: 0006 loss= 15.842136730
Epoch: 0006 loss= 19.281685569
Epoch: 0006 loss= 20.565143499
Epoch: 0006 loss= 21.748276104
Epoch: 0006 loss= 21.754320296
Epoch: 0006 loss= 21.754320296
Epoch: 0006 loss= 21.754320296
Epoch: 0006 loss= 21.808484375
Epoch: 0006 loss= 21.808484375
Epoch: 0006 loss= 29.417288557
Epoch: 0007 loss= 1.706949928
Epoch: 0007 loss= 2.966487364
Epoch: 0007 loss= 2.966487364
Epoch: 0007 loss= 2.966487364
Epoch: 0007 loss= 2.966487364
Epoch: 0007 loss= 2.966487364
Epoch: 0007 loss= 3.028692327
Epoch: 0007 loss= 3.216852399
Epoch: 0007 loss= 3.259427713
Epoch: 0007 loss= 3.259427713
Epoch: 0007 loss= 3.259427713
Epoch: 0008 loss= 0.000000000
Epoch: 0008 loss= 0.000000000
Epoch: 0008 loss= 0.021928859
Epoch: 0008 loss= 0.223925532
Epoch: 0008 loss= 0.517013535
Epoch: 0008 loss= 0.517013535
Epoch: 0008 loss= 0.517013535
Epoch: 0008 loss= 0.677528009
Epoch: 0008 loss= 0.677528009
Epoch: 0008 loss= 0.677528009
Epoch: 0008 loss= 0.677528009
Epoch: 0009 loss= 2.619076642
Epoch: 0009 loss= 2.619076642
Epoch: 0009 loss= 2.619076642
Epoch: 0009 loss= 2.674342090
Epoch: 0009 loss= 2.674342090
Epoch: 0009 loss= 2.674342090
Epoch: 0009 loss= 2.674342090
Epoch: 0009 loss= 2.674342090
Epoch: 0009 loss= 2.674342090
Epoch: 0009 loss= 2.674342090
Epoch: 0009 loss= 2.674342090
Epoch: 0010 loss= 1.000655781
Epoch: 0010 loss= 1.000655781
Epoch: 0010 loss= 1.000655781
Epoch: 0010 loss= 1.000655781
Epoch: 0010 loss= 1.000655781
Epoch: 0010 loss= 1.000655781
Epoch: 0010 loss= 1.000655781
Epoch: 0010 loss= 1.000655781
Epoch: 0010 loss= 1.000655781
Epoch: 0010 loss= 1.000675207
Epoch: 0010 loss= 1.000675207
Optimization Finished!
Accuracy: 0.723729
Process finished with exit code 0


Reprinted from blog.csdn.net/sinat_36972314/article/details/80269971