python深度学习--处理文本数据（one-hot; word Embedding)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylab
from pandas import DataFrame, Series
from keras import models, layers, optimizers, losses, metrics
from keras.utils.np_utils import to_categorical

#深度学习用于自然语言处理是将模式识别应用于单词、 句子和段落
#文本向量化（vectorize）是指将文本转换为数值张量的过程

#单词级的one-hot编码，之前路透社及imdb数据集已经涉及
#它将每个单词与一个唯一的整数索引相关联， 然后将这个整数索引 i 转换为长度为N的二进制向量（N是词表大小），这个向量只有第 i 个元 素是 1，其余元素都为 0。
#one-hot 编码得到的向量是二进制的、稀疏的（绝大部分元素都是0）、维度很高的（维度大小等于词表中的单词个数）
def one_hot_word():
    samples=['The cat sat on the mat.','The dog ate my homework.']
    token_index={}
    for sample in samples:
        for word in sample.split():#按空格划分单词（这里并没有去除标点符号）
            if word not in token_index:#为每个单词建立索引，从1开始（单词不重复）
                token_index[word]=len(token_index)+1
    max_length=10#samples中的不重复分词长度为10，且每个samples元素分词长度最大为6
    results=np.zeros(shape=(len(samples),max_length,max(token_index.values())+1))
    print(token_index)
    print(results.shape)
    print(results)
    for i,sample in enumerate(samples):
        for j,word in list(enumerate(sample.split()))[:max_length]:
            index=token_index.get(word)
            results[i,j,index]=1.#索引与数组下标相同置为1
    return results
# one_hot_word()

#字符级one-hot编码
import string
def one_hot_char():
    samples=['The cat sat on the mat.','The dog ate my homework.']
    characters=string.printable#返回字符串，所有可打印的 ASCII 字符
    print(characters)
    token_index=dict(zip(range(1,len(characters)+1),characters))#给字符添加索引
    print(token_index)
    max_length=50
    results=np.zeros((len(samples),max_length,max(token_index.keys())+1))
    for i,sample in enumerate(samples):
        for j,character in enumerate(sample):
            index=token_index.get(character)
            results[i,j,index]=1.
    return results
# one_hot_char()
#------------------------------------------------------------------------------
'''
Keras 的内置函数可以对原始文本数据进行单词级或字符级的 one-hot 编码
有很多重要的特性：
从字符串中去除特殊字符、只考虑数据集中前N个最常见的单词（这是一种常用的限制，以避免处理非常大的输入向量空间）

'''
#用 Keras 实现单词级的 one-hot 编码
from keras.preprocessing.text import Tokenizer
samples=['The cat sat on the mat.', 'The dog ate my homework.']
tokenizer=Tokenizer(num_words=1000)#创建一个分词器（tokenizer），设置 为只考虑前 1000 个最常见的单词
tokenizer.fit_on_texts(samples)#构建单词索引
sequences = tokenizer.texts_to_sequences(samples)#将字符串转换为整数索引组成的列表
print(sequences)
one_hot_results=tokenizer.texts_to_matrix(samples,mode='binary')
#可以直接得到 one-hot 二进制表示。 这个分词器也支持除 one-hot 编码外 的其他向量化模式
word_index=tokenizer.word_index#返回单词索引
print(word_index)
print('found %s unique tokens.'%len(word_index))

#使用散列技巧的单词级的 one-hot 编码[hash]
#如果词表中唯一标记的数量太大而无法直接处理，就可以使用这种技巧
#将单词散列编码为固定长度的向量，通常用一个非常简单的散列函数来实现,但可能会出现散列冲突[如果散列空间的维度远大于需要散列的唯一标记的个数，散列冲突的可能性会减小]

#使用散列技巧的单词级的 one-hot 编码
samples=['The cat sat on the mat.', 'The dog ate my homework.']
dimensionality=1000#将单词保存为长度为 1000 的向量。如果单词数量接近 1000 个（或更多），那么会遇到很多散列冲突，这会降低这种编码方法的准确性
max_length=10
results=np.zeros((len(samples), max_length, dimensionality))
print(results.shape)
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dimensionality #将单词散列为 0~1000 范围内的一个随机整数索引
        results[i, j, index] = 1.


# one-hot 编码或 one-hot 散列得到的词表示是稀疏的、高维的、硬编码的；
# 词嵌入是密集的、相对低维的，而且是从数据中学习得到的
#使用词嵌入（word embedding）：
'''
获取词嵌入有两种方法
    1.在完成主任务（比如文档分类或情感预测）的同时学习词嵌入。在这种情况下，
一开始是随机的词向量，然后对这些词向量进行学习，其学习方式与学习神经网络的
权重相同
    2.在不同于待解决问题的机器学习任务上预计算好词嵌入，然后将其加载到模型
中。这些词嵌入叫作预训练词嵌入（pretrained word embedding）
    
'''
#1.利用Embedding层学习嵌入
'''
    在一个合理的嵌入空间中，同义词应该被嵌入 到相似的词向量中，一般来说，任意两个词向量之间的几何距离（比如 L2 距离）应该和这两个词的语义距离有
关（表示不同事物的词被嵌入到相隔很远的点，而相关的词则更加靠近）。除了 距离，你可能还希望嵌入空间中的特定方向也是有意义的
    在真实的词嵌入空间中，常见的有意义的几何变换的例子包括“性别”向量和“复数”向量。例如，将 king（国王）向量加上 female（女性）向量，得到的是
queen（女王）向量。将 king（国王） 向量加上 plural（复数）向量，得到的是 kings 向量。词嵌入空间通常具有几千个这种可解释的、 并且可能很有用的向量。

    一个好的词嵌入空间在很大程度上取决于你的任务。英语电影评论情感分析 模型的完美词嵌入空间，可能不同于英语法律文档分类模型的完美词嵌入空间，因为某些语义关系的重要性因任务而异
    合理的做法是对每个新任务都学习一个新的嵌入空间。幸运的是，反向传播让这种学习变得很简单，而 Keras 使其变得更简单
'''
#将一个 Embedding 层实例化
from keras.layers import Embedding
embedding_layer=Embedding(1000,64)#Embedding 层至少需要两个参数： 标记的个数（即最大单词索引 +1）和嵌入的维度（这里是 64）
'''
接收整数作为输入，并在内部字典中查找这些整数，然后返回相关联的向量。
     单词索引---> Embedding ----->对应的词向量
输入: 一个二维整数张量，其形状为 (samples, sequence_length)，每个元素是一个整数序列[一批数据中的所有序列必须具有相同的长度（因为需要将它们打包成一个张量），所以较短的序列应该用 0 填充，较长的序列应该被截断。]

输出：返回一个形状为 (samples, sequence_length, embedding_ dimensionality) 的三维浮点数张量
'''
#加载IMDB数据，准备用于Embedding层
from keras.datasets import imdb
from keras import preprocessing
def general_embedding():
    max_features=10000#作为特征的单词个数（前10000个常见单词）
    maxlen=20#每个样本达到20个单词后截断文本
    (x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_features)
    x_train=preprocessing.sequence.pad_sequences(x_train,maxlen=maxlen)#将整数列表转换成形状为(samples, maxlen)的二维整数张量
    x_test=preprocessing.sequence.pad_sequences(x_test,maxlen=maxlen)

    #在IMDB数据上使用Embedding层和分类器
    from keras.layers import Flatten,Dense,Embedding
    model=models.Sequential()
    model.add(Embedding(10000,8,input_length=maxlen))#指定 Embedding 层的最大输入长度，以便后面将嵌入输入展平。Embedding层激活的形状为(samples, maxlen, 8)
    model.add(Flatten())#将三维的嵌入张量展平成形状为 (samples, maxlen * 8) 的二维张量
    model.add(Dense(1,activation='sigmoid'))#分类器
    model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
    print(model.summary())
    history=model.fit(x_train,y_train,epochs=10,batch_size=32,validation_split=0.2)
# general_embedding()
#val_acc: 0.7466
'''
注意，仅仅将嵌入序列展开并在上面训练一个 Dense 层，会导致模型对输入序列中的每个单词单独处理，而没有考虑单词之间的关系和句子结构（举个例子，这个
模型可能会将 this movie is a bomb和 this movie is the bomb两条都归为负面评论）。更好的做法是在嵌入序列上添加循环层或一维卷积层，将每个序列作为整体来学习特征
'''
########-------------------------------------------------------------------
#2.使用预训练的词嵌入
#原理类似于之前的预训练的cnn（如VGG16等）：没有足够的数据来自己学习真正强大的特征，但你需要的特征应该是非常通用的， 比如常见的视觉特征或语义特征
#1）无监督的方法计算一个密集的低维词嵌入空间：word2vec 算法 由 Google 的 Tomas Mikolov 于 2013 年开发，其维度抓住了特定的语义属性，比如性别；
# 2）GloVe（global vectors for word representation，词表示全局向量），由斯坦福大学的研究人员于 2014 年开发。这种嵌入方法基于对词共现统计矩阵进行因式分解

#####-------------------整合在一起：从原始文本到词嵌入
#1.下载IMDB数据的原始文本：http://mng.bz/0tIo
#[因为该文件不带后缀，可能导致浏览器无法访问，可尝试使用xunlei 输入网址下载，或者使用linux 命令行：wget http://mng.bz/0tIo --no-check-certificate ]
#2.处理IMDB原始数据的标签
import os
imdb_dir='F:/aclImdb'#解压后文件路径
train_dir=os.path.join(imdb_dir,'train')
def word_splitting(directory):

    labels=[]#存放标签
    texts=[]#存放评论
    for label_type in ['neg','pos']:
        dir_name=os.path.join(directory,label_type)
        for fname in os.listdir(dir_name):#neg/pos目录下所有.txt文件
            if fname[-4:]=='.txt':
                #出现UnicodeDecodeError: 'gbk' codec can't decode byte 0x89 in position 14: illegal multibyte sequence错误：添加encoding='utf-8'
                f=open(os.path.join(dir_name,fname),encoding='utf-8')
                texts.append(f.read())
                f.close()
                if label_type=='neg':#标记
                    labels.append(0)
                else:
                    labels.append(1)
    return texts,labels
#2.对数据进行分词
#针对于具体任务的嵌入可能效果更好
# from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
maxlen=100#100个单词后截断评论
training_samples=200#因为预训练的词嵌入对训练数据很少的问题特别有用(将训练数据限定为前 200)
validation_samples=10000#验证集
max_words=10000#只考虑数据集中前 10 000 个最常见的单词
texts,labels=word_splitting(train_dir)
tokenizer=Tokenizer(num_words=max_words)#分词
tokenizer.fit_on_texts(texts)
sequences=tokenizer.texts_to_sequences(texts)#转换成索引序列
print(sequences)#
word_index=tokenizer.word_index#获取词索引
print('Found %s unique tokens.' % len(word_index))
data=pad_sequences(sequences,maxlen=maxlen)#将整数列表转换成形状为(samples, maxlen)的二维整数张量
labels=np.asarray(labels)#标签列表转换成一维张量
print(data.shape)
print(labels.shape)
indices=np.arange(data.shape[0])#texts（评论）数
np.random.shuffle(indices)#置乱（不然所有负面评论都在前面， 然后是所有正面评论）
data=data[indices]
labels=labels[indices]
x_train=data[:training_samples]
y_train=labels[:training_samples]
x_val=data[training_samples:training_samples+validation_samples]#验证集10000
y_val=labels[training_samples:training_samples+validation_samples]
print(x_val.shape)

#3.下载GloVe词嵌入：https://nlp.stanford.edu/projects/glove  找到glove.6B.zip，解压[400 000 个单词（或非单词的标记） 的 100 维嵌入向量]

#4.对嵌入进行预处理：构建一个将单词（字符串）映射为其向量表示（数值向量）的索引
glove_dir='F:/glove.6B'
embeddings_index={}
f=open(os.path.join(glove_dir,'glove.6B.100d.txt'),encoding='utf-8')
i=0
for line in f:
    values=line.split()#形式为['word','float','float',...,'float']#(即该单词与其它单词的相关因子)
    word=values[0]
    coefs=np.asarray(values[1:],dtype='float32')
    embeddings_index[word]=coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

#准备GloVe词嵌入矩阵

#它必须是一个形状为 (max_words, embedding_dim) 的矩阵，对于单词索引（在分词时构建）中索引为 i 的单词， 这个矩阵的元素 i 就是这个单词对应的 embedding_dim 维向量

embedding_dim=100
embedding_matrix=np.zeros((max_words,embedding_dim))
for word,i in word_index.items():
    if i <max_words:
        embedding_vector=embeddings_index.get(word)#获取嵌入向量
        if embedding_vector is not None:#嵌入索引（embeddings_index） 中找不到的词，其嵌入向量全为 0
            embedding_matrix[i]=embedding_vector

#5.定义模型
from keras.layers import Flatten,Dense,Embedding
model=models.Sequential()
model.add(Embedding(max_words,embedding_dim,input_length=maxlen))
model.add(Flatten())
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
print(model.summary())

#在模型中加载GloVe嵌入
model.layers[0].set_weights([embedding_matrix])#Embedding 层只有一个权重矩阵，是一个二维的浮点数矩阵，其中每个元素 i 是与索引 i 相关联的词向量
model.layers[0].trainable=False
#要冻结Embedding层（即将其trainable属性设为False），如果一个模型的一部分是经过预训练的（如Embedding层），而另一部分是随机初始化的（如分类器），
# 那么在训练期间不应该更新预训练的部分，以 避免丢失它们所保存的信息。随机初始化的层会引起较大的梯度更新，会破坏已经学到的特征

#7.训练与评估
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history=model.fit(x_train,y_train,epochs=10,batch_size=32,validation_data=(x_val,y_val))
model.save_weights('pre_trained_glove_model.h5')

#8.绘制结果
def acc_loss_plot(history):
    fig=plt.figure()
    ax1=fig.add_subplot(2,1,1)
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(acc) + 1)
    ax1.plot(epochs, acc, 'bo', label='Training acc')
    ax1.plot(epochs, val_acc, 'b', label='Validation acc')
    ax1.set_title('Training and validation accuracy')
    ax2=fig.add_subplot(2,1,2)
    ax2.plot(epochs, loss, 'bo', label='Training loss')
    ax2.plot(epochs, val_loss, 'b', label='Validation loss')
    ax2.set_title('Training and validation loss')
    plt.legend()
    plt.tight_layout()
    plt.show()
acc_loss_plot(history)#训练样本很少，模型很快过拟合，验证精度接近60%

#在不加载预训练词嵌入、也不冻结嵌入层的情况下训练相同的模型，精度达到50%左右------注释掉239，240行代码

#对测试集进行分词
test_dir=os.path.join(imdb_dir,'test')
texts,labels=word_splitting(test_dir)
sequences = tokenizer.texts_to_sequences(texts)#测试集不训练
x_test=pad_sequences(sequences,maxlen)
y_test=np.asarray(labels)

#在测试集上评估模型
model.load_weights('pre_trained_glove_model.h5')
results=model.evaluate(x_test,y_test)
print(results)#0.55596
python深度学习--处理文本数据（one-hot; word Embedding)

（少量训练数据[200]）冻结预训练Embedding的精度和损失

猜你喜欢