[Hands-on deep learning] Text preprocessing

Text preprocessing

Reading the dataset

Text preprocessing typically involves the following steps:

  • Load the text into memory as a string.

  • Split the string into tokens (such as words or characters).

  • Build a vocabulary that maps tokens to numerical indices.

  • Convert the text into sequences of numerical indices that models can easily manipulate.

Text download link: https://www.gutenberg.org/ebooks/35

We read each line of the text into a list of strings:


import collections
import re
from d2l import torch as d2l
#@save
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine():
    """Load the time machine dataset into a list of text lines."""
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    # Replace each run of non-alphabetic characters with a space,
    # then strip leading/trailing whitespace and lowercase each line
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

lines = read_time_machine()
print(f'# total number of text lines: {len(lines)}')

print(lines[0])
print(lines[10])
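The regular expression in read_time_machine does all of the cleaning: each run of non-letter characters is collapsed into a single space before lowercasing. A quick illustration on a made-up line (the sample string is just for demonstration, not from the dataset):

print(re.sub('[^A-Za-z]+', ' ', 'The Time Machine, by H. G. Wells [1898]').strip().lower())
# the time machine by h g wells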

Tokenization

  The following tokenize function takes the list of text lines as input and splits each line into a list of tokens. A token is the basic unit of text. The function returns a list of token lists, where each token is a string.

def tokenize(lines, token='word'):
    """Split text lines into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)


tokens = tokenize(lines)
for i in range(11):
    print(tokens[i])


# total number of text lines: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the
['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
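The same function supports character-level tokens via token='char', which the corpus loader at the end of this post relies on. A quick check (not part of the original post):

char_tokens = tokenize(lines, token='char')
print(char_tokens[0][:12])
# ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm', 'a', 'c']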

Vocabulary

  Tokens are strings, but models require numerical inputs, so we map string tokens to numerical indices starting from 0. To do this, we first merge all documents and count the unique tokens; the resulting statistics are called the corpus. Each unique token is then assigned a numerical index according to its frequency of occurrence.

class Vocab:  #@save
    """Text vocabulary."""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort tokens by frequency of occurrence
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        # The index of the unknown token is 0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # The index of the unknown token is 0
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs

def count_corpus(tokens):  #@save
    """Count token frequencies."""
    # Here tokens is a 1D list or a 2D list
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a single list
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[:10])
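The token_freqs property defined above exposes the frequency-sorted counts, which is handy for inspecting the corpus. A small check (not in the original post; the most frequent token should be 'the'):

print(vocab.token_freqs[:3])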

  • Convert each token in each line of text into an index:
for i in [0, 10]:
    print('text:', tokens[i])
    print('indices:', vocab[tokens[i]])
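The Vocab constructor also takes min_freq and reserved_tokens, which the example above leaves at their defaults. A minimal sketch of their effect (the threshold 10 and the reserved token names are arbitrary illustrative choices):

filtered_vocab = Vocab(tokens, min_freq=10,
                       reserved_tokens=['<pad>', '<bos>', '<eos>'])
print(len(filtered_vocab) < len(vocab))  # True: tokens seen fewer than 10 times are dropped
print(filtered_vocab['<pad>'])           # 1: reserved tokens come right after '<unk>'
print(filtered_vocab['recondite'])       # 0: a rare word falls back to '<unk>'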

Putting everything together

  • The load_corpus_time_machine function returns corpus, a list of token indices, and vocab, the vocabulary of the time machine dataset.

def load_corpus_time_machine(max_tokens=-1):  #@save
    """Return token indices and the vocabulary of the time machine dataset."""
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    # Since each text line in the time machine dataset is not necessarily a
    # sentence or a paragraph, flatten all text lines into a single list
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab

corpus, vocab = load_corpus_time_machine()
len(corpus), len(vocab)
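Because corpus stores character indices, to_tokens can map them back to characters. Decoding the first 29 indices should reproduce the opening line (a quick sanity check, not in the original post):

print(''.join(vocab.to_tokens(corpus[:29])))
# the time machine by h g wells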
