"""Text serialization: map tokens to integer ids and back."""


class WordSequence:
    """Build a word→index vocabulary from token lists and convert
    sentences (lists of tokens) to/from sequences of integer ids.

    Index 0 is reserved for padding, index 1 for unknown words.
    """

    UNK_TAG = "<UNK>"
    PAD_TAG = "<PAD>"
    UNK = 1
    PAD = 0

    def __init__(self):
        # Word → id mapping; reserved tokens occupy ids 0 and 1.
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD,
        }
        # Word → raw frequency, accumulated by fit().
        self.count = {}

    def fit(self, sentence):
        """Accumulate word frequencies from one tokenized sentence.

        :param sentence: iterable of tokens (words)
        """
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min_count=0, max_count=None, max_features=None):
        """Construct the dictionary from accumulated counts.

        :param min_count: keep words with frequency strictly above this
        :param max_count: keep words with frequency strictly below this
        :param max_features: keep at most this many most-frequent words
        """
        if min_count is not None:
            self.count = {word: count for word, count in self.count.items()
                          if count > min_count}
        if max_count is not None:
            self.count = {word: count for word, count in self.count.items()
                          if count < max_count}
        if max_features is not None:
            # Keep only the max_features most frequent words.
            self.count = dict(
                sorted(self.count.items(), key=lambda x: x[-1], reverse=True)
                [:max_features]
            )
        for word in self.count:
            # Each word gets the next free integer id.
            self.dict[word] = len(self.dict)
        # Inverted mapping: id → word, used by inverse_transform().
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None):
        """Convert a tokenized sentence into a sequence of ids.

        Unknown words map to UNK. When max_len is given, the sentence is
        truncated or right-padded with PAD to exactly max_len tokens.

        :param sentence: list of tokens
        :param max_len: target length, or None to keep the original length
        :return: list of int ids
        """
        # BUG FIX: the original compared len(sentence) > max_len without a
        # None guard, raising TypeError whenever max_len was omitted.
        if max_len is not None:
            if len(sentence) > max_len:
                sentence = sentence[:max_len]
            else:
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))
        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, incides):
        """Convert a sequence of ids back into tokens.

        Ids not present in the vocabulary map to the UNK tag.

        :param incides: iterable of int ids
        :return: list of tokens
        """
        return [self.inverse_dict.get(i, self.UNK_TAG) for i in incides]

    def __len__(self):
        return len(self.dict)


if __name__ == '__main__':
    sentences = [["今天", "天气", "很", "好"],
                 ["今天", "去", "吃", "什么"]]
    ws = WordSequence()
    for sentence in sentences:
        ws.fit(sentence)
    ws.build_vocab(min_count=0)
    print(ws.dict)
    ret = ws.transform(["好", "热", "呀", "呀", "呀", "呀", "呀", "呀", "呀"], max_len=5)
    print(ret)
    ret = ws.inverse_transform(ret)
    print(ret)
# Corpus serialization and save
"""Fit a WordSequence vocabulary over the train and test corpora and pickle it."""
from word_sequence import WordSequence
from dataset import get_dataloader
import pickle
from tqdm import tqdm

if __name__ == '__main__':
    ws = WordSequence()
    train_data = get_dataloader(True)
    test_data = get_dataloader(False)
    # Accumulate word frequencies over every review in both splits.
    for reviews, labels in tqdm(train_data, total=len(train_data)):
        for review in reviews:
            ws.fit(review)
    for reviews, labels in tqdm(test_data, total=len(test_data)):
        for review in reviews:
            ws.fit(review)
    ws.build_vocab()
    print(len(ws))
    # BUG FIX: the original called open("./models/ws.pkl") with no mode,
    # i.e. text-read 'r' — pickle.dump requires a binary-write handle, so
    # this raised at runtime and also leaked the file object. The context
    # manager guarantees the file is flushed and closed.
    with open("./models/ws.pkl", "wb") as f:
        pickle.dump(ws, f)