Text serialization with WordSequence (word2sequence)

'''
文本序列化
'''

class WordSequence():
    """Map word tokens to integer ids and back (text serialization).

    The vocabulary reserves id 0 for padding and id 1 for unknown words.
    Typical usage: call ``fit`` on every sentence, then ``build_vocab``,
    then ``transform`` / ``inverse_transform``.
    """
    UNK_TAG = "<UNK>"  # token for out-of-vocabulary words
    PAD_TAG = "<PAD>"  # token used to pad short sentences
    UNK = 1            # fixed id of UNK_TAG
    PAD = 0            # fixed id of PAD_TAG

    def __init__(self):
        # word -> id mapping; the two special tokens get fixed ids
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        # word -> frequency, accumulated by fit()
        self.count = {}

    def fit(self, sentence):
        '''
        Accumulate word frequencies from one tokenized sentence.
        :param sentence: iterable of word tokens
        :return: None
        '''
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min_count=0, max_count=None, max_features=None):
        """
        Build the word->id dictionary from the accumulated counts.
        :param min_count: keep only words with frequency > min_count
        :param max_count: keep only words with frequency < max_count
        :param max_features: keep at most this many most-frequent words
        :return: None
        """
        if min_count is not None:
            self.count = {word: cnt for word, cnt in self.count.items() if cnt > min_count}
        if max_count is not None:
            self.count = {word: cnt for word, cnt in self.count.items() if cnt < max_count}
        if max_features is not None:
            # sort by frequency (descending) and keep the top max_features words
            self.count = dict(sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_features])

        for word in self.count:
            self.dict[word] = len(self.dict)  # each word gets the next free id

        # flipped dict: id -> word, used by inverse_transform
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None):
        '''
        Convert a tokenized sentence into a list of integer ids.
        :param sentence: list of word tokens
        :param max_len: if given, truncate or pad the sentence to this length
        :return: list of int ids (unknown words map to UNK)
        '''
        # Original code dereferenced max_len unconditionally and crashed
        # when it was None; only truncate/pad when a length is requested.
        if max_len is not None:
            if len(sentence) > max_len:
                sentence = sentence[:max_len]
            else:
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))
        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, indices):
        """
        Convert a sequence of ids back into word tokens.
        :param indices: iterable of int ids
        :return: list of words (unknown ids map to UNK_TAG)
        """
        return [self.inverse_dict.get(i, self.UNK_TAG) for i in indices]

    def __len__(self):
        # vocabulary size, including the PAD/UNK special tokens
        return len(self.dict)

if __name__ == '__main__':
    # Small smoke test: build a vocabulary from two sentences, then
    # round-trip a sentence through transform / inverse_transform.
    corpus = [["今天","天气","很","好"],
              ["今天","去","吃","什么"]]

    ws = WordSequence()
    for sent in corpus:
        ws.fit(sent)
    ws.build_vocab(min_count=0)
    print(ws.dict)

    encoded = ws.transform(["好","热","呀","呀","呀","呀","呀","呀","呀"], max_len=5)
    print(encoded)
    print(ws.inverse_transform(encoded))

Corpus serialization: build the vocabulary from the full dataset and save it with pickle.

 

from word_sequence import WordSequence
from dataset import get_dataloader
import pickle
from tqdm import tqdm

if __name__ == '__main__':
    # Build the vocabulary over every review in both the train and test
    # splits, then persist the fitted WordSequence for later reuse.
    ws = WordSequence()
    train_data = get_dataloader(True)
    test_data = get_dataloader(False)
    for reviews, labels in tqdm(train_data, total=len(train_data)):
        for review in reviews:
            ws.fit(review)
    for reviews, labels in tqdm(test_data, total=len(test_data)):
        for review in reviews:
            ws.fit(review)

    ws.build_vocab()
    print(len(ws))
    # BUG FIX: pickle requires a binary-mode file object. The original
    # open("./models/ws.pkl") opened for text reading (mode "r") and
    # would raise; it also leaked the file handle. Use a context manager
    # with mode "wb".
    with open("./models/ws.pkl", "wb") as f:
        pickle.dump(ws, f)

 

  

 

Guess you like

Origin www.cnblogs.com/LiuXinyu12378/p/12319312.html