[Frontiers of Computer Science] Chapter 7 Answers 2022 - Text Analysis

Chapter 7

7.1 Reading and writing of strings and text files

7.1.1 Creation of strings

# Create a string by binding a string literal to the name `string`.
string = "Hello World!"

7.1.2 Printing strings

# print() works on a variable holding a string as well as on a string literal.
# `string` is expected to have been defined by an earlier snippet.
print(string)
print("1234")

7.1.3 Reading strings from text files

# readtext is a helper provided by the course environment (not defined in this
# file); per the section title it reads a text file into a string.
text_1 = readtext("./nlp/data/textbooks/grade0/text0.txt")

# Show what was read.
print(text_1)

7.1.4 Word segmentation

# Read the article, then segment it with the course helper splitwords.
# NOTE(review): splitwords presumably returns a list of word strings - confirm.
text = readtext("./nlp/data/textbooks/grade0/text0.txt")
words = splitwords(text)

7.1.5 Lists and their common operations

# A list may hold values of one type (l1) or mixed types, including other lists (l2).
l1 = [1, 3, 5, 7, 9, 11]
l2 = [1, 'a', ['3.14', 1.5], 'bc']

# Indexing is zero-based: l1[2] is 5 and l2[3] is 'bc'.
print(l1[2])
print(l2[3])

# Lists are mutable: assigning to an index replaces that element in place.
l1[1] = 20
print(l1)

# append() adds one element to the end of the list.
l2.append('a')
print(l2)

# len() returns the number of top-level elements (the nested list counts as one).
print(len(l1))
print(len(l2))

# Iterating over a list yields its elements in order.
for i in l1:
    print(i)

7.2 Dictionary and word frequency statistics

7.2.1 Creating a dictionary

# Create a dictionary from a literal: each entry maps a word (key) to a
# sample frequency value.
word_freq_dict = {
    'you': 0.098,
    'the': 0.020,
    'friend': 0.059,
}
print(word_freq_dict)

7.2.2 Looking up a key

# Subscripting a dict with a key returns the stored value
# (a missing key would raise KeyError).
print(word_freq_dict['you'])

7.2.3 Insertion and modification of key-value pairs

# Assigning to an existing key overwrites its value.
word_freq_dict['you'] = 0.088
print(word_freq_dict)

# Assigning to a key that is not present yet inserts a new key-value pair.
word_freq_dict['her'] = 0.0392
print(word_freq_dict)

7.2.4 Checking whether a key is in the dictionary

# The `in` operator tests membership among the dictionary's keys (not its values)
# and evaluates to True or False.
print('you' in word_freq_dict)
print('he' in word_freq_dict)

7.2.5 Dictionary traversal

# Walk over every key-value pair and print it as "key:value".
for key, value in word_freq_dict.items():
    print(f"{key}:{value}")
    

7.2.6 Using dictionary type to count word frequency

def word_freq(words):
    """Return the relative frequency of every distinct word in *words*.

    Args:
        words: A sequence of hashable items (typically word strings).

    Returns:
        A dict mapping each distinct word, in order of first appearance, to
        its occurrence count divided by ``len(words)``. An empty input
        yields an empty dict.
    """
    counts = {}
    for token in words:
        counts[token] = counts.get(token, 0) + 1
    # Normalise the raw counts by the total number of words.
    return {token: count / len(words) for token, count in counts.items()}
        
# End-to-end example: read an article, segment it into words, and compute
# each word's relative frequency with word_freq.
text = readtext("nlp/data/textbooks/grade0/text0.txt")
words = splitwords(text)
freq_dict = word_freq(words)
print(freq_dict)

7.3 Text Article Feature Extraction

7.3.1 Read text data

# load_textbooks_data is a course helper (not defined in this file).
# get_diff_level later iterates its result as (path, grade) pairs.
textbooks_data = load_textbooks_data()

# Number of entries loaded.
print(len(textbooks_data))

# Peek at the first four entries.
print(textbooks_data[0:4])

7.3.2 Building a vocabulary list

def get_diff_level(path_grade):
    """Build a vocabulary mapping each word to the lowest grade it appears in.

    Args:
        path_grade: Iterable of ``(path, grade)`` pairs; ``path`` is passed to
            ``readtext`` and ``grade`` is converted with ``int``.

    Returns:
        A dict mapping every word seen in any text to the smallest grade
        (as an int) among the texts that contain it.
    """
    lowest_grade = {}
    for textbook_path, raw_grade in path_grade:
        tokens = splitwords(readtext(textbook_path))
        level = int(raw_grade)
        for token in tokens:
            # Record the word the first time it is seen, or when this text
            # belongs to a lower grade than the one recorded so far.
            if token not in lowest_grade or lowest_grade[token] > level:
                lowest_grade[token] = level
    return lowest_grade

# Build the word -> lowest-grade vocabulary from the textbook data and show it.
diff_level = get_diff_level(textbooks_data)

print(diff_level)

7.3.3 Save and load vocabulary list

# save_private / load_private are course helpers (not defined in this file).
# Here the vocabulary is stored under a path and then read back from the same path.
save_private(diff_level, "./data/tmp/diff_level")

diff_level = load_private("./data/tmp/diff_level")

# Print the reloaded vocabulary to check the round trip.
print(diff_level)

7.3.4 Word segmentation

# Read one article from the reading training set.
text = readtext("nlp/data/reading/train/text0.txt")

print(text)

# Segment the article into words.
words = splitwords(text)

# Number of items produced by the segmentation.
print(len(words))

7.3.5 Count the occurrences of words of each grade in articles

# One counter per grade, written out explicitly: twelve zeros.
grade_freq = [0,0,0,0,0,0,0,0,0,0,0,0]

# List repetition: `[x]*n` builds a list containing x repeated n times.
l1 = [0]*3 
l2 = ['a']*5 

print(l1)
print(l2)

# The same twelve-zero list, built with repetition instead of a long literal.
grade_freq = [0]*12

# For every word of the article that is in the vocabulary, increment the
# counter at the index given by that word's grade; unknown words are skipped.
# NOTE(review): assumes every grade in diff_level lies in 0..11 - a larger
# value would raise IndexError; confirm against the textbook data.
for word in words:
    if word in diff_level:
        grade = diff_level[word]
        grade_freq[grade] += 1
    else:
        continue

print(grade_freq)

7.3.6 Count the occurrence frequency of words in each grade in articles

# Total number of article words that were found in the vocabulary.
num = sum(grade_freq)

print(num)

# Turn each count into a proportion of the total, in place.
# NOTE(review): raises ZeroDivisionError when num is 0, i.e. when no word of
# the article is in the vocabulary.
for i in range(12):    
    grade_freq[i] /= num
    
print(grade_freq)

7.3.7 Article Feature Extraction

def extract_features(path, diff_level, num_grades=12):
    """Return the per-grade word-frequency feature vector of one article.

    The article at *path* is read and segmented. Every word found in
    *diff_level* increments the counter of that word's grade, and the
    counters are then divided by their sum.

    Args:
        path: Location of the article, passed to ``readtext``.
        diff_level: Mapping from word to grade index
            (``0 <= grade < num_grades``).
        num_grades: Length of the feature vector. Defaults to 12, the value
            that was previously hard-coded.

    Returns:
        A list of ``num_grades`` floats. When at least one word is in
        *diff_level* they are proportions summing to 1; when none is, the
        list is all zeros instead of raising ``ZeroDivisionError``.
    """
    text = readtext(path)
    words = splitwords(text)
    grade_freq = [0] * num_grades
    for word in words:
        if word in diff_level:
            grade_freq[diff_level[word]] += 1
    num = sum(grade_freq)
    if num == 0:
        # No known word in the article: there is nothing to normalise, so
        # return an all-zero vector rather than dividing by zero.
        return [0.0] * num_grades
    return [count / num for count in grade_freq]

# Extract the per-grade frequency features of a single training article.
grade_freq = extract_features("nlp/data/reading/train/text1.txt", diff_level)

print(grade_freq)

7.4 Text Difficulty Classification

7.4.1 Load new vocabulary list

# Reload the vocabulary that was stored earlier under ./data/tmp/diff_level.
diff_level = load_private("./data/tmp/diff_level")

7.4.2 Loading the dataset

# load_train_data / load_test_data are course helpers (not defined in this file).
# Their results are iterated below as (path, label) pairs.
train_data = load_train_data()

# Size of the training set.
print(len(train_data))

# First five training entries.
print(train_data[0:5])

# Same inspection for the test set.
test_data = load_test_data()
print(len(test_data))
print(test_data[0:5])

7.4.3 Extract features and difficulty level and save

# Inline version of the feature/label extraction: one feature vector and one
# integer label per (path, label) pair of the training data.
features = []
labels = []

for path, label in train_data:
    features.append(extract_features(path, diff_level))
    labels.append(int(label))
    
def get_feats_labels(data, diff_level):
    """Extract a feature vector and an integer label for every item of *data*.

    Args:
        data: Iterable of ``(path, label)`` pairs.
        diff_level: Vocabulary forwarded to ``extract_features``.

    Returns:
        A ``(features, labels)`` tuple of two lists in the order of *data*:
        the result of ``extract_features(path, diff_level)`` for each item
        and ``int(label)`` for each item.
    """
    # Evaluate feature and label together per item, then split the pairs
    # into the two parallel lists.
    pairs = [
        (extract_features(article_path, diff_level), int(raw_label))
        for article_path, raw_label in data
    ]
    features = [feature for feature, _ in pairs]
    labels = [target for _, target in pairs]
    return features, labels

# Compute features and labels for the whole training set.
train_feats, train_labels = get_feats_labels(train_data, diff_level)

print(train_feats[0:5])
print(train_labels[0:5])

# Store features and labels together as a two-element list.
save_private([train_feats, train_labels], "./data/tmp/train_features")

# Read them back from the same path and unpack the pair to check the round trip.
train_feats, train_labels = load_private("./data/tmp/train_features")
print(train_feats[0:5])
print(train_labels[0:5])

# Same extraction and storage for the test set.
test_feats, test_labels = get_feats_labels(test_data, diff_level)
save_private([test_feats, test_labels], "./data/tmp/test_features")

7.4.4 Saving features and difficulty levels for the binary (primary vs. junior) dataset

# Load the binary (primary vs. junior) training split.
train_data = load_binary_train_data("primary", "junior")

print(len(train_data))

print(train_data[0:5])

# Recompute the training features from the binary split. Without this step
# train_feats/train_labels would still hold the values extracted from the
# full multi-class training set, and those would be saved (and later used
# for training) under the binary file name.
train_feats, train_labels = get_feats_labels(train_data, diff_level)

# Load the binary test split and extract its features the same way.
test_data = load_binary_test_data("primary", "junior")
test_feats, test_labels = get_feats_labels(test_data, diff_level)

# Store both binary feature sets.
save_private([train_feats, train_labels], "./data/tmp/pri_jun_train_features")
save_private([test_feats, test_labels], "./data/tmp/pri_jun_test_features")

7.4.5 Using a Linear Classifier to Classify Article Difficulty

# linear_classifier and accuracy are course helpers (not defined in this file).
# Fit the classifier on the training features and labels.
model = linear_classifier()
model.train(train_feats, train_labels)

# Predict labels for the test features and compare them with the true labels.
pred_y = model.pred(test_feats)
acc = accuracy(pred_y, test_labels)
print(acc)

7.5 Extracting Text Features

7.5.1 Loading the dataset

# `data`, `fig` and `plot` come from the course environment (not defined here).
# Fetch the corpus and display the document at index 87.
# NOTE(review): `fig() + plot(...)` appears to be the environment's way of
# rendering an object in a figure - confirm against its documentation.
corpus = data.get('corpus')
doc = corpus[87]
fig() + plot(doc)

7.5.2 Building a bag of words

# Apply split_words to every document of the corpus via the corpus's map method.
# NOTE(review): presumably yields one bag (collection) of words per document - confirm.
word_bags = corpus.map(split_words)

7.5.3 Load stop word vocabulary

# Load the stop-word vocabulary with the course helper and display it.
stop_words = load_stopwords()
fig() + plot(stop_words)

7.5.4 Building a vocabulary

# Build the vocabulary from the word bags, passing the stop-word list and a
# frequency threshold of 5 to the course helper.
# NOTE(review): presumably stop words and words occurring fewer than 5 times
# are excluded - confirm the exact threshold semantics.
vocabulary = build_vocabulary(word_bags, stop_words = stop_words, frequency_threshold = 5) 
fig() + plot(vocabulary)
print('Vocabulary Length: ', len(vocabulary))

7.5.5 Computing term-frequency (tf) vectors

# Create a term-frequency transformer over the vocabulary and apply it to every
# word bag; then display the feature computed for document 87.
tf = TermFrequency(vocabulary)
tf_features = word_bags.map(tf)
feat = tf_features[87]
fig() + plot(feat)

7.5.6 Computing tf-idf vectors

# Create a tf-idf transformer from the vocabulary and the word bags and apply
# it to every word bag.
# NOTE(review): the word bags are presumably needed for the document-frequency
# statistics - confirm.
tfidf = TFIDF(vocabulary, word_bags)
tfidf_features = word_bags.map(tfidf)
# Display the tf-idf feature of document 87.
feat = tfidf_features[87]
fig() + plot(feat)

7.6 Discovering latent topics in text

7.6.1 Get Dataset

# Fetch the prepared 'text-feat' dataset and unpack its four parts: the corpus,
# the vocabulary, the tf features and the tf-idf features.
corpus, vocab, tf_feat, tfidf_feat = data.get('text-feat')

7.6.2 Building document-word matrix

# Convert the tf-idf features into a single matrix with the course helper.
tfidf_mat = to_matrix(tfidf_feat)

# The printed label is Chinese for "document-word matrix size:".
print("文档-词矩阵尺寸:",tfidf_mat.shape)

7.6.3 Nonnegative Matrix Factorization

# Build a topic model over the vocabulary and the tf-idf matrix with 8 topics.
# NOTE(review): per the section title this is a non-negative matrix
# factorization - confirm against the helper's documentation.
model = topic_model(vocab, tfidf_mat, num_topics=8)

# Fit the model.
model.train()

7.6.4 Analysis of decomposition results

# Retrieve the two matrices exposed by the trained model and print their shapes.
# NOTE(review): w_mat is later indexed by document id to obtain topic weights;
# t_mat presumably relates topics to words - confirm.
t_mat = model.tmatrix
w_mat = model.wmatrix

print('Size of T Matrix: ', t_mat.shape)
print('Size of W Matrix: ', w_mat.shape)

7.6.5 Extracting high-frequency words

# Ask the model for its high-frequency words with top_n=5 and display them.
# NOTE(review): presumably the top 5 words per topic - confirm.
high_freqs = model.extract_highfreqs(top_n=5)
fig() + plot(high_freqs)

7.6.6 Reviewing articles, summarizing topics and comparing model results

Reference images (not preserved in this copy):

[image]

[image]

# Index of the document to inspect. It is named doc_id rather than id so the
# builtin id() function is not shadowed.
doc_id = 87
doc = corpus[doc_id]
fig() + plot(doc)

# Row of the W matrix belonging to this document, shown next to the
# high-frequency words so the two can be compared.
# NOTE(review): fig(2, 1) presumably creates a two-panel figure - confirm.
topic_weights = w_mat[doc_id]
fig(2, 1) + [plot(high_freqs), plot(topic_weights)]

Guess you like

Origin blog.csdn.net/m0_68192925/article/details/127556276
Recommended