# encoding=utf-8
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
# Toy corpus of four short documents. Some words are deliberately repeated
# inside a document ("This This", "second second") so that their term
# counts exceed one.
corpus = [
    'This This is the first document.',
    'This This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Learn the vocabulary from the corpus and build the TF-IDF weighted
# document-term matrix in a single step (returned as a sparse matrix).
tfidf_model = TfidfVectorizer()
tfidf_matrix = tfidf_model.fit_transform(corpus)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2; `get_feature_names_out()` is its replacement. The new method returns
# a NumPy array, so convert it with `.tolist()` to keep `word_dict` a plain
# list of strings (and the printed form identical to the old behaviour).
# Fall back to the legacy method on installations that predate 1.0.
if hasattr(tfidf_model, 'get_feature_names_out'):
    word_dict = tfidf_model.get_feature_names_out().tolist()
else:
    word_dict = tfidf_model.get_feature_names()

# Vocabulary terms; the list index of a term is its column in the matrix.
print(word_dict)
# Non-zero entries of the sparse matrix as (document row, term column) weight.
print(tfidf_matrix)
Experimental results (console output of the script above):
"C:\Program Files\Anaconda3\python.exe" D:/pycharmprogram/csgwork/find_classification_keys/test_tfidfVectorizer.py
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
(0, 8) 0.6986804246371375
(0, 3) 0.34934021231856877
(0, 6) 0.2856085141790751
(0, 2) 0.43150466158747897
(0, 1) 0.34934021231856877
(1, 8) 0.49256714844677196
(1, 3) 0.24628357422338598
(1, 6) 0.20135295972313796
(1, 1) 0.24628357422338598
(1, 5) 0.7717016211057586
(2, 6) 0.2884767487500274
(2, 0) 0.5528053199908667
(2, 7) 0.5528053199908667
(2, 4) 0.5528053199908667
(3, 8) 0.4387767428592343
(3, 3) 0.4387767428592343
(3, 6) 0.35872873824808993
(3, 2) 0.5419765697264572
(3, 1) 0.4387767428592343
Process finished with exit code 0