tf-idf实例二

#!/usr/bin/env python

# -*- coding:utf-8 -*-

import numpy as np
import pandas as pd
from collections import Counter
# Two toy documents used throughout the TF-IDF walkthrough.
docA = 'The cat sat on my bed'
docB = 'The dog sat on my knees'

# Split each document into a bag of words on single spaces.
bowA = docA.split(' ')
bowB = docB.split(' ')
print(bowA,bowB)
# Vocabulary: every distinct word that occurs in either document.
wordSet = set(bowA).union(set(bowB))
print("wordSet :",type(wordSet),wordSet)
# One zero-initialised count table per document, keyed by the shared
# vocabulary so both tables have identical keys.
wordDictA = dict.fromkeys(wordSet, 0)
wordDictB = dict.fromkeys(wordSet, 0)
print(wordDictA,wordDictB)

# Tally how often each word occurs in its own document.
for word in bowA:
    wordDictA[word] += 1
for word in bowB:
    wordDictB[word] += 1
print("#####",wordDictA,wordDictB)
print("counter bowA:",Counter(bowA))
# Raw count matrix: one row per document, one column per vocabulary word.
pd_tf = pd.DataFrame([wordDictA, wordDictB])
print(pd_tf)
def computeTF(wordDict, bow):
    """Compute the term frequency of every vocabulary word for one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bow: The document's bag of words (sequence of tokens). Only its
            length is used, as the normalising total.

    Returns:
        A new dict mapping each key of ``wordDict`` to ``count / len(bow)``.
        When ``bow`` is empty every frequency is ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    nbowCount = len(bow)
    if nbowCount == 0:
        # An empty document contains no words, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / nbowCount for word, count in wordDict.items()}
# Term frequencies for each document, based on its own count table and length.
tfA = computeTF(wordDictA, bowA)
tfB = computeTF(wordDictB, bowB)
print("tfA:",wordDictA,bowA,tfA)
print("tfB:",tfB)

def computeIDF(wordDictList):
    """Compute a smoothed inverse document frequency for every word.

    Args:
        wordDictList: Non-empty list of per-document count dicts
            (word -> occurrences). The keys of the first dict define the
            vocabulary, so every dict is expected to share those keys.

    Returns:
        A dict mapping each word to ``log10((N + 1) / (Ni + 1))``, where
        ``N`` is the number of documents and ``Ni`` the number of documents
        in which the word occurs at least once.

    Raises:
        IndexError: If ``wordDictList`` is empty.
        KeyError: If a later dict has a word with a positive count that is
            missing from the first dict.
    """
    import math

    print(wordDictList)
    print(wordDictList[0])
    # Document-frequency table: every vocabulary word starts at 0.
    idfDict = dict.fromkeys(wordDictList[0], 0)
    print("idfDict:",idfDict)
    # Total number of documents.
    N = len(wordDictList)
    for wordDict in wordDictList:
        # Count, for each word, the documents that contain it (Ni).
        for word, count in wordDict.items():
            if count > 0:
                idfDict[word] += 1
    print("2 idfDict:",idfDict)
    # Every Ni is known now; convert each to its IDF value using the formula.
    return {word: math.log10((N + 1) / (Ni + 1)) for word, Ni in idfDict.items()}
# IDF is computed once over the whole corpus (both count tables).
print("idfs wordDictA, wordDictB :",[wordDictA, wordDictB])
idfs = computeIDF([wordDictA, wordDictB])
print("idfs:",idfs)

def computeTFIDF(tf, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency for the corpus.

    Returns:
        A new dict mapping each word in ``tf`` to ``tf[word] * idfs[word]``.

    Raises:
        KeyError: If a word in ``tf`` has no entry in ``idfs``.
    """
    return {word: tfval * idfs[word] for word, tfval in tf.items()}
# TF-IDF scores per document: that document's TF times the corpus-wide IDF.
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

print("##########")
print(type(tfidfA))
print(tfidfA)
# One row per document, one column per vocabulary word.
df_list = [tfidfA, tfidfB]
print(df_list,type(df_list))
print("resutl:",type(pd.DataFrame(df_list)))
print(pd.DataFrame(df_list))

发布了114 篇原创文章 · 获赞 18 · 访问量 3万+

猜你喜欢

转载自blog.csdn.net/WangYouJin321/article/details/103987127