NLP相似度之tf-idf计算

当然,在学习过程中也是参考了很多其他的资料,代码都是一行一行敲出来的。

一、将多个文件合并成一个文件,避免频繁的打开和关闭

 1 import sys
 2 
 3 for line in sys.stdin:
 4     ss = line.strip().split('\t')
 5     file_name = ss[0].strip()
 6     file_context = ss[1].strip()
 7     word_list = file_context.split(' ')
 8 
 9     word_set = set()
10     for word in word_list:
11         word_set.add(word)
12 
13     for word in word_set:
14         print '\t'.join([word, '1'])

执行下面的命令,就可以得到合并后的文件。(注意:上面贴出的代码与第二节 map 阶段的脚本完全相同,它从标准输入读取数据,并没有使用命令中的目录参数,因此并不是真正的合并脚本 convert.py。)

python convert.py input_tfidf_dir/ > merge_files.data 

tf-idf计算流程图:

二 、计算IDF的值:

map阶段:读取每一行

 1 import sys
 2 
 3 for line in sys.stdin:
 4     ss = line.strip().split('\t')
 5     file_name = ss[0].strip()
 6     file_context = ss[1].strip()
 7     word_list = file_context.split(' ')
 8 
 9     word_set = set()
10     for word in word_list:
11         word_set.add(word)
12 
13     for word in word_set:
14         print '\t'.join([word, '1'])

reduce阶段:按单词累加 map 阶段输出的计数,得到包含该词的文档数 df,再计算 idf = log(N / (df + 1)),其中 N 为文档总数(代码中写死为 508):

 1 import sys
 2 import math
 3 
 4 current_word = None
 5 doc_cnt = 508
 6 count_pool = []
 7 sum = 0
 8 
 9 for line in sys.stdin:
10     ss = line.strip().split('\t')
11     if len(ss) != 2:
12         continue
13 
14     word, val = ss
15     if current_word == None:
16         current_word = word
17     if current_word != word:
18         for count in count_pool:
19             sum += count
20 
21         idf_score = math.log(float(doc_cnt) / (float(sum) + 1))
22         print '\t'.join([current_word, str(idf_score)])
23 
24         current_word = word
25         count_pool = []
26         sum = 0
27 
28     count_pool.append((int(val)))
29 
30 for count in count_pool:
31     sum += count
32 
33 idf_score = math.log(float(doc_cnt) / (float(sum) + 1))
34 print '\t'.join([current_word, str(idf_score)])

三、计算TF的值:

 1 # 计算tf
 2 # 读取合并后的数据
 3 # 执行命令 cat merge_files.data | python map_tf.py mapper_func idf.data
 4 
 5 import sys
 6 
 7 word_dict = {}
 8 idf_dict = {}
 9 
10 # 读取计算的idf数据文件
11 def read_idf_file_func(idf_file_fd):
12     with open() as fd:
13         for line in fd:
14             ss = line.strip().split('\t')
15             if len(ss) != 2:
16                 continue
17             token = ss[0].strip()
18             idf_score = ss[1].strip()
19             idf_dict[token] = float(idf_score)
20     return idf_dict
21 
22 # cat merge_files.data | python map_tf.py mapper_func
23 def mapper_func(idf_file_fd):
24     idf_dict = read_idf_file_func(idf_file_fd)
25     # 标准输入
26     for line in sys.stdin:
27         ss = line.strip().split('\t')
28         file_name = ss[0].strip()
29         file_context = ss[1].strip()
30         word_list = file_context.split(' ')
31 
32         for word in word_list:
33             if word not in word_dict:
34                 word_dict[word] = 1
35             else:
36                 word_dict[word] += 1
37 
38         for k,v in word_dict.item():
39             if k not in idf_dict:
40                 continue
41             print(file_name,k,v,idf_file_fd[k])
42             print(k,v)
43 
44 if __name__ == "__main__":
45     module = sys.modules[__name__]
46     func = getattr(module, sys.argv[1])
47     args = None
48     if len(sys.argv) > 1:
49         args = sys.argv[2:]
50     func(*args)

猜你喜欢

转载自www.cnblogs.com/ssqq5200936/p/10744284.html