How to implement text classification with a decision tree:
1. Word segmentation module:
a. Manually curate a custom dictionary.
b. Segment with the custom dictionary and our own segmentation code (while adding dictionary words, also delete the words that interfere with them).
2. One-hot encoding:
While building the one-hot vectors, merge synonyms so that all synonyms share the same id (a minimal sketch follows this list).
3. Train the classifier with sklearn's decision tree, and tune it with sklearn's automated parameter search (the listing below fits fixed hyperparameters; a GridSearchCV sketch appears as a comment inside decisionTreeClf).
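A minimal sketch of step 2's idea, with a hypothetical two-entry synonym table (the real table is read from 同义词.txt in the code below): synonyms collapse to one canonical word, so they share one one-hot id. The names canon and word_id are illustrative only.

canon = {"大夫": "医生", "医师": "医生"}   # synonym -> canonical word
vocab = {}                                 # canonical word -> one-hot id

def word_id(w):
    w = canon.get(w, w)                    # merge synonyms first
    return vocab.setdefault(w, len(vocab))

print([word_id(w) for w in ["医生", "大夫", "挂号"]])  # -> [0, 0, 1]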
The full code listing:
#!/usr/bin/env python
# coding=utf-8
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split  # cross_validation was removed in sklearn 0.20
import pandas as pd
import jieba
import pydot
from io import StringIO  # sklearn.externals.six is gone from modern sklearn
import re
# jieba.add_word("导诊台")
# jieba.del_word("机能")
# jieba.add_word("自助机")
# jieba.add_word("胶片袋")
# jieba.add_word("巡诊室")
# jieba.del_word("片东院")
# jieba.add_word("东院")
class Node:
    def __init__(self, key=None, value=None, condition=None, left_id=None, right_id=None):
        self.key = key
        self.condition = condition
        self.value = value
        self.left_id = left_id
        self.right_id = right_id
def segment():
    """
    Segmentation: cut every training sentence with jieba and the custom
    dictionary, then delete any segmented word that straddles the boundary
    of a dictionary word (step 1b above).
    :return: None; it only adjusts jieba's in-memory dictionary.
    """
    typedict = {0: str}  # read the first column as strings
    # result_file = open("segment_result.txt", "w", encoding="utf8")
    train_data = pd.read_csv("E:/协和问答系统/SenLiu/voice.csv", dtype=typedict)
    jieba.load_userdict("E:/协和问答系统/SenLiu/words.txt")
    for row in train_data.index:
        datas = train_data.loc[row].values[0]
        # Empty cells come back from pandas as NaN (a float); skip them
        # before calling jieba, which only accepts strings.
        if type(datas) == float:
            continue
        words = list(jieba.cut(datas))
        # Collect the start/end character offsets of every dictionary word
        # that occurs in this sentence.
        del_set = []
        words_file = open("E:/协和问答系统/SenLiu/words.txt", "r", encoding="utf8")
        for word in words_file:
            if word.strip() != "":
                all_matches = re.finditer(word.strip(), datas)
                for match in all_matches:
                    del_set.append(match.span()[0])
                    del_set.append(match.span()[1])
        words_file.close()
        # Remove any segmented word whose span crosses a dictionary-word
        # boundary, so jieba will no longer produce it as one token.
        begin = 0
        end = len(words[0])
        length = len(words)
        for i in range(length):
            for index in del_set:
                if begin < index and end > index:
                    jieba.del_word(words[i])
            if i != length - 1:
                begin = end
                end = end + len(words[i + 1])
# for row in train_data.index:
# datas = train_data.loc[row].values[0]
# if type(datas) != float:
# words = jieba.posseg.cut(datas)
# line = ''
# for word in words:
# line = line + word.word + " "
# result_file.write(line+"\n")
# result_file.close()
def clean_data(word_set):
    segment()
    # Build the synonym dictionary: each comma-separated line of 同义词.txt
    # maps, as one string, to its first word (the canonical form).
    tongyici_file = open("E:/协和问答系统/SenLiu/同义词.txt", "r", encoding="utf8")
    word_dict = {}
    for line in tongyici_file:
        words = line.strip().split(",")
        word_dict[line.strip()] = words[0]
    tongyici_file.close()
train_data = pd.read_csv("E:/协和问答系统/SenLiu/voice.csv")
data_index = {}
word_dict_kv = {}
    for row in train_data.index:
        datas = train_data.loc[row].values[0]
        if type(datas) == float:  # skip empty cells, as in segment()
            continue
        words = set(jieba.cut(datas))
        data_index[row] = [words, train_data.loc[row].values[1]]
        for word in words:
            # drop stop words (substring test against a fixed list)
            if word not in "的,了,还是,吗,或,呢":
                # synonym merging: add the canonical word instead
                is_new = True
                for keys in word_dict.keys():
                    if word in keys:
                        word_set.add(word_dict[keys])
                        is_new = False
                if is_new:
                    if word != "":
                        word_set.add(word)
    word_set_new = list(word_set)
    length = len(word_set)
    print("length", length)
    # Assign every word an integer id; this is the one-hot vocabulary.
    # (sklearn's OneHotEncoder could be used here instead.)
    file = open("key_value.txt", "w", encoding="utf8")
    for i in range(length):
        word_dict_kv[i] = word_set_new[i]
        file.write(str(i) + " " + word_set_new[i] + "\n")
    file.close()
word_dict_vk = dict(zip(word_dict_kv.values(), word_dict_kv.keys()))
    # The cleaned data set: row -> [set of word ids, label]
    data_clean = {}
    for key in data_index.keys():
        lists = set()
        for word in data_index[key][0]:
            # merge synonyms into the canonical word
            for keys in word_dict.keys():
                if word in keys:
                    word = word_dict[keys]
            # stop words never got an id, so this test also drops them
            if word in word_dict_vk.keys():
                lists.add(word_dict_vk[word])
        data_clean[key] = [lists, data_index[key][1]]
train = []
label = []
for key in data_clean.keys():
label.append(data_clean[key][1])
lists = []
        for i in range(length):
if i in data_clean[key][0]:
lists.append(1)
else:
lists.append(0)
train.append(lists)
    return train, label, word_dict_kv, train_data
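# loadDataSet below is a leftover demo loader (the classic fat/thin sklearn
# example); nothing in this script calls it.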
def loadDataSet():
data = []
label = []
with open('fat.txt') as file:
for line in file:
tokens = line.strip().split(' ')
data.append([float(tk) for tk in tokens[:-1]])
label.append(tokens[-1])
x = np.array(data)
print('x:')
print(x)
label = np.array(label)
y = np.zeros(label.shape)
y[label == 'fat'] = 1
print('y:')
print(y)
return x, y
def decisionTreeClf():
    wordset = set()
    x, y, word_dict_kv, train_data = clean_data(wordset)
    # split into training and test sets (disabled: we train on the full set)
    all_score = 0.0
    # for i in range(100):
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    # use Gini impurity as the split criterion
    clf = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=10)
clf.fit(x, y)
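    # Step 3 above mentions sklearn's automated tuning; this script never
    # runs it, so here is a minimal commented sketch (the parameter grid is
    # an assumption, not taken from the original):
    # from sklearn.model_selection import GridSearchCV
    # param_grid = {"criterion": ["gini", "entropy"],
    #               "min_samples_leaf": [1, 5, 10, 20],
    #               "max_depth": [None, 5, 10]}
    # search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv=5)
    # search.fit(x, y)
    # clf = search.best_estimator_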
    dot_data = StringIO()
    with open("iris.dot", 'w') as f:
        tree.export_graphviz(clf, out_file=f)
    tree.export_graphviz(clf, out_file=dot_data)
    # pydot.graph_from_dot_data returns a list of graphs
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph[0].write_pdf("ex.pdf")
# Image(graph.create_png())
    # print how much each feature contributes to the classification
    # print the test results
# build_tree()
    answer = clf.predict(x)
    all_score = all_score + np.mean(answer == y)  # accuracy on the training set
    print(all_score)
    # # print the decision path
# for array in x_train:
# print("------------------------")
# path=clf.decision_path([array])
# # for
# print(clf.tree_)
return word_dict_kv,x,train_data
def build_tree(path):
    """
    Parse the graphviz .dot file exported by sklearn back into a dict of
    Node objects keyed by node id.
    """
    file = open(path, "r", encoding="utf8")
    all_lines = []
    for line in file:
        all_lines.append(line.strip()[0:-2])
    file.close()
    length = len(all_lines)
    node_dict = {}
    fathers_sons = []
    for line_index in range(2, length - 1):
        if "->" in all_lines[line_index]:
            # edge line: "parent -> child ..."
            fathers_son = []
            father_son = all_lines[line_index].split("->")
            father = father_son[0]
            son = father_son[1].strip().split(" ")[0]
            fathers_son.append(father)
            fathers_son.append(son)
            fathers_sons.append(fathers_son)
        else:
            # node line: pull the split condition and the class counts
            # out of the label string
            node_data = all_lines[line_index].split(" ")
            other_data = "".join(node_data[1:])[8:-2].split("value=")
            condition = other_data[0].split("\\")[0]
            value = other_data[1].replace("\\n", ",")
            # print(value)
            if "X" not in condition:
                condition = None  # leaf nodes carry no split condition
            node_dict[node_data[0]] = Node(key=node_data[0], value=value, condition=condition)
    # the first edge out of a parent is its left (condition-True) branch,
    # the second its right (condition-False) branch
    for father_son in fathers_sons:
        if node_dict[father_son[0].strip()].left_id is None:
            node_dict[father_son[0].strip()].left_id = father_son[1].strip()
        else:
            node_dict[father_son[0].strip()].right_id = father_son[1].strip()
    return node_dict
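# Driver: train the tree, parse iris.dot back into Node objects, then walk
# every training sample down the tree, recording its word-presence tests.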
word_dict_kv, x, train_data = decisionTreeClf()
length = len(x)
node_dict = build_tree("iris.dot")
file = open("Decision_rules.txt", "w", encoding="utf8")
for i in range(length):
    X = x[i]
    node = "0"  # the root node of the .dot graph is always "0"
    result = "\n" + train_data.loc[i].values[0] + "|"
    # Walk the sample down the tree. A condition looks like "X[12]<=0.5";
    # it is True when the word is absent (feature value 0), which is the
    # left branch, so the rule records word:False.
    while node_dict[node].left_id is not None:
        condition = node_dict[node].condition
        pattern = re.compile(r'[[](.*?)[]]')
        index = int(re.findall(pattern, condition)[0])
        if eval(condition):  # eval sees the local X defined above
            node = node_dict[node].left_id
            result = result + word_dict_kv[index] + ":" + str(False) + "|"
        else:
            node = node_dict[node].right_id
            result = result + word_dict_kv[index] + ":" + str(True) + "|"
    # A leaf's value holds the per-class sample counts; keep the rule only
    # when the leaf is pure (at most one class has samples).
    a = eval(node_dict[node].value)
    b = []
    for j in range(len(a)):
        c = int(a[j])
        if c > 0:
            b.append(c)
    if len(b) <= 1:
        file.write(result)
file.close()
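# Each line of Decision_rules.txt reads:
#   <sentence>|<word>:True|<word>:False|...
# i.e. the word-presence tests along the path to a (near-)pure leaf.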
# file=open("result.text","w",encoding="utf8")
# for word,important in zip(wordset,importances):
# file.write(word+","+str(important)+"\n")
# file.flush()
# file.close()