How to implement text classification with a decision tree:
1. Word segmentation module:
a. Manually curate a custom dictionary.
b. Segment with the custom dictionary and our own segmentation code (while adding dictionary words, also delete the words that interfere with them).
2. One-hot encoding:
While building the one-hot vectors, merge synonyms so that all synonyms share the same id (a minimal sketch follows this list).
3. Train the classifier with sklearn's decision tree, and tune it with sklearn's automated parameter search (the listing below fits fixed hyperparameters; a GridSearchCV sketch appears as a comment inside decisionTreeClf).
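A minimal sketch of step 2's idea, with a hypothetical two-entry synonym table (the real table is read from 同义词.txt in the code below): synonyms collapse to one canonical word, so they share one one-hot id. The names canon and word_id are illustrative only.

canon = {"大夫": "医生", "医师": "医生"}   # synonym -> canonical word
vocab = {}                                 # canonical word -> one-hot id

def word_id(w):
    w = canon.get(w, w)                    # merge synonyms first
    return vocab.setdefault(w, len(vocab))

print([word_id(w) for w in ["医生", "大夫", "挂号"]])  # -> [0, 0, 1]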
The full code listing:
#!/usr/bin/env python
# coding=utf-8
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split  # cross_validation was removed in sklearn 0.20
import pandas as pd
import jieba
import pydot
from io import StringIO  # sklearn.externals.six is gone from modern sklearn
import re
# jieba.add_word("导诊台")
# jieba.del_word("机能")
# jieba.add_word("自助机")
# jieba.add_word("胶片袋")
# jieba.add_word("巡诊室")
# jieba.del_word("片东院")
# jieba.add_word("东院")
class Node:
    def __init__(self, key=None, value=None, condition=None, left_id=None, right_id=None):
        self.key = key
        self.condition = condition
        self.value = value
        self.left_id = left_id
        self.right_id = right_id
def segment():
    """
    Segmentation: cut every training sentence with jieba and the custom
    dictionary, then delete any segmented word that straddles the boundary
    of a dictionary word (step 1b above).
    :return: None; it only adjusts jieba's in-memory dictionary.
    """
    typedict = {0: str}  # read the first column as strings
    # result_file = open("segment_result.txt", "w", encoding="utf8")
    train_data = pd.read_csv("E:/协和问答系统/SenLiu/voice.csv", dtype=typedict)
    jieba.load_userdict("E:/协和问答系统/SenLiu/words.txt")
    for row in train_data.index:
        datas = train_data.loc[row].values[0]
        # Empty cells come back from pandas as NaN (a float); skip them
        # before calling jieba, which only accepts strings.
        if type(datas) == float:
            continue
        words = list(jieba.cut(datas))
        # Collect the start/end character offsets of every dictionary word
        # that occurs in this sentence.
        del_set = []
        words_file = open("E:/协和问答系统/SenLiu/words.txt", "r", encoding="utf8")
        for word in words_file:
            if word.strip() != "":
                all_matches = re.finditer(word.strip(), datas)
                for match in all_matches:
                    del_set.append(match.span()[0])
                    del_set.append(match.span()[1])
        words_file.close()
        # Remove any segmented word whose span crosses a dictionary-word
        # boundary, so jieba will no longer produce it as one token.
        begin = 0
        end = len(words[0])
        length = len(words)
        for i in range(length):
            for index in del_set:
                if begin < index and end > index:
                    jieba.del_word(words[i])
            if i != length - 1:
                begin = end
                end = end + len(words[i + 1])
# for row in train_data.index:
# datas = train_data.loc[row].values[0]
# if type(datas) != float:
# words = jieba.posseg.cut(datas)
# line = ''
# for word in words:
# line = line + word.word + " "
# result_file.write(line+"\n")
# result_file.close()
def clean_data(word_set):
    segment()
    # Build the synonym dictionary: each comma-separated line of 同义词.txt
    # maps, as one string, to its first word (the canonical form).
    tongyici_file = open("E:/协和问答系统/SenLiu/同义词.txt", "r", encoding="utf8")
    word_dict = {}
    for line in tongyici_file:
        words = line.strip().split(",")
        word_dict[line.strip()] = words[0]
    tongyici_file.close()
train_data = pd.read_csv("E:/协和问答系统/SenLiu/voice.csv")
data_index = {}
word_dict_kv = {}
    for row in train_data.index:
        datas = train_data.loc[row].values[0]
        if type(datas) == float:  # skip empty cells, as in segment()
            continue
        words = set(jieba.cut(datas))
        data_index[row] = [words, train_data.loc[row].values[1]]
        for word in words:
            # drop stop words (substring test against a fixed list)
            if word not in "的,了,还是,吗,或,呢":
                # synonym merging: add the canonical word instead
                is_new = True
                for keys in word_dict.keys():
                    if word in keys:
                        word_set.add(word_dict[keys])
                        is_new = False
                if is_new:
                    if word != "":
                        word_set.add(word)
    word_set_new = list(word_set)
    length = len(word_set)
    print("length", length)
    # Assign every word an integer id; this is the one-hot vocabulary.
    # (sklearn's OneHotEncoder could be used here instead.)
    file = open("key_value.txt", "w", encoding="utf8")
    for i in range(length):
        word_dict_kv[i] = word_set_new[i]
        file.write(str(i) + " " + word_set_new[i] + "\n")
    file.close()
word_dict_vk = dict(zip(word_dict_kv.values(), word_dict_kv.keys()))
    # The cleaned data set: row -> [set of word ids, label]
    data_clean = {}
    for key in data_index.keys():
        lists = set()
        for word in data_index[key][0]:
            # merge synonyms into the canonical word
            for keys in word_dict.keys():
                if word in keys:
                    word = word_dict[keys]
            # stop words never got an id, so this test also drops them
            if word in word_dict_vk.keys():
                lists.add(word_dict_vk[word])
        data_clean[key] = [lists, data_index[key][1]]
train = []
label = []
for key in data_clean.keys():
label.append(data_clean[key][1])
lists = []
        for i in range(length):
if i in data_clean[key][0]:
lists.append(1)
else:
lists.append(0)
train.append(lists)
    return train, label, word_dict_kv, train_data
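# loadDataSet below is a leftover demo loader (the classic fat/thin sklearn
# example); nothing in this script calls it.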
def loadDataSet():
data = []
label = []
with open('fat.txt') as file:
for line in file:
tokens = line.strip().split(' ')
data.append([float(tk) for tk in tokens[:-1]])
label.append(tokens[-1])
x = np.array(data)
print('x:')
print(x)
label = np.array(label)
y = np.zeros(label.shape)
y[label == 'fat'] = 1
print('y:')
print(y)
return x, y
def decisionTreeClf():
    wordset = set()
    x, y, word_dict_kv, train_data = clean_data(wordset)
    # split into training and test sets (disabled: we train on the full set)
    all_score = 0.0
    # for i in range(100):
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    # use Gini impurity as the split criterion
    clf = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=10)
clf.fit(x, y)
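    # Step 3 above mentions sklearn's automated tuning; this script never
    # runs it, so here is a minimal commented sketch (the parameter grid is
    # an assumption, not taken from the original):
    # from sklearn.model_selection import GridSearchCV
    # param_grid = {"criterion": ["gini", "entropy"],
    #               "min_samples_leaf": [1, 5, 10, 20],
    #               "max_depth": [None, 5, 10]}
    # search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv=5)
    # search.fit(x, y)
    # clf = search.best_estimator_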
    dot_data = StringIO()
    with open("iris.dot", 'w') as f:
        tree.export_graphviz(clf, out_file=f)
    tree.export_graphviz(clf, out_file=dot_data)
    # pydot.graph_from_dot_data returns a list of graphs
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph[0].write_pdf("ex.pdf")
# Image(graph.create_png())
    # print how much each feature contributes to the classification
    # print the test results
# build_tree()
    answer = clf.predict(x)
    all_score = all_score + np.mean(answer == y)  # accuracy on the training set
    print(all_score)
    # # print the decision path
# for array in x_train:
# print("------------------------")
# path=clf.decision_path([array])
# # for
# print(clf.tree_)
return word_dict_kv,x,train_data
def build_tree(path):
    """
    Parse the graphviz .dot file exported by sklearn back into a dict of
    Node objects keyed by node id.
    """
    file = open(path, "r", encoding="utf8")
    all_lines = []
    for line in file:
        all_lines.append(line.strip()[0:-2])
    file.close()
    length = len(all_lines)
    node_dict = {}
    fathers_sons = []
    for line_index in range(2, length - 1):
        if "->" in all_lines[line_index]:
            # edge line: "parent -> child ..."
            fathers_son = []
            father_son = all_lines[line_index].split("->")
            father = father_son[0]
            son = father_son[1].strip().split(" ")[0]
            fathers_son.append(father)
            fathers_son.append(son)
            fathers_sons.append(fathers_son)
        else:
            # node line: pull the split condition and the class counts
            # out of the label string
            node_data = all_lines[line_index].split(" ")
            other_data = "".join(node_data[1:])[8:-2].split("value=")
            condition = other_data[0].split("\\")[0]
            value = other_data[1].replace("\\n", ",")
            # print(value)
            if "X" not in condition:
                condition = None  # leaf nodes carry no split condition
            node_dict[node_data[0]] = Node(key=node_data[0], value=value, condition=condition)
    # the first edge out of a parent is its left (condition-True) branch,
    # the second its right (condition-False) branch
    for father_son in fathers_sons:
        if node_dict[father_son[0].strip()].left_id is None:
            node_dict[father_son[0].strip()].left_id = father_son[1].strip()
        else:
            node_dict[father_son[0].strip()].right_id = father_son[1].strip()
    return node_dict
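# Driver: train the tree, parse iris.dot back into Node objects, then walk
# every training sample down the tree, recording its word-presence tests.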
word_dict_kv, x, train_data = decisionTreeClf()
length = len(x)
node_dict = build_tree("iris.dot")
file = open("Decision_rules.txt", "w", encoding="utf8")
for i in range(length):
    X = x[i]
    node = "0"  # the root node of the .dot graph is always "0"
    result = "\n" + train_data.loc[i].values[0] + "|"
    # Walk the sample down the tree. A condition looks like "X[12]<=0.5";
    # it is True when the word is absent (feature value 0), which is the
    # left branch, so the rule records word:False.
    while node_dict[node].left_id is not None:
        condition = node_dict[node].condition
        pattern = re.compile(r'[[](.*?)[]]')
        index = int(re.findall(pattern, condition)[0])
        if eval(condition):  # eval sees the local X defined above
            node = node_dict[node].left_id
            result = result + word_dict_kv[index] + ":" + str(False) + "|"
        else:
            node = node_dict[node].right_id
            result = result + word_dict_kv[index] + ":" + str(True) + "|"
    # A leaf's value holds the per-class sample counts; keep the rule only
    # when the leaf is pure (at most one class has samples).
    a = eval(node_dict[node].value)
    b = []
    for j in range(len(a)):
        c = int(a[j])
        if c > 0:
            b.append(c)
    if len(b) <= 1:
        file.write(result)
file.close()
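# Each line of Decision_rules.txt reads:
#   <sentence>|<word>:True|<word>:False|...
# i.e. the word-presence tests along the path to a (near-)pure leaf.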
# file=open("result.text","w",encoding="utf8")
# for word,important in zip(wordset,importances):
# file.write(word+","+str(important)+"\n")
# file.flush()
# file.close()