# 源代码和数据集已经上传,见:https://download.csdn.net/download/pengchengliu/10569142
# 代码如下:
def prehandle(path='raw_data.txt', ratio=0.8):
    """Read a tagged corpus and split it into a training and a test set.

    The file is read as UTF-8 and split on whitespace. Every token is then
    split on "/" into a ``[word, tag, ...]`` list, with two normalisations:

    * a token without any "/" gets the tag ``'w'`` appended;
    * the tag ``'x'`` is rewritten to ``'n'``.

    The split point starts at ``round(len(dataset) * ratio)`` and is moved
    forward to the next token whose word is '。', so the training set ends on
    a sentence boundary. If no such token exists after the nominal split
    point, the whole corpus goes to the training set and the test set is
    empty (the search is bounded instead of running off the end of the list).

    Args:
        path: Corpus file to read. Defaults to ``'raw_data.txt'``.
        ratio: Nominal fraction of tokens assigned to the training set.
            Defaults to ``0.8``.

    Returns:
        A ``(trainset, testset)`` tuple of lists of ``[word, tag]`` lists.
        Both lists are empty when the file contains no tokens.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding="utf-8") as f:
        tokens = f.read().split()

    dataset = []
    for token in tokens:
        element = token.split("/")
        if len(element) == 1:
            element.append('w')
        if element[1] == 'x':
            element[1] = 'n'
        dataset.append(element)

    if not dataset:
        return [], []

    # Clamp the starting point and bound the scan so that a corpus without a
    # trailing '。' cannot index past the end of the list.
    last = len(dataset) - 1
    cut = min(round(len(dataset) * ratio), last)
    while cut < last and dataset[cut][0] != '。':
        cut += 1

    return dataset[:cut + 1], dataset[cut + 1:]
def learnMordel(trainset):
    """Estimate HMM parameters from a tagged training sequence by counting.

    Args:
        trainset: Sequence of items where ``item[0]`` is the word
            (observation) and ``item[1]`` is its tag (state).

    Returns:
        A tuple ``(S, O, pai, A, B_num)``:

        * ``S``: distinct tags, in order of first appearance.
        * ``O``: distinct words, in order of first appearance.
        * ``pai``: list aligned with ``S``; relative frequency of each tag at
          a start position (index 0, or right after a token whose word is
          one of '。', ',', '?', '!'); ``0`` for tags never seen there.
        * ``A``: ``len(S) x len(S)`` nested list; ``A[i][j]`` is the count of
          transitions ``S[i] -> S[j]`` divided by the number of times
          ``S[i]`` occurs at any position except the last one (``0`` when
          the transition was never seen).
        * ``B_num``: dict keyed by ``(tag, word)`` holding the count of that
          pair divided by the total count of the tag; every ``(tag, word)``
          combination of ``S`` and ``O`` that was never seen is stored as
          ``0``.

    Side effects:
        Prints the raw start-position counts, their total, and the sizes of
        the state and observation sets.
    """
    s_num = {}      # tag -> occurrences
    o_num = {}      # word -> occurrences
    pai_num = {}    # tag -> occurrences at a start position
    A_tmp = {}      # tag -> occurrences that are followed by another token
    A_num = {}      # (previous tag, tag) -> occurrences
    B_num = {}      # (tag, word) -> occurrences

    final_index = len(trainset) - 1
    for idx, item in enumerate(trainset):
        word, tag = item[0], item[1]

        s_num[tag] = s_num.get(tag, 0) + 1
        o_num[word] = o_num.get(word, 0) + 1

        # A token opens a segment when it is first or follows punctuation.
        if idx == 0 or trainset[idx - 1][0] in ('。', ',', '?', '!'):
            pai_num[tag] = pai_num.get(tag, 0) + 1

        # The last token has no successor, so it emits no transition.
        if idx != final_index:
            A_tmp[tag] = A_tmp.get(tag, 0) + 1

        if idx != 0:
            transition = (trainset[idx - 1][1], tag)
            A_num[transition] = A_num.get(transition, 0) + 1

        emission = (tag, word)
        B_num[emission] = B_num.get(emission, 0) + 1

    print(pai_num)

    S = list(s_num)
    O = list(o_num)

    # Initial distribution.
    numbers = sum(pai_num.values())
    print(numbers)
    pai = [pai_num[tag] / numbers if tag in pai_num else 0 for tag in S]

    # Emission probabilities: normalise seen pairs, then zero-fill the rest
    # so that every (tag, word) lookup succeeds.
    for emission in B_num:
        B_num[emission] = B_num[emission] / s_num[emission[0]]
    for tag in S:
        for word in O:
            B_num.setdefault((tag, word), 0)

    # Transition matrix.
    A = []
    for src in S:
        row = []
        for dst in S:
            count = A_num.get((src, dst), 0)
            row.append(count / A_tmp[src] if count != 0 else count)
        A.append(row)

    print("状态集共有%d种状态"%len(S))
    print("观察集共有%d种状态"%len(O))
    return S, O, pai, A, B_num
def process(testset):
    """Separate a tagged sequence into its words and its tags.

    Args:
        testset: Iterable of items where ``item[0]`` is the word and
            ``item[1]`` is the tag.

    Returns:
        A ``(status, label)`` tuple: the list of words and the list of tags,
        both in input order.
    """
    # Materialise the pairs once so the input is traversed a single time.
    pairs = [(item[0], item[1]) for item in testset]
    status = [word for word, _ in pairs]
    label = [tag for _, tag in pairs]
    return status, label
def viterbi(status, S, O, pai, A, B):
    """Decode the most probable state sequence for an observation sequence.

    Args:
        status: Observation sequence (the words to tag).
        S: List of states; the result is expressed in these values.
        O: Collection of known observations. A word not contained in it is
            treated as unseen.
        pai: Initial probabilities, aligned with ``S``.
        A: Transition matrix; ``A[k][j]`` is the probability of moving from
            ``S[k]`` to ``S[j]``.
        B: Emission probabilities as a mapping keyed by ``(state, word)``.

    Returns:
        A list with one state per observation. An empty list is returned
        when ``status`` or ``S`` is empty.

    Notes:
        * For an unseen word the emission factor is ``1 / len(status)`` for
          every state, so the choice is driven by the transitions only.
        * Ties are broken in favour of the lowest state index, and a column
          of all-zero scores falls back to state index 0.
        * The final state is the one with the highest score in the last
          column. (The earlier implementation reset its running maximum on
          every iteration and therefore picked the last state with a
          non-zero score.)
    """
    if not status or not S:
        return []

    n_states = len(S)
    n_obs = len(status)

    # Membership is tested once per token; a hashed container avoids scanning
    # the whole vocabulary list each time.
    known = O if isinstance(O, (set, frozenset, dict)) else set(O)

    # First column: initial probability times emission.
    first_word = status[0]
    if first_word in known:
        prev = [pai[j] * B[(S[j], first_word)] for j in range(n_states)]
    else:
        prev = [pai[j] * 1 / n_obs for j in range(n_states)]

    # back[t - 1][j] is the best predecessor of state j at position t.
    # Storing predecessors replaces copying a full path per state and step.
    back = []
    for t in range(1, n_obs):
        word = status[t]
        word_known = word in known
        current = []
        pointers = []
        for j in range(n_states):
            best_k = 0
            best_score = 0
            for k in range(n_states):
                score = prev[k] * A[k][j]
                if score > best_score:
                    best_score = score
                    best_k = k
            if word_known:
                current.append(best_score * B[(S[j], word)])
            else:
                current.append(best_score / n_obs)
            pointers.append(best_k)
        back.append(pointers)
        prev = current

    # Pick the most probable final state, keeping the running maximum.
    final_state = 0
    final_score = 0
    for j, score in enumerate(prev):
        if score > final_score:
            final_score = score
            final_state = j

    # Walk the back-pointers from the end to the start.
    indices = [final_state]
    for pointers in reversed(back):
        final_state = pointers[final_state]
        indices.append(final_state)
    indices.reverse()

    return [S[j] for j in indices]
def accuracy(result, label):
    """Return the fraction of predictions that equal their reference label.

    Predictions and labels are compared position by position. The count of
    matches is divided by ``len(result)``; positions of ``result`` that have
    no counterpart in ``label`` count as misses.

    Args:
        result: Predicted tags.
        label: Reference tags.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``result`` is empty (instead of
        raising ZeroDivisionError).
    """
    total = len(result)
    if total == 0:
        return 0.0
    # zip stops at the shorter input, so a short label list cannot raise
    # IndexError.
    matches = sum(1 for predicted, expected in zip(result, label)
                  if predicted == expected)
    return matches / total
# ======================= Program start =======================

# ---- Step 1: preprocess the data ----
trainset, testset = prehandle()
print("成功生成训练集和测试集")

# ---- Step 2: learn the parameters of the hidden Markov model ----
S, O, pai, A, B = learnMordel(trainset)
print("参数训练完毕!")

# ---- Step 3: separate the test words from their reference tags ----
status, label = process(testset)
print("测试集处理完毕")
print("观测序列长度为:%d"%len(status))
print(status)
print("标签为")
print(label)

# A toy weather model that can be substituted for the learned one when
# checking the decoder by hand:
#   S = ['晴天', '阴天', '下雨']
#   status = ['干', '潮湿', '湿润']
#   O = ['干', '稍干', '潮湿', '湿润']
#   A = [[0.5, 0.25, 0.25], [0.375, 0.25, 0.375], [0.25, 0.125, 0.625]]
#   B = {('晴天','干'):0.6, ('晴天','稍干'):0.2, ('晴天','潮湿'):0.15, ('晴天','湿润'):0.05,
#        ('阴天','干'):0.25, ('阴天','稍干'):0.25, ('阴天','潮湿'):0.25, ('阴天','湿润'):0.25,
#        ('下雨','干'):0.05, ('下雨','稍干'):0.10, ('下雨','潮湿'):0.35, ('下雨','湿润'):0.5}
#   pai = [1, 0, 0]

# ---- Step 4: decode the test set one segment at a time ----
# A segment ends at a punctuation token or at the last token of the test set.
result = []
begin = 0
end = 0
for i, token in enumerate(status):
    if token not in ('。', ',', ';', '?', '!') and i != len(status) - 1:
        continue
    end = i
    print("要预测的下一句话为:")
    print(status[begin:end + 1])
    result1 = viterbi(status[begin:end + 1], S, O, pai, A, B)
    print("预测结果为:")
    print(result1)
    begin = end + 1
    result.extend(result1)

# ---- Step 5: evaluate the predictions ----
acc = accuracy(result, label)
print("模型正确率为:")
print(acc)
# 实验效果如下:
# 模型正确率达到:96.62%