Statistical Learning Methods: Maximum Entropy Model Implementation

Copyright notice: this is an original post by the author and may not be reproduced without permission. https://blog.csdn.net/qq_33765907/article/details/83548893

This uses a binary-classification version of MNIST. Note that, to tell apart the same value at different positions, each feature is prefixed with its position index (x0_, x1_, ...); also pay attention to what f(x, y) means. The full run takes a very long time; the maximum entropy model is better suited to small datasets.
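For intuition, here is a minimal runnable sketch (toy values, not from the original post) of the feature encoding and of what the indicator f(x, y) means:

# Toy illustration of the feature encoding used below.
example = [0, 255, 0]
encoded = [str(i) + '_' + str(v) for i, v in enumerate(example)]
print(encoded)  # ['0_0', '1_255', '2_0']
# f(x, y) = 1 iff the pair (x, y), e.g. ('1_255', 1), occurred in the training set.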

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import time
import math
import pandas as pd

from collections import defaultdict

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn

class MaxEnt(object):
	def init_params(self, X, Y):
		self.X_ = X  # X is a list of examples, each itself a list of features (2-D)
		self.Y_ = set()

		self.cal_Pxy_Px(X, Y)

		self.N = len(X)  # size of the training set
		self.n = len(self.Pxy)  # number of distinct (x, y) pairs
		self.M = 10000.0  # book p. 91: the GIS constant M; 1/M acts like a step size

		self.build_dict()
		self.cal_EPxy()

	def cal_Pxy_Px(self, X, Y):
		'''
			Book p. 82: compute the empirical joint distribution P(X, Y)
			and the empirical marginal distribution P(X).
			(Both are stored as raw counts, not frequencies.)
		'''
		# defaultdict returns the default value instead of raising KeyError when a key is missing
		self.Pxy = defaultdict(int)
		self.Px = defaultdict(int)

		for i in range(len(X)):
			x_, y = X[i], Y[i]
			self.Y_.add(y)

			for x in x_:  # iterate over the features of this example
				self.Pxy[(x, y)] += 1
				self.Px[x] += 1

	def build_dict(self):
		'''
			Assign an id to every (x, y) pair and build a bidirectional
			mapping; identical (x, y) pairs share a single id.
		'''
		self.id2xy = {}
		self.xy2id = {}

		for i, (x, y) in enumerate(self.Pxy):
			self.id2xy[i] = (x, y)
			self.xy2id[(x, y)] = i

	def fxy(self, x, y):
		'''
			Check whether the pair (x, y) was observed in training,
			i.e. whether f(x, y) = 1.
		'''
		return (x, y) in self.xy2id

	def cal_pyx(self, X, y):
		'''
			Compute the inner term of the normalizer Z in (6.23),
			which is also the numerator of (6.22).
		'''
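		# Returns exp( sum_i w_i * f_i(X, y) ); each observed pair
		# contributes w_i directly, since f_i(X, y) = 1 for it.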
		result = 0.0
		for x in X:  # X here is a single example; iterate over its features
			if self.fxy(x, y):
				fid = self.xy2id[(x, y)]
				result += self.w[fid]  # f(x, y) = 1, so no multiplication is needed
		return (math.exp(result), y)

	def cal_probability(self, X):  # X is a single example
		'''
			Compute (6.22) on book p. 85.
		'''
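		# (6.22): P_w(y|x) = exp( sum_i w_i * f_i(x, y) ) / Z_w(x),
		# where Z_w(x) = sum_y exp( sum_i w_i * f_i(x, y) ) is (6.23).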
		Pyxs = [self.cal_pyx(X, y) for y in self.Y_]
		Z = sum([prob for prob, y in Pyxs])  # the normalizing factor Z_w(X)
		return [(prob / Z, y) for prob, y in Pyxs]

	def cal_EPx(self):
		'''
			Compute the expectation at the top of book p. 83
			(the feature expectation under the model).
		'''
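		# Model expectation of each feature f_i:
		#   E_P(f_i) = sum_x P~(x) * sum_y P_w(y|x) * f_i(x, y),
		# approximating P~(x) by 1/N over the training examples.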
		self.EPx = [0.0 for i in range(self.n)]  # one entry per (x, y) pair

		for i, X in enumerate(self.X_):  # iterate over the training examples
			Pyxs = self.cal_probability(X)  # P(y|X) is defined per example: the model conditions on all features of X

			for x in X:  # iterate over the features of this example
				for Pyx, y in Pyxs:  # Pyx = P(y|X) for this example and label y
					if self.fxy(x, y):
						fid = self.xy2id[(x, y)]
						self.EPx[fid] += Pyx * (1.0 / self.N)  # the 1/N factor supplies the empirical P~(x)

	def cal_EPxy(self):
		'''
			Compute the expectation at the bottom of book p. 82
			(the empirical feature expectation).
		'''
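		# Empirical expectation: E_P~(f_i) = sum_{x,y} P~(x, y) * f_i(x, y) = count(x, y) / N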
		self.EPxy = defaultdict(float)
		for fid in range(self.n):
			(x, y) = self.id2xy[fid]
			self.EPxy[fid] = float(self.Pxy[(x, y)]) / float(self.N)

	def train(self, X, Y):
		self.init_params(X, Y)
		self.w = [0.0 for i in range(self.n)]

		max_iteration = 1000
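		# GIS-style update (book p. 91): delta_i = (1/M) * log( E_P~(f_i) / E_P(f_i) ),
		# then w_i <- w_i + delta_i.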
		for times in range(max_iteration):
			print('iteration times %d' % times)
			deltas = []
			self.cal_EPx()

			for i in range(self.n):
				delta = 1 / self.M * math.log(self.EPxy[i] / self.EPx[i])
				deltas.append(delta)

			self.w = [self.w[i] + deltas[i] for i in range(self.n)]

	def predict(self, testset):
		results = []
		for test in testset:
			result = self.cal_probability(test)
			results.append(max(result, key=lambda x: x[0])[1])  # pick the label with the highest probability
		return results

def rebuild_features(features):
	'''
		Turn each original feature vector (a0, a1, a2, a3, ...)
		into the form (0_a0, 1_a1, 2_a2, 3_a3, ...),
		because computing f(x, y) must distinguish features
		at different positions.
	'''
	new_features = []
	for feature in features:
		new_feature = []
		for i, f in enumerate(feature):  # iterate over the features of one example
			new_feature.append(str(i) + '_' + str(f))
		new_features.append(new_feature)
	return new_features

if __name__ == '__main__':
	print('Start reading data:')
	time1 = time.time()

	raw_data = pd.read_csv('data/train_binary.csv', header=0)
	data = raw_data.values

	imgs = data[:, 1:]
	labels = data[:, 0]

	train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, random_state=11111)

	train_features = rebuild_features(train_features)
	test_features = rebuild_features(test_features)
	print(len(train_features))
	print(len(test_features))
	# print(train_features[0])

	time2 = time.time()
	print('read data cost %f seconds' % (time2 - time1))

	print('Start training:')
	met = MaxEnt()
	met.train(train_features, train_labels)
	time3 = time.time()
	print('training cost %f seconds' % (time3 - time2))

	print('Start predicting:')
	test_predict = met.predict(test_features)
	time4 = time.time()
	print('predicting cost %f seconds' % (time4 - time3))

	accuracy = sum([test_labels[i] == test_predict[i] for i in range(len(test_labels))]) / len(test_labels)
	print('The accuracy is %f!' % accuracy)
'''
output:
Start reading data:
28140
13860
read data cost 42.410567 seconds
Start training:
iteration times 0
iteration times 1
iteration times 2
...(the full run takes a dozen or more hours; accuracy can exceed 97%)
'''
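With max_iteration hard-coded to 1000 and roughly 28k training examples, a full run takes many hours. As a quick sanity check (a suggestion of mine, not part of the original post), you can train on a small subsample, ideally after also lowering max_iteration inside train():

# Hypothetical quick check: a small subsample so a run finishes quickly;
# assumes max_iteration in train() has been reduced (e.g. to 100).
met_small = MaxEnt()
met_small.train(train_features[:500], train_labels[:500])
small_predict = met_small.predict(test_features[:200])
small_acc = sum(test_labels[i] == small_predict[i] for i in range(200)) / 200
print('subsample accuracy: %f' % small_acc)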
