Machine Learning in Practice: Logistic Regression in Python

Reference: Machine Learning in Action

To build a logistic regression classifier, we multiply each feature by its regression coefficient, sum the products, and feed the total into the Sigmoid function, which yields a value between 0 and 1. Any value greater than 0.5 is assigned to class 1, and any value less than 0.5 to class 0. The Sigmoid function has the form:

sigmoid(z) = 1/(1 + e^(-z))
The input z to the sigmoid can be written as:
z = w0*x0 + w1*x1 + … + wn*xn
where x0 = 1; equivalently, z = sum(wi*xi) + d (i = 1, 2, …, n), with d = w0 playing the role of a bias.
The object of our parameter optimization is w, chosen so that:
sigmoid(W·X) > 0.5 when y = 1;
sigmoid(W·X) < 0.5 when y = 0.
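
As a quick numeric check (a minimal sketch; the weights and test point below are made up for illustration, not learned values):

#decision rule demo
from numpy import array, exp, dot
w = array([1.0, 0.5, -0.3])  #hypothetical weights; w[0] is the bias term for x0 = 1
x = array([1.0, 2.0, 4.0])   #x0 = 1 prepended to the two features
z = dot(w, x)                #z = 1.0 + 1.0 - 1.2 = 0.8
print(1.0/(1 + exp(-z)))     #~0.69 > 0.5, so predict class 1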

For the underlying derivation, see the companion post: logistic regression formula derivation.

The final iterative update rule for w is:

w := w + alpha * X^T * (y - sigmoid(X*w))
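
This update is gradient ascent on the log-likelihood l(w) = sum_i [ yi*log(hi) + (1 - yi)*log(1 - hi) ], with h = sigmoid(X*w); differentiating gives the gradient X^T*(y - h), so each iteration steps a distance alpha along it. The variable error = y - h in the code below is exactly this gradient's data-dependent factor.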

#logRegres.py
#python 3.8
from numpy import *
import random  #stdlib random, used for shuffle()
import matplotlib.pyplot as plt
w0 = 1.0  #constant value of the bias feature x0, prepended to every sample

#Load samples and labels from testSet.txt; each row is x1, x2, label (see appendix)
def loadDataSet():
	dataMat = []
	labelMat = []
	for line in open('testSet.txt'):
		lineArr = line.strip().split()
		dataMat.append([w0, float(lineArr[0]), float(lineArr[1])])  #prepend x0 = 1
		labelMat.append(int(lineArr[2]))
	return dataMat, labelMat

def sigmoid(inX):
	return 1.0/(1+exp(-inX))
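
One caveat: for large negative inputs, exp(-inX) overflows float64 and NumPy emits a RuntimeWarning. A common remedy (the bound 500 below is an arbitrary safe choice of mine, not from the book) is to clip the argument:

#numerically safer variant of sigmoid
def sigmoidStable(inX):
	return 1.0/(1 + exp(-clip(inX, -500, 500)))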

#Batch gradient ascent
def gradAscent(dataMatIn, classLabels):
	dataMatrix = mat(dataMatIn)
	labelMat = mat(classLabels).transpose()
	m, n = shape(dataMatrix)  #m samples, n features
	alpha = 0.0005            #learning rate
	maxCycles = 1000          #number of iterations
	weights = ones([n, 1]) + 10.0  #initialize W; its first entry acts as the bias d
	for k in range(maxCycles):
		h = sigmoid(dataMatrix*weights)
		error = labelMat - h
		print(sum(abs(error)))  #monitor convergence: total absolute error
		print(weights.T, '\n')
		weights = weights + alpha*dataMatrix.transpose()*error  #w := w + alpha*X^T*(y - h)
	return weights
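
The update line is exactly the vectorized rule derived above. A minimal usage sketch (gradAscent returns an n x 1 numpy matrix, so .getA() converts it to a plain array for plotting):

dataMat, labelMat = loadDataSet()
weights = gradAscent(dataMat, labelMat).getA()
plotBestFit(dataMat, labelMat, weights)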

#Stochastic gradient ascent: one pass over the data, one sample per update
def stocGradAscent0(dataMatrix, classLabels):
	m, n = shape(dataMatrix)
	alpha = 0.01
	weights = ones(n)
	for i in range(m):
		h = sigmoid(sum(array(dataMatrix[i])*weights))  #scalar prediction for sample i
		error = classLabels[i] - h
		weights = weights + alpha*error*array(dataMatrix[i])
	return weights
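
A single pass over 100 samples rarely converges. A sketch of wrapping the same update in an epoch loop (stocGradAscent0Epochs and the epoch count 200 are my own illustration; stocGradAscent1 below is the refined version):

def stocGradAscent0Epochs(dataMatrix, classLabels, numEpochs=200):
	m, n = shape(dataMatrix)
	weights = ones(n)
	for _ in range(numEpochs):
		for i in range(m):
			h = sigmoid(sum(array(dataMatrix[i])*weights))
			weights = weights + 0.01*(classLabels[i] - h)*array(dataMatrix[i])
	return weights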
#Stochastic gradient ascent with shuffling and a decaying learning rate
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
	m, n = shape(dataMatrix)
	weights = ones(n)
	weights[0] = 10.0  #start the bias weight large, matching the batch versions
	dataIndex = list(range(m))
	dic = {}  #counts how often each sample triggers an update
	for j in range(numIter):
		random.shuffle(dataIndex)  #shuffle the sample order each epoch
		for i in range(m):
			alpha = 4/(1.0 + j + i) + 0.01  #decaying learning rate with a 0.01 floor
			dataX = array(dataMatrix[dataIndex[i]])
			h = sigmoid(sum(dataX*weights))
			error = classLabels[dataIndex[i]] - h
			if abs(error) > 0.25:  #skip samples already classified with enough margin
				weights = weights + alpha*error*dataX
				dic[dataIndex[i]] = dic.get(dataIndex[i], 0) + 1
		print(weights, '\n')  #monitor the weights once per epoch
	print(sorted(dic.items(), key=lambda x: x[1]))  #samples sorted by update count
	return weights
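
The schedule alpha = 4/(1 + j + i) + 0.01 shrinks the step as both the epoch j and the within-epoch position i grow, while the 0.01 floor keeps late samples from being ignored entirely. For instance, at epoch j = 0 the first update uses alpha = 4/1 + 0.01 = 4.01 and the hundredth uses 4/100 + 0.01 = 0.05; by epoch j = 49 every step is at most 4/50 + 0.01 = 0.09.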

#Mini-batch gradient ascent with a growing batch size
def gradAscent1(dataMatrix0, labelMat0, maxCycles=150):
	m, n = shape(dataMatrix0)
	dataIndex = list(range(m))
	weights = ones([n, 1]) + 10.0
	m0 = 10  #initial batch size; grows by 5 each cycle
	for k in range(maxCycles):
		if m0 > m or k == maxCycles - 1:
			m0 = m  #fall back to the full batch at the end
		random.shuffle(dataIndex)
		dataMatrix = mat([dataMatrix0[i] for i in dataIndex[:m0]])
		labelMat = mat([labelMat0[i] for i in dataIndex[:m0]]).transpose()
		alpha = 0.5/(k + 1) + 0.01  #decaying learning rate
		h = sigmoid(dataMatrix*weights)
		error = labelMat - h
		print(sum(abs(error)))  #monitor convergence on the current batch
		if sum(abs(error)) > 1:  #stop updating once the batch error is small
			weights = weights + alpha*dataMatrix.transpose()*error
		m0 += 5
	return weights
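
Growing the batch from 10 toward the full set trades cheap, noisy early gradients for accurate late ones, and the forced full-batch final cycle polishes the result. A usage sketch (this mirrors the __main__ block at the end of the file):

dataMat, labelMat = loadDataSet()
weights = gradAscent1(dataMat, labelMat, 100).getA()  #.getA(): matrix -> ndarray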

#Plot the scatter of both classes and the decision boundary
def plotBestFit(dataMat, labelMat, weights):
	dataArr = array(dataMat)
	n = shape(dataArr)[0]
	xcord1 = []; ycord1 = []
	xcord2 = []; ycord2 = []
	for i in range(n):
		if int(labelMat[i]) == 1:
			xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
		else:
			xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
	fig = plt.figure()
	ax = fig.add_subplot(111)
	ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
	ax.scatter(xcord2, ycord2, s=30, c='green')
	x = arange(-3.0, 3.0, 0.1)
	#boundary is where sigmoid = 0.5, i.e. weights[0]*w0 + weights[1]*x + weights[2]*y = 0
	y = (-w0*weights[0] - weights[1]*x)/weights[2]
	ax.plot(x, y)
	plt.xlabel('X1'); plt.ylabel('X2')
	plt.show()


#Classification function
def classify0(inX, weights):
	#inX: a test instance (with x0 = 1 prepended); weights: the trained coefficients
	inX = array(inX)
	weights = array(weights)
	prob = sigmoid(dot(inX, weights))
	if prob > 0.5:
		return 1
	else:
		return 0
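
For example, classifying one hypothetical point (remember to prepend the constant x0 = 1; whether it prints 1 or 0 depends on the weights you trained):

print(classify0([1.0, 0.4, 7.0], weights))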
#Test harness: report the classifier's error rate on a labeled set
def testClassifier(dataMat, labelMat, weights):
	testDataSize = len(dataMat)
	errorCount = 0
	for i in range(testDataSize):
		y0 = classify0(dataMat[i], weights)
		print("The result from Classifier: %d, The real label is: %d" % (y0, labelMat[i]))
		if y0 != labelMat[i]:
			errorCount += 1
	errorRate = float(errorCount)/testDataSize
	print("the error rate of this test is: %f" % errorRate)
	return errorRate

if __name__ == '__main__':
	dataMat, labelMat = loadDataSet()
	#weights = gradAscent(dataMat, labelMat).getA()
	#weights = stocGradAscent0(dataMat, labelMat)
	#weights = stocGradAscent1(dataMat, labelMat, 50)
	weights = gradAscent1(dataMat, labelMat, 100).getA()
	print(weights.T)
	plotBestFit(dataMat, labelMat, weights)

Appendix: contents of testSet.txt (each row: x1, x2, label):
-0.017612	14.053064	0
-1.395634	4.662541	1
-0.752157	6.538620	0
-1.322371	7.152853	0
0.423363	11.054677	0
0.406704	7.067335	1
0.667394	12.741452	0
-2.460150	6.866805	1
0.569411	9.548755	0
-0.026632	10.427743	0
0.850433	6.920334	1
1.347183	13.175500	0
1.176813	3.167020	1
-1.781871	9.097953	0
-0.566606	5.749003	1
0.931635	1.589505	1
-0.024205	6.151823	1
-0.036453	2.690988	1
-0.196949	0.444165	1
1.014459	5.754399	1
1.985298	3.230619	1
-1.693453	-0.557540	1
-0.576525	11.778922	0
-0.346811	-1.678730	1
-2.124484	2.672471	1
1.217916	9.597015	0
-0.733928	9.098687	0
-3.642001	-1.618087	1
0.315985	3.523953	1
1.416614	9.619232	0
-0.386323	3.989286	1
0.556921	8.294984	1
1.224863	11.587360	0
-1.347803	-2.406051	1
1.196604	4.951851	1
0.275221	9.543647	0
0.470575	9.332488	0
-1.889567	9.542662	0
-1.527893	12.150579	0
-1.185247	11.309318	0
-0.445678	3.297303	1
1.042222	6.105155	1
-0.618787	10.320986	0
1.152083	0.548467	1
0.828534	2.676045	1
-1.237728	10.549033	0
-0.683565	-2.166125	1
0.229456	5.921938	1
-0.959885	11.555336	0
0.492911	10.993324	0
0.184992	8.721488	0
-0.355715	10.325976	0
-0.397822	8.058397	0
0.824839	13.730343	0
1.507278	5.027866	1
0.099671	6.835839	1
-0.344008	10.717485	0
1.785928	7.718645	1
-0.918801	11.560217	0
-0.364009	4.747300	1
-0.841722	4.119083	1
0.490426	1.960539	1
-0.007194	9.075792	0
0.356107	12.447863	0
0.342578	12.281162	0
-0.810823	-1.466018	1
2.530777	6.476801	1
1.296683	11.607559	0
0.475487	12.040035	0
-0.783277	11.009725	0
0.074798	11.023650	0
-1.337472	0.468339	1
-0.102781	13.763651	0
-0.147324	2.874846	1
0.518389	9.887035	0
1.015399	7.571882	0
-1.658086	-0.027255	1
1.319944	2.171228	1
2.056216	5.019981	1
-0.851633	4.375691	1
-1.510047	6.061992	0
-1.076637	-3.181888	1
1.821096	10.283990	0
3.010150	8.401766	1
-1.099458	1.688274	1
-0.834872	-1.733869	1
-0.846637	3.849075	1
1.400102	12.628781	0
1.752842	5.468166	1
0.078557	0.059736	1
0.089392	-0.715300	1
1.825662	12.693808	0
0.197445	9.744638	0
0.126117	0.922311	1
-0.679797	1.220530	1
0.677983	2.556666	1
0.761349	10.693862	0
-2.168791	0.143632	1
1.388610	9.341997	0
0.317029	14.739025	0

Reposted from blog.csdn.net/qq_31541101/article/details/113142022