0.逻辑回归概念
- 目的:解决分类问题。
- 实际:将连续值通过sigmoid函数映射到[0,1]之间,该值为分类为1的概率。p>=0.5判定为类别1,否则判定为类别0。
- 是广义的线性回归。相当于把线性回归算法的目标函数值映射到[0,1]之间,进行分类。
详细概念
1. 导入鸢尾花数据集
import numpy as np
import pandas as pd

# Load the iris data set, drop duplicate rows, and encode the species
# name as an integer label (versicolor=0, setosa=1, virginica=2).
data = pd.read_csv("data/iris.csv").drop_duplicates()
label_codes = {"Iris-versicolor": 0, "Iris-setosa": 1, "Iris-virginica": 2}
data["Name"] = data["Name"].map(label_codes)
|     | SepalLength | SepalWidth | PetalLength | PetalWidth | Name |
|-----|-------------|------------|-------------|------------|------|
| 136 | 6.3 | 3.4 | 5.6 | 2.4 | 2 |
| 109 | 7.2 | 3.6 | 6.1 | 2.5 | 2 |
| 13  | 4.3 | 3.0 | 1.1 | 0.1 | 1 |
| 70  | 5.9 | 3.2 | 4.8 | 1.8 | 0 |
| 17  | 5.1 | 3.5 | 1.4 | 0.3 | 1 |
| 4   | 5.0 | 3.6 | 1.4 | 0.2 | 1 |
| 87  | 6.3 | 2.3 | 4.4 | 1.3 | 0 |
| 86  | 6.7 | 3.1 | 4.7 | 1.5 | 0 |
| 67  | 5.8 | 2.7 | 4.1 | 1.0 | 0 |
| 135 | 7.7 | 3.0 | 6.1 | 2.3 | 2 |
| 54  | 6.5 | 2.8 | 4.6 | 1.5 | 0 |
| 83  | 6.0 | 2.7 | 5.1 | 1.6 | 0 |
| 21  | 5.1 | 3.7 | 1.5 | 0.4 | 1 |
| 88  | 5.6 | 3.0 | 4.1 | 1.3 | 0 |
| 22  | 4.6 | 3.6 | 1.0 | 0.2 | 1 |
| 78  | 6.0 | 2.9 | 4.5 | 1.5 | 0 |
| 108 | 6.7 | 2.5 | 5.8 | 1.8 | 2 |
| 118 | 7.7 | 2.6 | 6.9 | 2.3 | 2 |
| 128 | 6.4 | 2.8 | 5.6 | 2.1 | 2 |
| 82  | 5.8 | 2.7 | 3.9 | 1.2 | 0 |
# Keep only classes 0 and 1 so the problem becomes binary classification.
data = data.loc[data["Name"] != 2]
2. 逻辑回归算法
class LogisticRegression:
    """Binary logistic-regression classifier trained with batch gradient
    ascent on the log-likelihood.

    Parameters
    ----------
    alpha : float
        Learning rate applied to every weight update.
    times : int
        Number of full-batch training iterations.

    Attributes
    ----------
    w_ : ndarray of shape (1 + n_features,)
        Learned weights; w_[0] is the bias, w_[1:] the feature weights.
    loss_ : list of float
        Cross-entropy loss recorded at every iteration.
    """

    def __init__(self, alpha, times):
        self.alpha = alpha
        self.times = times

    def sigmoid(self, z):
        """Map z = w.T * x into the open interval (0, 1).

        Parameters
        ----------
        z : float or ndarray
            Linear combination of weights and features.

        Returns
        -------
        p : float or ndarray
            Probability that the sample belongs to class 1; p >= 0.5
            (i.e. z >= 0) is interpreted as class 1, otherwise class 0.
        """
        return 1.0 / (1.0 + np.exp(-z))

    def fit(self, X, y):
        """Fit the model on the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training feature matrix.
        y : array-like of shape (n_samples,)
            Binary class labels (0 or 1).
        """
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        self.w_ = np.zeros(1 + self.X.shape[1])
        self.loss_ = []
        for _ in range(self.times):
            # Fix: use the converted arrays consistently — the original
            # assigned self.X / self.y and then kept using the raw X / y.
            z = np.dot(self.X, self.w_[1:]) + self.w_[0]
            p = self.sigmoid(z)
            # Fix: clip probabilities so the loss never evaluates log(0);
            # saturated probabilities previously produced inf/nan entries
            # in loss_. The unclipped p is still used for the gradient.
            p_safe = np.clip(p, 1e-15, 1 - 1e-15)
            cost = -np.sum(self.y * np.log(p_safe)
                           + (1 - self.y) * np.log(1 - p_safe))
            self.loss_.append(cost)
            # Gradient-ascent step on the log-likelihood.
            self.w_[0] += self.alpha * np.sum(self.y - p)
            self.w_[1:] += self.alpha * np.dot(self.X.T, self.y - p)

    def predict_proba(self, X):
        """Return per-class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, 2)
            Column 0 is P(class 0), column 1 is P(class 1); rows sum to 1.
        """
        X = np.asarray(X)
        z = np.dot(X, self.w_[1:]) + self.w_[0]
        p = self.sigmoid(z).reshape(-1, 1)
        return np.concatenate([1 - p, p], axis=1)

    def predict(self, X):
        """Return the predicted class label (0 or 1) for each sample."""
        return np.argmax(self.predict_proba(X), axis=1)
3.数据切分
# Shuffle each class separately (fixed seed keeps the split reproducible).
# sample(frac=1) draws every row, i.e. a full shuffle.
t1 = data[data["Name"] == 0].sample(frac=1, random_state=666)
t2 = data[data["Name"] == 1].sample(frac=1, random_state=666)
# The first 40 rows of each class form the training set; the rest is test.
X_train = pd.concat([t1.iloc[:40, :-1], t2.iloc[:40, :-1]], axis=0)
y_train = pd.concat([t1.iloc[:40, -1], t2.iloc[:40, -1]], axis=0)
X_test = pd.concat([t1.iloc[40:, :-1], t2.iloc[40:, :-1]], axis=0)
y_test = pd.concat([t1.iloc[40:, -1], t2.iloc[40:, -1]], axis=0)
4.逻辑回归进行二分类
# Train the binary classifier on the 80-sample training split.
reg = LogisticRegression(alpha=0.001, times=20)
reg.fit(X_train, y_train)
# Class-membership probabilities for the held-out test samples.
reg.predict_proba(X_test)
array([[0.70002701, 0.29997299],
[0.83689443, 0.16310557],
[0.79590141, 0.20409859],
[0.76391151, 0.23608849],
[0.7577496 , 0.2422504 ],
[0.79310386, 0.20689614],
[0.74918987, 0.25081013],
[0.75201187, 0.24798813],
[0.81234813, 0.18765187],
[0.77947483, 0.22052517],
[0.30889335, 0.69110665],
[0.28541285, 0.71458715],
[0.31146719, 0.68853281],
[0.29001408, 0.70998592],
[0.33239525, 0.66760475],
[0.30123427, 0.69876573],
[0.28658295, 0.71341705],
[0.26853876, 0.73146124]])
result = reg.predict(X_test)
# Accuracy on the test split: proportion of predictions matching the labels.
np.sum(result == y_test) / len(y_test)
1.0
5. 可视化展示
import matplotlib as mpl
import matplotlib.pyplot as plt

# Use a CJK-capable font and keep the minus sign rendering correctly.
mpl.rcParams["font.family"] = "SimHei"
mpl.rcParams["axes.unicode_minus"] = False

# Overlay predicted and true class labels for every test sample.
plt.plot(result, "ro", ms=15, label="预测值")
plt.plot(y_test.values, "go", label="真实值")
plt.title("逻辑回归")
plt.xlabel("样本序号")
plt.ylabel("类别")
plt.legend()
<matplotlib.legend.Legend at 0x205636e7488>
# Training loss per iteration — should trend downward as the model converges.
plt.plot(range(1, reg.times + 1), reg.loss_, "go-")
[<matplotlib.lines.Line2D at 0x2056367a408>]