# Python机器学习——分类算法

#感知器逻辑：一个二值分类问题，两个类别分别记为1（正类别）和-1（负类别）。定义净输入z=w·x（w为权值向量，x为输入向量），激励函数在z大于等于阈值时输出1（正类别），否则输出-1（负类别）。
#用Python实现感知器学习算法。步骤：1、将权重初始化为0或一个极小的随机数 2、迭代所有训练样本，计算出输出值Y,更新权重。
import numpy as np
class Perceptron(object):
    """Perceptron binary classifier for targets encoded as 1 and -1.

    Parameters
    ----------
    eta : float
        Learning rate used to scale each weight update.
    n_iter : int
        Number of passes (epochs) over the training set.

    Attributes
    ----------
    w_ : numpy.ndarray
        Weights after fitting; ``w_[0]`` is the bias unit and ``w_[1:]``
        holds one weight per feature.
    errors_ : list of int
        Number of samples that triggered a weight update in each epoch.
    """

    def __init__(self, eta=0.01, n_iter=10):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, x, y):
        """Learn the weights from training data.

        Parameters
        ----------
        x : numpy.ndarray, 2-D (indexed as ``x.shape[1]`` features)
            Training samples, one row per sample.
        y : iterable
            Target labels, iterated in lockstep with the rows of ``x``.

        Returns
        -------
        self
        """
        # One extra slot at index 0 for the bias unit.
        self.w_ = np.zeros(1 + x.shape[1])
        self.errors_ = []
        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(x, y):
                # update is zero whenever the sample is classified correctly.
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                # Only the bias unit receives the raw update; slicing with
                # [0:] here would add it to every weight.
                self.w_[0] += update
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self

    def net_input(self, x):
        """Return the weighted sum of the inputs plus the bias unit."""
        return np.dot(x, self.w_[1:]) + self.w_[0]

    def predict(self, x):
        """Return 1 where the net input is >= 0.0, otherwise -1."""
        return np.where(self.net_input(x) >= 0.0, 1, -1)

#基于鸢尾花数据集训练感知器模型，Iris-setosa山鸢尾。versicolor为变色鸢尾

import pandas as pd
# Download the Iris dataset from the UCI repository; the file has no header row.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
# Last five rows, as a sanity check that the data loaded; the value is only
# displayed when this runs in an interactive session or notebook.
df.tail()

#提取100个训练样本，1代表变色鸢尾 -1代表山鸢尾。提取训练样本中第一特征列（萼片长度）和第三特征列（花瓣长度）赋值给矩阵X。
import matplotlib.pyplot as plt
#import numpy as np
# First 100 rows: the class label is in column 4.
y = df.iloc[0:100, 4].values
# Encode Iris-setosa as -1 and every other label in this slice as 1.
y = np.where(y == 'Iris-setosa', -1, 1)
# Two features: column 0 (sepal length) and column 2 (petal length).
x = df.iloc[0:100, [0, 2]].values

# Rows 0-49 and rows 50-99 are drawn as two separate series.
plt.scatter(x[:50, 0], x[:50, 1], color='red', marker='o', label='setosa')
plt.scatter(x[50:100, 0], x[50:100, 1], color='blue', marker='x', label='versicolor')
# The x axis carries column 0 (sepal length) and the y axis column 2
# (petal length); the original labels were swapped.
plt.xlabel('sepal length')
plt.ylabel('petal length')
plt.legend(loc='upper left')
plt.show()

#Epochs迭代次数 Number of misclassifications错误分类样本数量。
# Train the perceptron and plot how many samples triggered an update per epoch.
ppn = Perceptron(eta=0.1, n_iter=10)
ppn.fit(x, y)
plt.plot(range(1, len(ppn.errors_) + 1), ppn.errors_, marker='o')
plt.xlabel('Epochs')
plt.ylabel('Number of misclassifications')
plt.show()

#（图：感知器每轮迭代的错误分类样本数量）
from matplotlib.colors import ListedColormap #ListedColormap定义颜色和标记符号
def plot_decision_regions(x, y, classifier, resolution=0.02):
    """Draw a classifier's decision regions over two features and overlay the samples.

    Parameters
    ----------
    x : numpy.ndarray
        Samples; columns 0 and 1 are used as the horizontal and vertical axes.
    y : numpy.ndarray
        Class label per sample; at most five distinct labels are supported
        because only five markers and colors are defined.
    classifier : object
        Any object with a ``predict`` method that accepts a 2-column array.
    resolution : float
        Step of the grid on which the classifier is evaluated.
    """
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Pad the plotted area by one unit around the data on both axes.
    x1_min, x1_max = x[:, 0].min() - 1, x[:, 0].max() + 1
    x2_min, x2_max = x[:, 1].min() - 1, x[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    # Classify every grid point, then fold the flat predictions back
    # into the grid shape for contourf.
    z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    z = z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(classes):
        # A single RGBA colour is wrapped in a list (one-row 2-D sequence) so
        # scatter cannot mistake the 4-tuple for per-point values to colormap.
        plt.scatter(x=x[y == cl, 0], y=x[y == cl, 1],
                    alpha=0.8, c=[cmap(idx)],
                    marker=markers[idx], label=cl)

# Decision regions of the trained perceptron over the two selected features.
plot_decision_regions(x, y, classifier=ppn)
plt.xlabel('sepal length [cm]')
plt.ylabel('petal length [cm]')
plt.legend(loc='upper left')
plt.show()
#（图：感知器在鸢尾花数据上的决策区域）
#使用Python实现自适应线性神经元
class AdalineGD(object):
    """ADAptive LInear NEuron classifier trained with batch gradient descent.

    Parameters
    ----------
    eta : float
        Learning rate.
    n_iter : int
        Number of passes (epochs) over the training set.

    Attributes
    ----------
    w_ : numpy.ndarray
        Weights after fitting; ``w_[0]`` is the bias unit.
    cost_ : list
        Sum-of-squared-errors cost (divided by 2) recorded in each epoch.
    """

    def __init__(self, eta=0.01, n_iter=50):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, x, y):
        """Fit the weights on the whole training set, one update per epoch.

        Parameters
        ----------
        x : numpy.ndarray, 2-D
            Training samples, one row per sample.
        y : numpy.ndarray
            Target values, one per sample.

        Returns
        -------
        self
        """
        self.w_ = np.zeros(1 + x.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            output = self.net_input(x)
            errors = y - output
            # Feature weights move along x^T . errors; the bias unit
            # along the plain sum of errors.
            self.w_[1:] += self.eta * x.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            # The cost uses the errors computed before this epoch's update.
            cost = (errors ** 2).sum() / 2.0
            self.cost_.append(cost)
        return self

    def net_input(self, x):
        """Return the weighted sum of the inputs plus the bias unit."""
        return np.dot(x, self.w_[1:]) + self.w_[0]

    def activation(self, x):
        """Return the linear activation, which equals the net input."""
        return self.net_input(x)

    def predict(self, x):
        """Return 1 where the activation is >= 0.0, otherwise -1."""
        return np.where(self.activation(x) >= 0.0, 1, -1)

#学习速率=0.01时速率太大，并没有使代价函数的值尽可能的低。学习速率=0.0001时速率太小为了达到算法收敛目标，需要更多迭代。
# Compare the cost per epoch for two learning rates, side by side.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))

# Left panel: eta=0.01, cost shown on a log10 scale.
ada1 = AdalineGD(n_iter=10, eta=0.01).fit(x, y)
ax[0].plot(range(1, len(ada1.cost_) + 1), np.log10(ada1.cost_), marker='o')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('log(sum-squares-error)')
ax[0].set_title('adalin-leaning rate 0.01')

# Right panel: eta=0.0001, raw cost. The original drew this same curve
# twice; it is plotted once here.
ada2 = AdalineGD(n_iter=10, eta=0.0001).fit(x, y)
ax[1].plot(range(1, len(ada2.cost_) + 1), ada2.cost_, marker='o')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('sum-squared-error')
ax[1].set_title('adalin-leaning rate 0.0001')
plt.show()
#（图：两种学习速率下Adaline代价函数随迭代次数的变化）
#将X值标准化
# Standardize the two feature columns: subtract each column's mean and
# divide by its standard deviation, writing into a copy so x is untouched.
x_std = np.array(x, copy=True)
for col in (0, 1):
    feature = x[:, col]
    x_std[:, col] = (feature - feature.mean()) / feature.std()

#标准化操作后，代价函数逐步减小。
# Train Adaline on the standardized features and plot its decision regions.
ada = AdalineGD(n_iter=15, eta=0.01)
ada.fit(x_std, y)
plot_decision_regions(x_std, y, classifier=ada)
plt.title('Adaline - Gradient Descent')
plt.xlabel('sepel length [standardized]')
plt.ylabel('petal length [standardized]')
plt.legend(loc='upper left')
plt.show()

# Cost per epoch for the same model.
plt.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o')
plt.xlabel('Epochs')
plt.ylabel('sum-squared-error')
# The original referenced plt.show without calling it, so nothing was displayed.
plt.show()
#（图：标准化后Adaline的决策区域及代价函数随迭代次数的变化）
#假如是一个含有几百万条数据的巨大数据集，上面的方法每一步都需要对整个数据集进行评估，计算成本非常高。
#优化算法：随机梯度下降
from numpy.random import seed
class AdalineSGD(object):
    """ADAptive LInear NEuron classifier trained with stochastic gradient descent.

    Parameters
    ----------
    eta : float
        Learning rate.
    n_iter : int
        Number of passes (epochs) over the training set.
    shuffle : bool
        If True, reorder the training data before every epoch.
    random_state : int or None
        If not None, seeds NumPy's global random generator so the
        shuffling is reproducible.

    Attributes
    ----------
    w_ : numpy.ndarray
        Weights; ``w_[0]`` is the bias unit.
    cost_ : list
        Average per-sample cost recorded in each epoch of ``fit``.
    w_initialized : bool
        Whether the weights have been created yet.
    """

    def __init__(self, eta=0.01, n_iter=10, shuffle=True, random_state=None):
        self.eta = eta
        self.n_iter = n_iter
        self.w_initialized = False
        self.shuffle = shuffle
        # Compare against None so that a seed of 0 is honoured as well.
        if random_state is not None:
            seed(random_state)

    def fit(self, x, y):
        """Reset the weights and train for ``n_iter`` epochs.

        Parameters
        ----------
        x : numpy.ndarray, 2-D
            Training samples, one row per sample.
        y : numpy.ndarray
            Target values, one per sample.

        Returns
        -------
        self
        """
        self._initialize_weights(x.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            if self.shuffle:
                x, y = self._shuffle(x, y)
            cost = []
            for xi, target in zip(x, y):
                cost.append(self._update_weights(xi, target))
            avg_cost = sum(cost) / len(y)
            self.cost_.append(avg_cost)
        return self

    def partial_fit(self, x, y):
        """Update the weights with new data without resetting them.

        Weights are created on the first call only. ``y`` must provide
        ``ravel()``; when it holds more than one target, ``x`` and ``y`` are
        consumed sample by sample, otherwise they are used as a single sample.

        Returns
        -------
        self
        """
        if not self.w_initialized:
            self._initialize_weights(x.shape[1])
        # This check is independent of the initialization above, so multi-sample
        # batches are handled on every call, not only the first one.
        if y.ravel().shape[0] > 1:
            for xi, target in zip(x, y):
                self._update_weights(xi, target)
        else:
            self._update_weights(x, y)
        return self

    def _shuffle(self, x, y):
        """Return ``x`` and ``y`` reordered by one shared random permutation."""
        r = np.random.permutation(len(y))
        return x[r], y[r]

    def _initialize_weights(self, m):
        """Create a zero weight vector for ``m`` features plus the bias unit."""
        self.w_ = np.zeros(1 + m)
        self.w_initialized = True

    def _update_weights(self, xi, target):
        """Apply the Adaline rule to one sample and return its cost."""
        output = self.net_input(xi)
        error = target - output
        self.w_[1:] += self.eta * xi.dot(error)
        self.w_[0] += self.eta * error
        cost = 0.5 * error ** 2
        return cost

    def net_input(self, x):
        """Return the weighted sum of the inputs plus the bias unit."""
        return np.dot(x, self.w_[1:]) + self.w_[0]

    def activation(self, x):
        """Return the linear activation, which equals the net input."""
        return self.net_input(x)

    def predict(self, x):
        """Return 1 where the activation is >= 0.0, otherwise -1."""
        return np.where(self.activation(x) >= 0.0, 1, -1)

# Train the stochastic-gradient Adaline on the standardized features and
# plot its decision regions.
ada = AdalineSGD(n_iter=15, eta=0.01, random_state=1)
ada.fit(x_std, y)
plot_decision_regions(x_std, y, classifier=ada)
plt.title('Adaline - stochaastic')
plt.xlabel('sepel length [standardized]')
plt.ylabel('petal length [standardized]')
plt.legend(loc='upper left')
plt.show()

# Average cost per epoch for the same model.
plt.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o')
plt.xlabel('Epochs')
plt.ylabel('Average cost')
# The original referenced plt.show without calling it, so nothing was displayed.
plt.show()

#（图：随机梯度下降Adaline的决策区域及平均代价随迭代次数的变化）

python机器学习--分类算法

猜你喜欢