Machine Learning in Python

Preface

This post records Python implementations of several machine learning algorithms, such as linear regression, logistic regression, and k-means. It follows Andrew Ng's machine learning course; for the full material, see the Machine Learning course on Coursera.


1. Linear Regression

1.1 Linear Regression with a Single Feature

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## compute the cost J(theta)
def computeCost(X,y,theta):
    m=len(X)
    J=np.sum(np.power(X*theta-y,2))/(2*m)
    return J

## batch gradient descent
def gradientDescent(X,y,theta,alpha,iters):
    m=len(X)
    cost=np.zeros(iters)
    for i in range(iters):
        temp=((X * theta - y).T * X * alpha / m).T
        theta=np.subtract(theta,temp)
        cost[i]=computeCost(X,y,theta)
    return theta,cost

path=r'E:\python\MyMachineLearning\exp1\ex1data1.txt'
data=pd.read_csv(path,header=None,names=['Population','Profit'])
## insert the intercept column x0 = 1
data.insert(0,'ones',1)
cols=data.shape[1]
X=data.iloc[:,:-1]
y=data.iloc[:,cols-1:cols]
## convert to matrices for the matrix algebra below
X=np.matrix(X.values)
y=np.matrix(y.values)
## initialize the parameters
theta=np.matrix(np.array([0,0])).T
alpha=0.01
max_iters=1500
theta,cost=gradientDescent(X,y,theta,alpha,max_iters)
## plot the fitted regression line (f, so the target matrix y is not shadowed)
x=np.linspace(data.Population.min(),data.Population.max())
f=theta[0,0]+theta[1,0]*x
figure,axis=plt.subplots()
axis.plot(x,f,'r',label='Prediction')
axis.scatter(x=data['Population'],y=data['Profit'],label='Training Data')
axis.set_xlabel('Population')
axis.set_ylabel('Profit')
plt.show()
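
With the learned theta, predicting the profit for a new city is just theta0 + theta1 * population. A quick usage example (the exact number depends on the fitted theta):

## e.g. predicted profit for a population of 70,000 (the data is in units of 10,000 people / $10,000)
population=7.0
print(theta[0,0]+theta[1,0]*population)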

1.2 Linear Regression with Multiple Features

Very similar to the code above; only the number of features per sample has increased.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def computeCost(X,y,theta):
    m=len(X)
    J=np.sum(np.power(X*theta-y,2))/(2*m)
    return J

def gradientDescent(X,y,theta,alpha,iters):
    m=len(X)
    cost=np.zeros(iters)
    for i in range(iters):
        temp=(X*theta-y).T*X*alpha/m
        theta=np.subtract(theta,temp.T)
        cost[i]=computeCost(X,y,theta)
    return theta,cost

file=r'E:\python\MyMachineLearning\exp1\ex1data2.txt'
data=pd.read_csv(file,names=["size",'bedroom','price'],header=None)
# feature normalization (z-score)
data=(data-data.mean())/data.std()
data.insert(0,'ones',1)
cols=data.shape[1]
X=data.iloc[:,:-1]
y=data.iloc[:,cols-1:cols]
X=np.matrix(X.values)
y=np.matrix(y.values)
theta=np.matrix(np.array([0,0,0])).T
alpha=0.01
iters=1500
theta,cost=gradientDescent(X,y,theta,alpha,iters)
figure,axis=plt.subplots()
x=range(1,iters+1)
plt.plot(x,cost,color='red')
plt.show()

1.3 Normal Equation

theta = (X^T X)^(-1) X^T y

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def normalEquation(X,y):
    ## compute the inverse of X^T X
    temp=np.linalg.inv(X.T*X)
    theta=temp*X.T*y
    return theta
def computeCost(X,y,theta):
    m=len(X)
    J=np.sum(np.power(X*theta-y,2))/(2*m)
    return J

file=r'E:\python\MyMachineLearning\exp1\ex1data2.txt'
data=pd.read_csv(file,names=['Size','Bedroom','Price'])
data=(data-data.mean())/data.std()
data.insert(0,'ones',1)
cols=data.shape[1]
X=data.iloc[:,:-1]
y=data.iloc[:,cols-1:cols]
X=np.matrix(X.values)
y=np.matrix(y.values)
theta=normalEquation(X,y)
cost=computeCost(X,y,theta)
print(cost)

2. Logistic Regression

2.1 plot data

import pandas as pd
import matplotlib.pyplot as plt
file=r'E:\python\MyMachineLearning\exp2\ex2data1.txt'
data=pd.read_csv(file,names=['exam1','exam2','Accepted'])
positive=data[data['Accepted'].isin([1])]
negative=data[data['Accepted'].isin([0])]
fig,ax=plt.subplots()
ax.scatter(positive['exam1'],positive['exam2'],c='b',marker='o',label='Accepted')
ax.scatter(negative['exam1'],negative['exam2'],c='r',marker='x',label='Not Accepted')
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()

2.2 Logistic Regression with a Single Feature
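
The original post leaves this subsection empty; as a placeholder, here is a minimal sketch of basic binary logistic regression on the ex2data1 set plotted above (unregularized, optimized with scipy's TNC solver):

import numpy as np
import pandas as pd
import scipy.optimize as opt

def sigmoid(z):
    return 1/(1+np.exp(-z))

## cross-entropy cost
def cost(theta,X,y):
    first=np.multiply(-y,np.log(sigmoid(X@theta)))
    second=np.multiply(1-y,np.log(1-sigmoid(X@theta)))
    return np.sum(first-second)/len(X)

## gradient of the cost with respect to theta
def gradient(theta,X,y):
    return X.T@(sigmoid(X@theta)-y)/len(X)

file=r'E:\python\MyMachineLearning\exp2\ex2data1.txt'
data=pd.read_csv(file,names=['exam1','exam2','Accepted'])
data.insert(0,'ones',1)
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values
theta=np.zeros(X.shape[1])
res=opt.minimize(fun=cost,x0=theta,args=(X,y),method='TNC',jac=gradient)
print(res.x)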

2.3 Logistic Regression with Multiple Features

import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.io import loadmat


def sigmoid(z):
    return 1/(1+np.exp(-z))

def computeCost(theta,X,y,lambda_i):
    X=np.matrix(X)
    y=np.matrix(y)
    theta=np.matrix(theta)
    m=len(X)
    error=(-y.T)*np.log(sigmoid(X*theta.T))-(1-y.T)*np.log(1-sigmoid(X*theta.T))
    ## do not regularize the intercept term theta_0
    reg=np.sum(np.power(theta[:,1:theta.shape[1]],2))*lambda_i/(2*m)
    ## error is a 1x1 matrix; np.sum collapses it to the scalar the optimizer expects
    return np.sum(error)/m+reg

def gradient(theta,X,y,lambda_i):
    X=np.matrix(X)
    y=np.matrix(y)
    theta=np.matrix(theta)
    m=len(X)
    error=sigmoid(X*theta.T)-y

    grad=((X.T*error)/m).T+(lambda_i*theta)/m
    ## the intercept gradient is not regularized
    grad[0,0]=np.sum(error.T*X[:,0])/m
    return np.array(grad).ravel()

def one_vs_all(X,y,num_labels,lambda_i):
    rows=X.shape[0]
    params=X.shape[1]
    all_theta=np.zeros((num_labels,params+1))
    X=np.insert(X,0,values=np.ones(rows),axis=1)

    for i in range(1,num_labels+1):
        theta=np.zeros(params+1)
        y_i=np.array([1 if label==i else 0 for label in y])
        y_i=np.reshape(y_i,(rows,1))

        fmin=minimize(fun=computeCost,x0=theta,args=(X,y_i,lambda_i),method='TNC',jac=gradient)
        all_theta[i-1,:]=fmin.x
    return all_theta

file=r'E:\python\MyMachineLearning\exp3\ex3data1.mat'
data=loadmat(file)
X=data['X']
y=data['y']
all_theta=one_vs_all(data['X'],data['y'],10,1)
print(all_theta)
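
To sanity-check the ten classifiers, a prediction helper can pick, for each sample, the class whose hypothesis is most confident (a sketch; predict_all is not part of the original post):

def predict_all(X,all_theta):
    rows=X.shape[0]
    ## add the intercept column
    X=np.insert(X,0,values=np.ones(rows),axis=1)
    X=np.matrix(X)
    all_theta=np.matrix(all_theta)
    ## probability of every class for each sample
    h=sigmoid(X*all_theta.T)
    ## argmax is 0-based while the labels run 1..10
    return np.argmax(h,axis=1)+1

y_pred=predict_all(data['X'],all_theta)
accuracy=np.mean(np.array(y_pred)==data['y'])
print('accuracy = {0}%'.format(accuracy*100))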

3. Neural Network

3.1 plot data

import numpy as np
from scipy.io import loadmat
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt


file=r'E:\python\MyMachineLearning\exp4\ex4data1.mat'
data=loadmat(file)
X=data['X']
y=data['y']
sample_idx=np.random.choice(np.arange(X.shape[0]),100)
sample_img=data['X'][sample_idx,:]
fig,ax=plt.subplots(nrows=10,ncols=10,sharey=True,sharex=True)
for r in range(10):
    for c in range(10):
        ax[r,c].matshow(np.array(sample_img[10*r+c].reshape((20,20))).T,cmap=matplotlib.cm.binary) ## show each sample as a grayscale image
        ## hide the x and y tick marks
        plt.xticks(np.array([]))
        plt.yticks(np.array([]))
plt.show()

3.2 Neural Network

Here the network's parameter vector theta contains both the weights w and the biases b.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from scipy.io import loadmat
from sklearn.preprocessing import OneHotEncoder
from scipy.optimize import minimize

file=r'E:\python\MyMachineLearning\exp4\ex4data1.mat'
data=loadmat(file)
X=data['X']
y=data['y']
weight = loadmat(r"E:\python\MyMachineLearning\exp4\ex4weights.mat")
theta1, theta2 = weight['Theta1'], weight['Theta2']


def sigmoid(z):
    return 1/(1+np.exp(-z))

## forward propagation
def forward_propagate(X,theta1,theta2):
    m=X.shape[0]
    a1=np.insert(X,0,values=np.ones(m),axis=1)
    z2=a1*theta1.T
    a2=np.insert(sigmoid(z2),0,np.ones(m),axis=1)
    z3=a2*theta2.T
    h=sigmoid(z3)
    return a1,z2,a2,z3,h

def computeCost(theta1,theta2,input_size,hidden_size,num_labels,X,y,lambda_i):
    m=X.shape[0]
    X=np.matrix(X)
    y=np.matrix(y)

    ## forward pass
    a1,z2,a2,z3,h=forward_propagate(X,theta1,theta2)
    ## element-wise cross-entropy over the corresponding output units
    error=np.multiply(-y,np.log(h))-np.multiply(1-y,np.log(1-h))
    reg=(lambda_i*1.0/(2*m))*(np.sum(np.power(theta1[:,1:],2))+np.sum(np.power(theta2[:,1:],2)))

    return np.sum(error)/m+reg


def sigmoid_gradient(z):
    return np.multiply(sigmoid(z),(1-sigmoid(z)))


## backpropagation
def backprop(params,input_size,hidden_size,num_labels,X,y):
    m=X.shape[0]
    X=np.matrix(X)
    y=np.matrix(y)
    ## reshape the flat parameter vector back into theta1 and theta2
    theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):],(num_labels,(hidden_size+1))))

    a1,z2,a2,z3,h=forward_propagate(X,theta1,theta2)

    J=0
    delta1=np.zeros(theta1.shape)
    delta2=np.zeros(theta2.shape)

    error=np.multiply((-y),np.log(h))-np.multiply((1-y),np.log(1-h))
    J=np.sum(error)/m
    ## backpropagate the errors
    delta3=h-y
    delta2=delta3*theta2
    delta2=delta2[:,1:]
    delta2=np.multiply(delta2,sigmoid_gradient(z2))

    Delta1 = np.zeros(theta1.shape)
    Delta2 = np.zeros(theta2.shape)
    Delta2+=delta3.T*a2
    Delta1+=delta2.T*a1
    Delta2=Delta2/m
    Delta1=Delta1/m

    return J,Delta1,Delta2


def backpropReg(params,input_size,hidden_size,num_labels,X,y,lambda_i):
    m=X.shape[0]
    X=np.matrix(X)
    y=np.matrix(y)
    theta1=np.matrix(np.reshape(params[:(hidden_size)*(input_size+1)],(hidden_size,(input_size+1))))
    theta2=np.matrix(np.reshape(params[(hidden_size)*(input_size+1):],(num_labels,(hidden_size+1))))
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
    J = 0
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)

    error = np.multiply((-y), np.log(h)) - np.multiply((1 - y), np.log(1 - h))
    reg = (lambda_i * 1.0 / (2 * m)) * (np.sum(np.power(theta1[:, 1:], 2)) + np.sum(np.power(theta2[:, 1:], 2)))
    J = np.sum(error) / m+reg

    delta3=h-y
    delta2=delta3*theta2
    delta2=delta2[:,1:]
    delta2=np.multiply(delta2,sigmoid_gradient(z2))
    Delta1 = np.zeros(theta1.shape)
    Delta2 = np.zeros(theta2.shape)
    Delta2 += delta3.T * a2
    Delta1 += delta2.T * a1
    Delta2 = Delta2 / m
    Delta1 = Delta1 / m
    Delta1[:, 1:] = Delta1[:, 1:] + (theta1[:, 1:] * lambda_i) / m
    Delta2[:, 1:] = Delta2[:, 1:] + (theta2[:, 1:] * lambda_i) / m
    ## flatten the two gradient matrices into a single vector
    grad = np.concatenate((np.ravel(Delta1), np.ravel(Delta2)))
    return J,grad

## one-hot encode the labels (each label becomes a 10-dimensional row)
encoder=OneHotEncoder(sparse=False)
y_onehot=encoder.fit_transform(y)
input_size = 400
hidden_size = 25
num_labels = 10
lambda_i=1
## randomly initialize the full parameter vector; subtracting 0.5 keeps the mean at 0
params = (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - 0.5) * 0.24
## alternatively, initialize theta1 and theta2 separately, e.g.
# theta1=np.random.random(size=hidden_size * (input_size + 1))*(2*0.12)-0.12

fmin=minimize(fun=backpropReg,x0=params,args=(input_size,hidden_size,num_labels,X,y_onehot,lambda_i),method='TNC',jac=True,options={'maxiter':250})
print(fmin)
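
To evaluate the trained network, the optimized vector fmin.x can be reshaped back into theta1 and theta2 and pushed through forward_propagate (a sketch added for illustration):

X_mat=np.matrix(X)
theta1=np.matrix(np.reshape(fmin.x[:hidden_size*(input_size+1)],(hidden_size,input_size+1)))
theta2=np.matrix(np.reshape(fmin.x[hidden_size*(input_size+1):],(num_labels,hidden_size+1)))
a1,z2,a2,z3,h=forward_propagate(X_mat,theta1,theta2)
## pick the most probable class; +1 because the labels run 1..10
y_pred=np.array(np.argmax(h,axis=1)+1)
accuracy=np.mean(y_pred==y)
print('accuracy = {0}%'.format(accuracy*100))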

4. Cross-Validation Error

The algorithm used in this code is linear regression; the helper functions the snippet calls are sketched right below.
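
The script calls linear_regression, computeCost, and computeCostReg, which the original post does not show. A minimal sketch of what they might look like, following the standard regularized linear-regression setup from the course exercise (the author's originals may differ):

import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import matplotlib.pyplot as plt

## unregularized squared-error cost
def computeCost(theta, X, y):
    m = X.shape[0]
    return np.sum(np.power(X @ theta - y, 2)) / (2 * m)

## regularized cost; theta_0 is not regularized
def computeCostReg(theta, X, y, lambda_i):
    m = X.shape[0]
    reg = (lambda_i / (2 * m)) * np.sum(np.power(theta[1:], 2))
    return computeCost(theta, X, y) + reg

## gradient of the regularized cost
def gradientReg(theta, X, y, lambda_i):
    m = X.shape[0]
    grad = X.T @ (X @ theta - y) / m
    grad[1:] = grad[1:] + (lambda_i / m) * theta[1:]
    return grad

## fit regularized linear regression; the returned object's .x attribute is theta
def linear_regression(X, y, lambda_i):
    theta = np.ones(X.shape[1])
    return opt.minimize(fun=computeCostReg, x0=theta, args=(X, y, lambda_i), method='TNC', jac=gradientReg)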

file=r'E:\python\MyMachineLearning\exp5\ex5data1.mat'
data=sio.loadmat(file)
X=data['X']
y=data['y']
X, y, Xval, yval, Xtest, ytest = map(np.ravel,[data['X'], data['y'], data['Xval'], data['yval'], data['Xtest'], data['ytest']])
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]

theta=np.ones(X.shape[1])
lambda_i=1
training_cost, cv_cost = [], []
## total number of training examples
m=X.shape[0]
## track how the training error and the cross-validation error change as the number of samples grows
for i in range(1,m+1):
    res = linear_regression(X[:i, :], y[:i], 0)

    tc = computeCostReg(res.x, X[:i, :], y[:i], 0)
    cv = computeCostReg(res.x, Xval, yval, 0)

    training_cost.append(tc)
    cv_cost.append(cv)
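
Plotting these two curves against the training-set size gives the usual learning curve; a minimal sketch:

fig, ax = plt.subplots()
ax.plot(np.arange(1, m + 1), training_cost, label='training cost')
ax.plot(np.arange(1, m + 1), cv_cost, label='cv cost')
ax.legend()
plt.show()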


## search for the best lambda_i
## the candidate values grow roughly threefold at each step
l_candidate=[0,0.001,0.003,0.01,0.03,0.1,0.3,1,3,10]
## reset the lists that were filled by the learning-curve loop above
training_cost, cv_cost = [], []
for l in l_candidate:
    res = linear_regression(X, y, l)

    tc = computeCost(res.x, X, y)
    cv = computeCost(res.x, Xval, yval)

    training_cost.append(tc)
    cv_cost.append(cv)

## plot the corresponding curves
fig, ax = plt.subplots()
ax.plot(l_candidate, training_cost, label='training')
ax.plot(l_candidate, cv_cost, label='cross validation')
## add the legend and axis labels
plt.legend()
plt.xlabel('lambda')
plt.ylabel('cost')
plt.show()


## compute the error on the test set for the final evaluation
for l in l_candidate:
    theta = linear_regression(X, y, l).x
    print('test cost(l={}) = {}'.format(l, computeCost(theta, Xtest, ytest)))

5. SVM: Large-Margin Classifier

This section uses the svm module from the sklearn package.

5.1 Linear SVM

import numpy as np
import pandas as pd
import scipy.io as sio
from sklearn.metrics import classification_report
from sklearn import svm
import matplotlib.pyplot as plt


def decision_boundary(svc,X1min,X1max,X2min,X2max,diff):
    x1=np.linspace(X1min,X1max,1000)
    x2=np.linspace(X2min,X2max,1000)
    coordinates=[(x,y) for x in x1 for y in x2]
    x_cord,y_cord=zip(*coordinates)
    c_val=pd.DataFrame({'x1':x_cord,'x2':y_cord})
    ## signed distance of each grid point from the decision boundary
    c_val['cval']=svc.decision_function(c_val[['x1','x2']])

    ## keep only the grid points that lie (almost) on the boundary
    decision=c_val[np.abs(c_val['cval'])<diff]
    return decision.x1,decision.x2

file=r'E:\python\MyMachineLearning\exp6\ex6data1.mat'
init_data=sio.loadmat(file)
X=init_data['X']
data=pd.DataFrame(X,columns=['X1','X2'])
data['y']=init_data['y']
## the parameter C = 1/lambda
## linear SVM; the loss can be 'hinge' or 'squared_hinge', and different values of C give different decision boundaries
svc=svm.LinearSVC(C=1,loss='hinge',max_iter=1000)
svc.fit(data[['X1','X2']],data['y'])
score=svc.score(data[['X1','X2']],data['y'])
x1, x2 = decision_boundary(svc, 0, 4, 1.5, 5, 2 * 10**-3)
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(x1, x2, s=10, c='r',label='Boundary')
positive=data[data['y'].isin([1])]
negative=data[data['y'].isin([0])]
ax.scatter(positive['X1'],positive['X2'],c='b',marker='o')
ax.scatter(negative['X1'],negative['X2'],c='r',marker='x')
ax.set_title('SVM (C=1) Decision Boundary')
ax.legend()
plt.show()

5.2 Nonlinear SVM

Here we use the Gaussian (RBF) kernel.

The Gaussian kernel function is as follows:

## Gaussian kernel
def gaussian_kernel(x1,x2,sigma):
    return np.exp(-np.sum((x1-x2)**2)/(2*(sigma**2)))
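
A quick check with the example values from the course exercise (x1=[1,2,1], x2=[0,4,-1], sigma=2 should give about 0.3247, i.e. exp(-9/8)):

x1=np.array([1.0,2.0,1.0])
x2=np.array([0.0,4.0,-1.0])
print(gaussian_kernel(x1,x2,2))  ## ≈ 0.32465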

The training code is as follows:

file=r'E:\python\MyMachineLearning\exp6\ex6data2.mat'
init_data=sio.loadmat(file)
X=init_data['X']
data=pd.DataFrame(X,columns=['X1','X2'])
data['y']=init_data['y']

## fit the data with the built-in RBF (Gaussian) kernel
svc=svm.SVC(C=100,gamma=10,probability=True)
svc.fit(data[['X1','X2']],data['y'])
res=svc.score(data[['X1','X2']],data['y'])
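
Since probability=True was passed, one way to visualize the fit (a sketch, not in the original post) is to color every training point by its predicted probability of belonging to the negative class:

data['Probability']=svc.predict_proba(data[['X1','X2']])[:,0]
fig,ax=plt.subplots(figsize=(12,8))
ax.scatter(data['X1'],data['X2'],s=30,c=data['Probability'],cmap='Reds')
plt.show()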

6. K-Means

6.1 Applying K-Means

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import scipy.io as sio


## find the closest centroid for each point
def find_closest_centroids(X,centroids):
    m=X.shape[0]
    k=centroids.shape[0]
    idx=np.zeros(m)
    for i in range(m):
        min_dist=1000000
        for j in range(k):
            ## squared distance from the point to centroid j
            dist=np.sum((X[i,:]-centroids[j,:])**2)
            if(dist < min_dist):
                min_dist=dist
                idx[i]=j
    return idx

# recompute the cluster centroids
def compute_centroids(X,idx,k):
    m,n=X.shape
    centroids=np.zeros((k,n))
    for i in range(k):
        ## indices of all samples currently assigned to centroid i
        indices=np.where(idx==i)
        centroids[i,:]=(np.sum(X[indices,:],axis=1)/len(indices[0])).ravel()
    return centroids


## the K-means algorithm
def run_k_means(X,initial_centroids,max_iters):
    m,n=X.shape
    k=initial_centroids.shape[0]
    idx=np.zeros(m)
    centroid=initial_centroids
    for i in range(max_iters):
        idx=find_closest_centroids(X,centroid)
        centroid=compute_centroids(X,idx,k)
    return idx,centroid

## randomly initialize k cluster centroids
def init_centroids(X,k):
    m,n=X.shape
    centroids=np.zeros((k,n))
    ## pick k random indices in [0, m)
    idx=np.random.randint(0,m,k)
    for i in range(k):
        centroids[i,:]=X[idx[i],:]
    return centroids

file=r'E:\python\MyMachineLearning\exp7\ex7data2.mat'
init_data=sio.loadmat(file)
X=init_data['X']
## randomly initialize the centroids
initial_centroids = init_centroids(X,3)
idx, centroids = run_k_means(X, initial_centroids, 10)
## points assigned to each cluster
cluster1=X[np.where(idx==0)[0],:]
cluster2=X[np.where(idx==1)[0],:]
cluster3=X[np.where(idx==2)[0],:]
fig,ax=plt.subplots()
ax.scatter(cluster1[:,0],cluster1[:,1],c='b',label='Cluster 1')
ax.scatter(cluster2[:,0],cluster2[:,1],c='r',label='Cluster 2')
ax.scatter(cluster3[:,0],cluster3[:,1],c='g',label='Cluster 3')
ax.legend()
plt.show()

Likewise, we can use sklearn's built-in KMeans to do the training:

from sklearn.cluster import KMeans

## n_init is the number of random centroid initializations; n_jobs is the number of parallel CPUs (-1 uses all)
model=KMeans(n_clusters=16,n_init=100,n_jobs=-1)
model.fit(data)
# get the cluster centers
centroids = model.cluster_centers_
## get the index of the closest cluster center for each data point
C = model.predict(data)

6.2 K-Means Application: Image Compression

import numpy as np
import pandas as pd
import scipy.io as sio
from IPython.display import Image
import matplotlib.pyplot as plt

def computeMinDist(X,centroids):
    m = X.shape[0]
    k = centroids.shape[0]
    idx = np.zeros(m)
    for i in range(m):
        min_dist = 1000000
        for j in range(k):
            ## squared distance from the point to centroid j
            dist = np.sum((X[i, :] - centroids[j, :]) ** 2)
            if (dist < min_dist):
                min_dist = dist
                idx[i] = j
    return idx

def compute_centroids(X,idx,k):
    m, n = X.shape
    centroids = np.zeros((k, n))
    for i in range(k):
        ## indices of all samples currently assigned to centroid i
        indices = np.where(idx == i)
        centroids[i, :] = (np.sum(X[indices, :], axis=1) / len(indices[0])).ravel()
    return centroids

def init_centroids(X,k):
    m,n=X.shape
    idx=np.random.randint(0,m,k)
    centroids=np.zeros((k,n))
    for i in range(k):
        centroids[i,:]=X[idx[i],:]
    return centroids

# the K-means algorithm
def k_mean(X,initial_centroids,max_iters):
    m, n = X.shape
    k = initial_centroids.shape[0]
    idx = np.zeros(m)
    centroid = initial_centroids
    for i in range(max_iters):
        idx = computeMinDist(X, centroid)
        centroid = compute_centroids(X, idx, k)
    return idx, centroid

file=r'E:\python\MyMachineLearning\exp7\bird_small.png'
imgFile=r'E:\python\MyMachineLearning\exp7\bird_small.mat'
image_data=sio.loadmat(imgFile)
A=image_data['A']

## normalize the pixel values to [0, 1]
A=A/255

X=np.reshape(A,(A.shape[0]*A.shape[1],A.shape[2]))

initial_centroids=init_centroids(X,16)
idx, centroids = k_mean(X, initial_centroids, 10)

idx = computeMinDist(X, centroids)

## map each pixel to the value of its cluster centroid
X_recovered = centroids[idx.astype(int),:]
## reshape back to the original image dimensions
X_recovered = np.reshape(X_recovered, (A.shape[0], A.shape[1], A.shape[2]))
plt.imshow(X_recovered)
plt.show()
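
For comparison, the original image can be plotted next to the compressed one (a small addition):

fig,ax=plt.subplots(1,2)
ax[0].imshow(A)
ax[0].set_title('original')
ax[1].imshow(X_recovered)
ax[1].set_title('compressed (16 colors)')
plt.show()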

7. PCA: Principal Component Analysis

7.1 PCA

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt

def pca(X):
    ## standardization
    m = X.shape[0]
    X = (X - X.mean(axis=0)) / X.std(axis=0)

    X=np.matrix(X)

    ## compute the covariance matrix

    cov=X.T*X/m
    ## singular value decomposition
    U,S,V=np.linalg.svd(cov)
    return U,S,V

## project the data onto the top k components
def project_data(X,U,k):
    U_reduced=U[:,:k]
    return X*U_reduced

## recover the data from the projection
def recover_data(Z,U,k):
    U_reduced = U[:, :k]
    return Z*U_reduced.T


## choose the smallest k that retains the desired fraction of the variance
def choose_K(X,ratio):
    U,S,V=pca(X)
    res=len(S)
    all_sum=sum(S)
    for i in range(1,len(S)+1):
        temp=sum(S[:i])
        if(temp/all_sum>=ratio):
            res=i
            break
    return res


## PCA can be used for dimensionality reduction
file=r'E:\python\MyMachineLearning\exp7\ex7data1.mat'
data=sio.loadmat(file)
X=data['X']

## visualize the data
# fig,ax=plt.subplots()
# ax.scatter(X[:,0],X[:,1],c='r')
# plt.show()

U, S, V = pca(X)
Z = project_data(X, U, 1)
w=recover_data(Z,U,1)

## The projection axis of the first principal component is essentially the diagonal of the dataset.
# When we reduce the data to one dimension, we lose the variation around that diagonal,
# so in the reconstruction everything lies along it.

7.2 PCA Application: Image Compression

import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt

def pca(X):
    ##normalize
    X=(X-X.mean())/X.std()
    X=np.matrix(X)
    m=X.shape[0]
    cov=X.T*X/m
    U,S,V=np.linalg.svd(cov)
    return U,S,V

## project the data onto the top k components
def project_data(X,U,k):
    U_reduced=U[:,:k]
    return X*U_reduced

## recover the data from the projection
def recover_data(Z,U,k):
    U_reduced=U[:,:k]
    return Z*U_reduced.T

## show the first n images
def plot_n_image(X,n):
    pic_size=int(np.sqrt(X.shape[1]))
    grid_size=int(np.sqrt(n))
    face_image=X[:n,:]
    fig,ax=plt.subplots(nrows=grid_size,ncols=grid_size,sharex=True,sharey=True)

    for r in range(grid_size):
        for c in range(grid_size):
            ax[r,c].imshow(face_image[grid_size*r+c].reshape((pic_size,pic_size)))
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))
    plt.show()

file=r'E:\python\MyMachineLearning\exp7\ex7faces.mat'
face_data=sio.loadmat(file)
X=face_data['X']
U,S,V=pca(X)
z=project_data(X,U,100)
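
To see what the 100-component compression loses, the data can be recovered and the first 100 faces plotted with the helper above (a sketch):

X_recovered=recover_data(z,U,100)
plot_n_image(np.array(X_recovered),100)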

8. Anomaly Detection

This section uses scipy's built-in stats.norm distribution.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as sio
from scipy import stats
import sklearn.preprocessing

## compute the mean and variance of each feature
def compute_gaussian(X):
    ## axis=0 averages down the columns (one value per feature)
    mu=X.mean(axis=0)
    sigma=X.var(axis=0)
    return mu,sigma

## search for a good threshold epsilon
def select_threshold(pval,yval):
    best_epsilon=0
    best_f1=0
    f1=0
    ## step size
    step=(pval.max()-pval.min())/1000
    for epsilon in np.arange(pval.min(),pval.max(),step):
        preds=pval<epsilon
        tp=np.sum(np.logical_and(preds==1,yval==1))
        fp = np.sum(np.logical_and(preds == 1, yval == 0)).astype(float)
        fn = np.sum(np.logical_and(preds == 0, yval == 1)).astype(float)

        ## compute precision, recall, and the F1 score
        precision=tp/(tp+fp)
        recall=tp/(tp+fn)
        f1=(2*precision*recall)/(precision+recall)
        if(f1>best_f1):
            best_f1=f1
            best_epsilon=epsilon
    return best_epsilon,best_f1

file=r'E:\python\MyMachineLearning\exp8\ex8data1.mat'
data=sio.loadmat(file)
X=data['X']

# visualize the data
# fig,ax=plt.subplots()
# ax.scatter(X[:,0],X[:,1])
# plt.show()

mu,sigma=compute_gaussian(X)
## plot the contours of the Gaussian distribution
# fig,ax=plt.subplots()
# xplot=np.linspace(0,25,100)
# yplot=np.linspace(0,25,100)
# ## build the grid; meshgrid returns the x and y coordinates
# Xplot,Yplot=np.meshgrid(xplot,yplot)
# # evaluate the (unnormalized) Gaussian on the grid
# Z = np.exp((-0.5)*((Xplot-mu[0])**2/sigma[0]+(Yplot-mu[1])**2/sigma[1]))
# ## draw the contour plot
# contour = plt.contour(Xplot, Yplot, Z,[10**-11, 10**-7, 10**-5, 10**-3, 0.1],colors='k')
# ax.scatter(X[:,0], X[:,1])
# plt.show()

## use the validation set to pick the threshold for flagging anomalies
Xval=data['Xval']
yval=data['yval']

## scipy's frozen normal distribution gives the pdf at each point
## (note: stats.norm expects the standard deviation, so take the square root of the variance)
# dist=stats.norm(mu[0],np.sqrt(sigma[0]))
# dist.pdf(X[:,0])

p=np.zeros((X.shape[0],X.shape[1]))
p[:,0] = stats.norm(mu[0], np.sqrt(sigma[0])).pdf(X[:,0])
p[:,1] = stats.norm(mu[1], np.sqrt(sigma[1])).pdf(X[:,1])

pval = np.zeros((Xval.shape[0], Xval.shape[1]))
pval[:,0] = stats.norm(mu[0], np.sqrt(sigma[0])).pdf(Xval[:,0])
pval[:,1] = stats.norm(mu[1], np.sqrt(sigma[1])).pdf(Xval[:,1])

## visualize the classification results
# epsilon, f1 = select_threshold(pval, yval)
# outliers = np.where(p < epsilon)
# fig,ax=plt.subplots()
# ax.scatter(X[:,0],X[:,1])
# ax.scatter(X[outliers[0],0],X[outliers[0],1],c='r',marker='x')
# plt.show()

A multivariate Gaussian model can be created as follows:

## multivariate Gaussian
mu=X.mean(axis=0)
cov=np.cov(X.T)
## create the multivariate Gaussian model
multi_normal = stats.multivariate_normal(mu, cov)
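
The frozen model's pdf can then score the points directly, and select_threshold can be reused on those densities (a sketch):

p=multi_normal.pdf(X)
pval=multi_normal.pdf(Xval)
epsilon,f1=select_threshold(pval,yval.ravel())
print(epsilon,f1)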

Reposted from blog.csdn.net/weixin_43869415/article/details/120300840