Machine-Learning 编程作业

Programming Exercise 1：Linear Regression

1. 单变量线性回归
1.1 读取数据并显示
1.2 定义代价函数
1.3 梯度下降法
1.4 可视化
2. 多变量线性回归
2.1 读入数据并显示
2.2 特征归一化
2.3 代价函数
2.4 梯度下降
3. 正规方程法实现单变量回归
4. 用sklearn库中函数实现单变量回归
作业文件打包如下：链接：https://pan.baidu.com/s/1S6-q29v_zYWUXugWAZk-zg 提取码：h20r

单变量线性回归

1.1 读取数据并显示

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the single-variable data set. The file has no header row, so the two
# columns are named explicitly.
path = 'ex1data1.txt'
data = pd.read_csv(
    path,
    names=['Population', 'Profit'],
    header=None,
)
# head() prints the first 5 rows by default; pass a number to change that.
# (data.describe() can be printed as well for summary statistics.)
print(data.head())

# Scatter plot of Profit against Population.
data.plot(kind='scatter', x='Population', y='Profit', figsize=(10, 8))
plt.show()

运行结果如图：

在这里插入图片描述
1.2 定义代价函数
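根据代价函数的定义：$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2$，其中 $h_\theta(x) = \theta^T x = \theta_0 + \theta_1 x_1$，$m$ 为样本个数。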
所以在定义代价函数的同时还要对数据进行处理：将读入的数据拆分为 X 和 y，并初始化 theta；同时在 X 中插入一列 X0，令其恒等于 1。

# Cost function
def computeCost(X, y ,theta):
    """Return the squared-error cost of ``theta`` on the data ``(X, y)``.

    The value is ``sum((X * theta.T - y) ** 2) / (2 * len(X))``. ``*`` has to
    act as a matrix product, so the arguments are expected to be
    ``np.matrix`` objects, with ``theta`` stored as a row.
    """
    residuals = X * theta.T - y
    total_squared_error = np.sum(np.power(residuals, 2))
    return total_squared_error / (2 * len(X))

# Prepare the data
# Prepend a column named 'Ones' filled with 1 (the x0 term of the model).
data.insert(0, 'Ones', 1)
cols = data.shape[1]
X = data.iloc[:, :cols - 1]  # all rows, every column except the last
y = data.iloc[:, cols - 1:]  # all rows, only the last column
# Convert to np.matrix so that `*` inside computeCost is a matrix product.
X = np.matrix(X.values)
y = np.matrix(y.values)
theta = np.matrix([0, 0])  # 1x2 row of initial parameters, all zero
# Shapes can be checked while developing with:
#   print(X.shape, theta.shape, y.shape)
print(computeCost(X, y, theta))

1.3 定义梯度下降函数

# Batch gradient descent
def gradientDescent(X, y, theta, alpha, iters):
    """Run ``iters`` steps of batch gradient descent for linear regression.

    Args:
        X: ``np.matrix`` of inputs, one row per sample (``*`` must be a
            matrix product).
        y: ``np.matrix`` column of targets, one row per sample.
        theta: ``np.matrix`` row (1 x n) of initial parameters. It is not
            modified; a new matrix is returned.
        alpha: Learning rate (step size).
        iters: Number of update steps to perform.

    Returns:
        A tuple ``(theta, cost)``: the parameters after the last step and a
        1-D array of length ``iters`` holding the cost after every step.
    """
    m = len(X)
    cost = np.zeros(iters)  # cost after each iteration

    for i in range(iters):
        error = (X * theta.T) - y

        # error.T * X is the 1 x n row whose j-th entry is
        # sum(error * X[:, j]), i.e. every partial derivative at once, all
        # evaluated at the same theta (simultaneous update). Building a new
        # theta each step avoids sharing one scratch buffer with theta.
        theta = theta - (alpha / m) * (error.T * X)
        cost[i] = computeCost(X, y, theta)

    return theta, cost
# Gradient-descent settings: learning rate and number of iterations.
alpha, iters = 0.01, 1000
g, cost = gradientDescent(X, y, theta, alpha, iters)
print(g)
print(computeCost(X, y, g))  # cost at the fitted parameters

# 100 evenly spaced Population values spanning the data, and the fitted
# line evaluated at them (g[0, 0] is the intercept, g[0, 1] the slope).
x = np.linspace(data['Population'].min(), data['Population'].max(), 100)
f = (g[0, 1] * x) + g[0, 0]

1.4 可视化

# Draw the fitted line on top of the training data.
# The figure and axes must exist before anything is drawn on `ax`;
# without this line the calls below fail with a NameError.
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Traning Data')
ax.legend(loc=2)  # show the legend for the labelled artists (loc=2: upper left)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()

# Plot how the cost changes with the iteration number.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(np.arange(iters), cost, 'r')
ax.set_title('Error vs. Training Epoch')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
plt.show()

结果图如下：在这里插入图片描述

以上就是单变量线性回归的全部代码。

多变量线性回归

过程与单变量线性回归类似，这里不再赘述，直接上代码：

#!/usr/bin/env python
# -*- coding:utf-8 -*-
#多变量线性回归

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the two-feature data set; the file has no header row.
Path = "ex1data2.txt"
data = pd.read_csv(Path, names=['Size', 'Bedrooms', 'Price'], header=None)
print(data.head())

# Data preparation
# Standardise every column (features and target alike) to zero mean and unit
# standard deviation. The original author notes that gradient descent
# overflows without this step.
data = (data - data.mean()) / data.std()
# Prepend the constant column X0 = 1.
data.insert(0, 'X0', 1)
cols = data.shape[1]
# All columns but the last are inputs, the last column is the target.
# Both are converted to np.matrix so that `*` is a matrix product.
X = np.matrix(data.iloc[:, :cols - 1].values)
y = np.matrix(data.iloc[:, cols - 1:].values)
# One parameter per input column, all starting at 0.
theta = np.matrix(np.zeros(cols - 1))

# Cost function
def computeCost(X, y ,theta):
    """Compute the squared-error cost ``sum((X * theta.T - y)^2) / (2 * m)``.

    ``m`` is ``len(X)``. The arguments are expected to be ``np.matrix``
    objects because ``*`` is used as a matrix product.
    """
    m = len(X)
    squared_errors = np.power(X * theta.T - y, 2)
    return np.sum(squared_errors) / (2 * m)
# print (theta.shape[1])
# P = int(theta.shape[1])
# print(theta.ravel().shape[1])
# parameters = int(theta.ravel().shape[1])

# Gradient descent
def Descent(X, y, theta, alpha, iterms):
    """Run ``iterms`` steps of batch gradient descent for linear regression.

    Args:
        X: ``np.matrix`` of inputs, one row per sample (``*`` must be a
            matrix product).
        y: ``np.matrix`` column of targets, one row per sample.
        theta: ``np.matrix`` row (1 x n) of initial parameters. It is not
            modified; a new matrix is returned.
        alpha: Learning rate (step size).
        iterms: Number of update steps to perform.

    Returns:
        A tuple ``(theta, cost)``: the parameters after the last step and a
        1-D array of length ``iterms`` with the cost after every step.
    """
    m = len(X)
    cost = np.zeros(iterms)

    for i in range(iterms):
        error = (X * theta.T) - y  # prediction error for every sample

        # error.T * X holds sum(error * X[:, j]) for every parameter j, so
        # all parameters are updated simultaneously from the same theta.
        # A new theta is built each step rather than reusing a scratch
        # buffer that would end up aliasing theta.
        theta = theta - (alpha / m) * (error.T * X)
        cost[i] = computeCost(X, y, theta)  # record the cost of this round

    return theta, cost
# Gradient-descent settings: learning rate and number of iterations.
alpha, iters = 0.01, 1000
g, cost = Descent(X, y, theta, alpha, iters)
print(computeCost(X, y, g))  # cost at the fitted parameters (print(g) shows them)

# Plot the cost recorded at every iteration.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(np.arange(iters), cost, 'r')
ax.set_title('Error vs. Training Epoch')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
plt.show()

结果如下图：
在这里插入图片描述

正规方程法实现单变量线性回归

注意，这里用正规方程法取代梯度下降法来求解单变量回归，它们之间的区别有以下几点：
在这里插入图片描述
完整代码如下：

#!/usr/bin/env python
# -*- coding:utf-8 -*-
#单变量线性回归用正规方程法

import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the single-variable data set; the file has no header row, so the
# column names are supplied here.
Path = "ex1data1.txt"
data = pd.read_csv(
    Path,
    names=['Population','Profit'],
    header=None,
)
print(data.head())

# Cost function
def Cost(X, y, theta):
    """Return ``sum((X * theta.T - y) ** 2) / (2 * len(X))``.

    ``*`` is used as a matrix product, so ``X``, ``y`` and ``theta`` are
    expected to be ``np.matrix`` objects with ``theta`` stored as a row.
    """
    deviation = (X * theta.T) - y
    return np.sum(np.power(deviation, 2)) / (2 * len(X))

# Prepend a column named 'Ones' filled with 1 (the x0 term of the model).
data.insert(0, 'Ones', 1)
cols = data.shape[1]  # number of columns in data
# All columns but the last are inputs, the last column is the target.
# Both are converted to np.matrix so that `*` is a matrix product.
X = np.matrix(data.iloc[:, :cols - 1].values)
y = np.matrix(data.iloc[:, cols - 1:].values)
# One parameter per input column, all starting at 0.
theta = np.matrix(np.zeros(cols - 1))
print(Cost(X, y, theta))


def normalF(X, y):
    """Solve linear regression in closed form with the normal equation.

    Computes ``theta = pinv(X^T X) X^T y``.

    The pseudo-inverse is used instead of a plain inverse so that a singular
    ``X^T X`` (for example with redundant or constant-zero columns) still
    yields a least-squares solution instead of raising ``LinAlgError``.
    ``.dot`` is used for the products, so both ``np.matrix`` and plain 2-D
    ``ndarray`` inputs work.

    Args:
        X: 2-D inputs, one row per sample.
        y: Column of targets, one row per sample.

    Returns:
        The parameters as a column with one row per column of ``X``
        (an ``np.matrix`` when the inputs are ``np.matrix``).
    """
    gram = X.T.dot(X)
    theta = np.linalg.pinv(gram).dot(X.T.dot(y))
    return theta
# def normalEqn(X, y):
#     theta = np.linalg.inv(X.T.dot(X)).dot(X.T.dot(y))
#     return theta
# Solve for the parameters in closed form; theta is a column, so the
# intercept is theta[0, 0] and the slope is theta[1, 0].
theta = normalF(X, y)
print(theta)

# 100 evenly spaced Population values spanning the data, and the fitted
# line evaluated at them.
# (Alternative: copy the two entries into a plain list first, e.g.
#  g = [float(theta[0]), float(theta[1])], and use f = g[0] + g[1] * x.)
x = np.linspace(data['Population'].min(), data['Population'].max(), 100)
f = (theta[1, 0] * x) + theta[0, 0]

# Fitted line drawn over the training data.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data['Population'], data['Profit'], label='Traning Data')
ax.legend(loc=2)  # legend for the labelled artists (loc=2: upper left)
ax.set_title('Predicted Profit vs. Population Size')
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
plt.show()

结果如下：
在这里插入图片描述

扫描二维码关注公众号，回复： 5355536 查看本文章

用sklearn库中函数实现单变量线性回归

这个最简单，直接调用库函数，填好参数就行，完整代码如下：

#!/usr/bin/env python
# -*- coding:utf-8 -*-
#单变量回归用sklearn库中的函数实现

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model


# Load the single-variable data set; the file has no header row.
path = 'ex1data1.txt'
data = pd.read_csv(path, header=None, names=['Population', 'Profit'])
print(data.head())

# Scatter plot of the raw data.
data.plot(x='Population', y='Profit', kind='scatter', figsize=(10,8))
plt.show()

# Prepend a column of ones. LinearRegression fits its own intercept by
# default, so this column is redundant for the fit; it is kept so that X has
# the same layout as in the other scripts.
data.insert(0, 'Ones', 1)
cols = data.shape[1]
# Keep X and y as plain 2-D ndarrays: recent scikit-learn releases reject
# np.matrix input with a TypeError.
X = data.iloc[:,0:cols-1].values  # all rows, every column except the last
y = data.iloc[:,cols-1:cols].values  # all rows, only the last column
theta = np.matrix(np.array([0,0]))  # not used by the scikit-learn fit below

model = linear_model.LinearRegression()
model.fit(X, y)
x = np.array(X[:, 1])  # the Population column as a 1-D array
f = model.predict(X).flatten()  # predictions for the training inputs, flattened to 1-D

# Fitted line drawn over the training data.
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Traning Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()