Dataset address: Index of /ml/machine-learning-databases/housing (uci.edu)
There are 506 samples in the data set. Each sample contains 13 features that affect housing prices, followed by a 14th value: the corresponding house price (MEDV), which is the quantity to predict.
dataset format
0.00632 18.00 2.310 0 0.5380 6.5750 65.20 4.0900 1 296.0 15.30 396.90 4.98 24.00 0.02731 0.00 7.070 0 0.4690 6.4210 78.90 4.9671 2 242.0 17.80 396.90 9.14 21.60 0.02729 0.00 7.070 0 0.4690 7.1850 61.10 4.9671 2 242.0 17.80 392.83 4.03 34.70 0.03237 0.00 2.180 0 0.4580 6.9980 45.80 6.0622 3 222.0 18.70 394.63 2.94 33.40 0.06905 0.00 2.180 0 0.4580 7.1470 54.20 6.0622 3 222.0 18.70 396.90 5.33 36.20 0.02985 0.00 2.180 0 0.4580 6.4300 58.70 6.0622 3 222.0 18.70 394.12 5.21 28.70 0.08829 12.50 7.870 0 0.5240 6.0120 66.60 5.5605 5 311.0 15.20 395.60 12.43 22.90 0.14455 12.50 7.870 0 0.5240 6.1720 96.10 5.9505 5 311.0 15.20 396.90 19.15 27.10 0.21124 12.50 7.870 0 0.5240 5.6310 100.00 6.0821 5 311.0 15.20 386.63 29.93 16.50
data read
np.fromfile() does not recover the shape of the data: with sep=' ' it simply parses every number in the text file into one flat 1-D array. So data.reshape() is used here to transform the array back into its original [N, 14] shape.
# Import the required packages
import numpy as np
import json

# Read the raw values from the text file; the result is one flat 1-D array
datafile = 'housing.data'
data = np.fromfile(datafile, sep=' ')
print(data.shape)

# Each record has 14 items: the first 13 are influencing factors and the
# 14th is the corresponding house price
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
feature_num = len(feature_names)

# Reshape the flat array into the form [N, 14]
data = data.reshape((data.shape[0] // feature_num, feature_num))
print(data.shape)  # the article's sample run prints (506, 14)

# Take a look at the first record
X = data[0]
print(X.shape)
print(X)
(7084,)
(506, 14)
(14,)
[6.320e-03 1.800e+01 2.310e+00 0.000e+00 5.380e-01 6.575e+00 6.520e+01
4.090e+00 1.000e+00 2.960e+02 1.530e+01 3.969e+02 4.980e+00 2.400e+01]
Divide the dataset
In machine learning, the data set is usually divided into a training set and a test set: the training set is used to fit the model, and the test set is used to evaluate the model's performance. The training set is normally the larger part; here 80% of the samples are used for training and the remaining 20% for testing (ratio = 0.8).
# Import the required packages
import numpy as np
import json

# Read the raw values from the text file; the result is one flat 1-D array
datafile = 'housing.data'
data = np.fromfile(datafile, sep=' ')
print(data.shape)

# Each record has 14 items: the first 13 are influencing factors and the
# 14th is the corresponding house price
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
feature_num = len(feature_names)

# Reshape the flat array into the form [N, 14]
data = data.reshape((data.shape[0] // feature_num, feature_num))
print(data.shape)  # the article's sample run prints (506, 14)

# Take a look at the first record
X = data[0]
print(X.shape)
print(X)

# The leading `ratio` share of the rows is the training set and the
# remaining rows are the test set, so the two sets never overlap
ratio = 0.8
offset = int(len(data) * ratio)
train_data, test_data = data[:offset], data[offset:]
print('训练集的大小', train_data.shape)
print('测试集的大小', test_data.shape)
(7084,)
(506, 14)
(14,)
[6.320e-03 1.800e+01 2.310e+00 0.000e+00 5.380e-01 6.575e+00 6.520e+01
4.090e+00 1.000e+00 2.960e+02 1.530e+01 3.969e+02 4.980e+00 2.400e+01]
Size of training set (404, 14)
Size of test set (102, 14)
data normalization
# Import the required packages
import numpy as np
import json

# Read the raw values from the text file; the result is one flat 1-D array
datafile = 'housing.data'
data = np.fromfile(datafile, sep=' ')
print(data.shape)

# Each record has 14 items: the first 13 are influencing factors and the
# 14th is the corresponding house price
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
feature_num = len(feature_names)

# Reshape the flat array into the form [N, 14]
data = data.reshape((data.shape[0] // feature_num, feature_num))
print(data.shape)  # the article's sample run prints (506, 14)

# Take a look at the first record
X = data[0]
print(X.shape)
print(X)

# The leading `ratio` share of the rows is the training set and the
# remaining rows are the test set
ratio = 0.8
offset = int(len(data) * ratio)
train_data, test_data = data[:offset], data[offset:]
print('训练集的大小', train_data.shape)
print('测试集的大小', test_data.shape)
print('归一化前的数据', train_data[0])

# Per-column maximum, minimum and mean of the training set
maxinums = train_data.max(axis=0)
mininums = train_data.min(axis=0)
avgs = train_data.sum(axis=0) / train_data.shape[0]

# Normalise every column of the training set in place:
# (value - column mean) / (column max - column min).
# train_data is a slice of data, so the matching rows of data change too.
train_data[:] = (train_data - avgs) / (maxinums - mininums)
print('归一化后的数据', train_data[0])
(7084,)
(506, 14)
(14,)
[6.320e-03 1.800e+01 2.310e+00 0.000e+00 5.380e-01 6.575e+00 6.520e+01
4.090e+00 1.000e+00 2.960e+02 1.530e+01 3.969e+02 4.980e+00 2.400e+01]
Size of training set (404, 14)
Size of test set (102, 14)
Data before normalization [6.320e-03 1.800e+01 2.310e+00 0.000e+00 5.380e-01 6.575e+00 6.520e+01
4.090e+00 1.000e+00 2.960e+02 1.530e+01 3.969e+02 4.980e+00 2.400e+01]
Normalized data [-0.02146321 0.03767327 -0.28552309 -0.08663366 0.01289726 0.04634817 0.00795597 -0.00765794 -0.25172191 -0.11881188 -0.29002528
0.0519112 -0.17590923 -0.00390539]
Model
import numpy as np
class NetWork(object):
    """Linear model ``z = x . w + b`` trained with batch gradient descent."""

    def __init__(self, num_of_weights):
        """Create a model with ``num_of_weights`` input weights.

        The global NumPy random seed is fixed to 0 before drawing the initial
        weights so that every run starts from the same values. ``w`` has
        shape (num_of_weights, 1) and ``b`` starts at 0.
        """
        np.random.seed(0)
        self.w = np.random.randn(num_of_weights, 1)
        self.b = 0

    def forward(self, x):
        """Return the prediction ``x . w + b`` for the samples in ``x``."""
        return np.dot(x, self.w) + self.b

    def loss(self, z, y):
        """Return the mean of the squared differences between ``z`` and ``y``."""
        residual = z - y
        return np.mean(residual * residual)

    def gradient(self, x, y):
        """Return ``(gradient_w, gradient_b)`` for inputs ``x`` and labels ``y``.

        ``gradient_w`` is the per-feature mean of ``(z - y) * x`` turned into
        a column vector; ``gradient_b`` is the scalar mean of ``z - y``.
        """
        residual = self.forward(x) - y
        # Average over the sample axis, then make it a column so that it
        # lines up with self.w
        gradient_w = np.mean(residual * x, axis=0).reshape(-1, 1)
        # b is a single number, so a plain mean already yields a scalar
        gradient_b = np.mean(residual)
        return gradient_w, gradient_b

    def update(self, gradient_w, gradient_b, eta=0.01):
        """Move ``w`` and ``b`` one step against the gradient.

        ``eta`` is the learning rate, i.e. the size of the step.
        """
        step_w = eta * gradient_w
        step_b = eta * gradient_b
        self.w = self.w - step_w
        self.b = self.b - step_b

    def train(self, x, y, iterations=1000, eta=0.01):
        """Run ``iterations`` gradient-descent steps and return the loss list.

        The loss recorded for a step is the one measured before that step's
        parameter update. Progress is printed on every 10th step.
        """
        losses = []
        for step in range(iterations):
            current_loss = self.loss(self.forward(x), y)  # forward pass + loss
            grad_w, grad_b = self.gradient(x, y)          # gradients
            self.update(grad_w, grad_b, eta)              # parameter update
            losses.append(current_loss)
            if (step + 1) % 10 == 0:
                print('iter {}, loss {}'.format(step, current_loss))
        return losses
if __name__ == "__main__":
    # Synthetic demo data: 1000 samples with 13 features each
    inputs = np.random.randn(1000, 13)
    # Ground-truth labels, used for the loss and for the gradients
    gt_output = np.random.randn(1000, 1)

    # Build the model
    model = NetWork(13)
    print('模型的初始参数', model.w, model.b)

    for i in range(100):
        # Forward pass: compute the model output
        output = model.forward(inputs)
        # Loss of the current parameters (computed for illustration; it is
        # not printed)
        loss = model.loss(output, gt_output)
        # Gradients must be taken against the ground-truth labels. Passing
        # the model's own output here would make the residual (z - y) zero,
        # so both gradients would be zero and the parameters would never
        # change.
        w, b = model.gradient(inputs, gt_output)
        print('第{}epoch参数'.format(i), model.w, model.b)
        # Parameter update
        model.update(w, b)
full code
import numpy as np
from matplotlib import pyplot as plt
def load_data(datafile='housing.data', ratio=0.8):
    """Load the housing data and return normalised train/test splits.

    The file is parsed as whitespace-separated numbers and reshaped to
    [N, 14]: 13 influencing factors followed by the median house price.
    The first ``ratio`` share of the rows becomes the training set and the
    rest the test set, so the two sets never overlap. Every column is
    normalised as ``(value - mean) / (max - min)``, where mean, max and min
    are computed on the training rows only and then applied to all rows.

    Args:
        datafile: Path of the data file. Defaults to 'housing.data'.
        ratio: Share of the rows used for training. Defaults to 0.8.

    Returns:
        A tuple ``(train_data, test_data)`` of 2-D arrays with 14 columns.

    Raises:
        ValueError: If the file holds no values, the number of values is not
            a multiple of 14, or the training split would be empty.
    """
    feature_names = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
    ]
    feature_num = len(feature_names)

    # Read the file as one flat array of numbers
    data = np.fromfile(datafile, sep=' ')
    print(data.shape)
    if data.size == 0 or data.size % feature_num != 0:
        raise ValueError(
            'expected a non-empty multiple of {} values in {!r}, got {}'.format(
                feature_num, datafile, data.size))

    # Reshape the flat array into the form [N, 14]
    data = data.reshape(-1, feature_num)
    print(data.shape)

    # Rows before `offset` are the training set, the rest the test set
    offset = int(data.shape[0] * ratio)
    if offset <= 0:
        raise ValueError(
            'ratio={!r} leaves no rows for the training set'.format(ratio))
    data_slice = data[:offset]

    # Statistics come from the training rows only, so nothing about the
    # test rows leaks into the normalisation
    maximums = data_slice.max(axis=0)
    minimums = data_slice.min(axis=0)
    avgs = data_slice.sum(axis=0) / data_slice.shape[0]

    # A column that is constant over the training rows has max == min;
    # divide by 1 there so it normalises to 0 instead of 0/0 = NaN
    value_range = maximums - minimums
    value_range[value_range == 0] = 1.0

    # Normalise all rows (train and test) with the training statistics
    data = (data - avgs) / value_range

    train_data = data[:offset]
    test_data = data[offset:]
    return train_data, test_data
class NetWork(object):
    """Single linear layer (``x . w + b``) fitted by gradient descent."""

    def __init__(self, num_of_weights):
        """Draw the initial weights and set the bias to 0.

        The global NumPy seed is set to 0 first, so the initial ``w`` (shape
        (num_of_weights, 1)) is the same on every run.
        """
        np.random.seed(0)
        self.w = np.random.randn(num_of_weights, 1)
        self.b = 0

    def forward(self, x):
        """Compute the model output for the input samples ``x``."""
        return np.dot(x, self.w) + self.b

    def loss(self, z, y):
        """Compute the mean squared difference between ``z`` and ``y``."""
        diff = z - y
        squared = diff * diff
        return np.mean(squared)

    def gradient(self, x, y):
        """Compute the gradients of the weights and of the bias.

        Returns a tuple ``(gradient_w, gradient_b)``: a column vector holding
        the per-feature mean of ``(z - y) * x`` and the scalar mean of
        ``z - y``.
        """
        diff = self.forward(x) - y
        # Mean over the rows (axis=0), reshaped into a column like self.w
        per_feature = np.mean(diff * x, axis=0)
        gradient_w = per_feature.reshape(-1, 1)
        # The bias is one number, so the mean gives a scalar directly
        gradient_b = np.mean(diff)
        return gradient_w, gradient_b

    def update(self, gradient_w, gradient_b, eta=0.01):
        """Apply one gradient-descent step with learning rate ``eta``.

        The parameters move in the direction opposite to the gradient.
        """
        self.w = self.w - gradient_w * eta
        self.b = self.b - gradient_b * eta

    def train(self, x, y, iterations=1000, eta=0.01):
        """Train for ``iterations`` steps and return the per-step losses.

        Each step is: forward pass, loss, gradients, parameter update. The
        stored loss is the value before the update; every 10th step is
        printed.
        """
        losses = []
        for it in range(iterations):
            prediction = self.forward(x)
            loss_value = self.loss(prediction, y)
            grad_w, grad_b = self.gradient(x, y)
            self.update(grad_w, grad_b, eta)
            losses.append(loss_value)
            if (it + 1) % 10 == 0:
                print('iter {}, loss {}'.format(it, loss_value))
        return losses
# Fetch the normalised training and test sets
train_data, test_data = load_data()
print(train_data.shape)

# Every column except the last is an input feature; the last column is the
# target value
x, y = train_data[:, :-1], train_data[:, -1:]

# Build the network
net = NetWork(13)
num_iterations = 2000

# Start training
losses = net.train(x, y, iterations=num_iterations, eta=0.01)

# Plot how the loss changes over the iterations
plot_x = np.arange(num_iterations)
plot_y = np.array(losses)
plt.plot(plot_x, plot_y)
plt.show()
references:
Boston House Price Prediction - An Entry-Level Case of Machine Learning - Programmer Sought