线性回归
PaddlePaddle官方文档以波士顿房价数据集为例给出了线性回归的使用教程。由于该教程使用的是内置数据集,且模型本身的可解释性并不直观,本文使用自定义数据集和模型重现了线性回归的建模和预测过程,从而在如下方面进行较为清晰的展示:
- 数据集的构造
- 如何构造训练数据集
- 如何构造测试数据集
- 模型的构造
- 本文使用最简单的一元线性回归模型y=3x+1,只要简单修改代码,也可以将其扩展为多元线性回归模型
- 训练数据集和测试数据集都基于该方程构建
背景介绍
本文使用的PaddlePaddle环境为官方提供的paddlepaddle/book镜像,PaddlePaddle版本为0.11.0,详细信息如下所示:
root@d89b8ceaeb86:/book# paddle version PaddlePaddle 0.11.0, compiled with with_avx: ON with_gpu: OFF with_mkl: ON with_mkldnn: ON with_double: OFF with_python: ON with_rdma: OFF
代码样例
# 加载所需包 import paddle.v2 as paddle import numpy as np
# Generate the training data from y = 3x + 1.
def train_reader():
    """Create the training-set reader for samples drawn from y = 3x + 1.

    Returns:
        A zero-argument callable. Each call returns a generator yielding one
        ``(features, label)`` pair per sample; both elements are length-1
        slices of the same row of the sample array.
    """
    # Rows are [x, y] for x = 1..7, i.e. [1, 4], [2, 7], ..., [7, 22].
    samples = np.array([[x, 3 * x + 1] for x in range(1, 8)])

    def reader():
        for row in samples:
            # Everything but the last column is the input; the last column
            # (kept as a length-1 slice) is the label.
            features, label = row[:-1], row[-1:]
            yield features, label

    return reader
# Generate the test data from y = 3x + 1.
def test_reader():
    """Create the test-set reader for samples drawn from y = 3x + 1.

    Returns:
        A zero-argument callable. Each call returns a generator yielding one
        ``(features, label)`` pair per sample; both elements are length-1
        lists sliced from the sample row.
    """
    # Rows are [x, y]; plain Python lists, so the slices below are lists too.
    samples = [[0.5, 2.5], [1.5, 5.5], [-2, -5], [0, 1]]

    def reader():
        for row in samples:
            yield row[:-1], row[-1:]

    return reader


# Split the test reader's output into inference inputs and expected labels.
# Each inference input is wrapped in a 1-tuple (one entry per sample).
test_data = []
test_label = []
test_data_creator = test_reader()
for features, label in test_data_creator():
    test_data.append((features,))
    test_label.append(label)
#初始化Paddle paddle.init(use_gpu=False) #配置训练网络 x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(1)) y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) y = paddle.layer.data(name='y', type= paddle.data_type.dense_vector(1)) cost = paddle.layer.square_error_cost(input=y_predict, label=y)
# Save the network topology for inference. This step is optional: nothing
# else in this walkthrough reads the serialized file back.
inference_topology = paddle.topology.Topology(layers=y_predict)
with open("lessontwo_topology.pkl", 'wb') as topology_file:
    inference_topology.serialize_for_inference(topology_file)
#创建参数 parameters = paddle.parameters.create(cost)
#创建Trainer optimizer = paddle.optimizer.Momentum(momentum=0) trainer = paddle.trainer.SGD(cost= cost, parameters=parameters, update_equation=optimizer)
feeding = {'x': 0, 'y': 1}
# Text event handler: prints training progress.
def event_handler(event):
    """Print pass id, batch id and cost when a training iteration ends.

    Args:
        event: Event object passed in by the trainer. Only
            ``paddle.event.EndIteration`` events are reported; every other
            event type is ignored.
    """
    if not isinstance(event, paddle.event.EndIteration):
        return
    # Reporting period in batches; a period of 1 reports every batch.
    if event.batch_id % 1 != 0:
        return
    progress = (event.pass_id, event.batch_id, event.cost)
    # Single-argument call form: prints the same text as the print statement.
    print("Pass %d, Batch %d, Cost %f" % progress)
# Plotting event handler: shows training progress as cost curves.
from paddle.v2.plot import Ploter

train_title = "Train cost"
test_title = "Test cost"
cost_ploter = Ploter(train_title, test_title)

# Number of EndIteration events handled so far; used as the x coordinate of
# every plotted point.
step = 0


def event_handler_plot(event):
    """Plot train/test cost per batch and checkpoint parameters per pass.

    On ``paddle.event.EndIteration`` the training cost carried by the event
    and a freshly computed test cost are appended to ``cost_ploter``, the
    plot is redrawn, and the global ``step`` counter is incremented.

    On ``paddle.event.EndPass`` the trainer's parameters are written to
    ``params_pass_<pass_id>.tar`` whenever ``pass_id`` is a multiple of 10.

    Args:
        event: Event object passed in by the trainer.
    """
    global step
    if isinstance(event, paddle.event.EndIteration):
        # The three periods below are all 1, so each action runs on every
        # batch; raise a modulus to make that action less frequent.
        if step % 1 == 0:  # every 1 batches, record a train cost
            cost_ploter.append(train_title, step, event.cost)

        if step % 1 == 0:  # every 1 batches, record a test cost
            result = trainer.test(
                reader=paddle.batch(test_reader(), batch_size=3),
                feeding=feeding)
            cost_ploter.append(test_title, step, result.cost)

        if step % 1 == 0:  # every 1 batches, update cost plot
            cost_ploter.plot()

        step += 1

    if isinstance(event, paddle.event.EndPass):
        if event.pass_id % 10 == 0:
            # A tar archive is binary data, so the file is opened in binary
            # mode; text mode can corrupt it on platforms that translate
            # line endings.
            with open('params_pass_%d.tar' % event.pass_id, 'wb') as f:
                trainer.save_parameter_to_tar(f)
# Start training.
# Wrap the training reader in a shuffle decorator (buf_size=3), then group
# the result into batches of 2 samples.
shuffled_samples = paddle.reader.shuffle(train_reader(), buf_size=3)
train_batches = paddle.batch(shuffled_samples, batch_size=2)

# Run 10 passes, reporting progress through the plotting event handler.
trainer.train(
    train_batches,
    feeding=feeding,
    event_handler=event_handler_plot,
    num_passes=10)
# Run inference on the test inputs and print each prediction next to its
# expected label.
probs = paddle.infer(
    output_layer=y_predict, parameters=parameters, input=test_data)
for i, prob in enumerate(probs):
    expected = str(test_label[i][0])
    predicted = str(prob[0])
    # Single-argument call form: prints the same text as the print statement.
    print("label=" + expected + ", predict=" + predicted)