[Notes] Morvan Python (莫烦Python) | TensorFlow Tutorial - Building Our First Neural Network (Chapter 3)

#3.1 Example 3 Defining add_layer()
add_layer() adds one layer to the neural network.

import tensorflow as tf
# Define a function that adds one neural-network layer
def add_layer(inputs, in_size, out_size, activation_function = None):
    Weights = tf.Variable(tf.random_normal([in_size,out_size])) # in_size rows = size of the input to this layer, out_size columns = number of neurons in this layer
    biases = tf.Variable(tf.zeros([1,out_size]) + 0.1) # biases start at 0.1 rather than 0
    Wx_plus_b = tf.matmul(inputs,Weights) + biases # Wx_plus_b stands for W*x + b

    if activation_function is None: # no activation function means the layer is purely linear, so output Wx_plus_b directly
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b) # pass the value through the (non-linear) activation function
    return outputs
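
A minimal usage sketch (TF 1.x), reusing the import and the add_layer() defined above; the one-feature placeholder and the ten-neuron hidden layer are chosen to match the examples below:

xs = tf.placeholder(tf.float32, [None, 1])                  # a column of input values
l1 = add_layer(xs, 1, 10, activation_function=tf.nn.relu)   # 1 input feature -> 10 hidden neurons
prediction = add_layer(l1, 10, 1)                           # 10 hidden neurons -> 1 output, no activation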


#3.2 Example 3 Building a Neural Network
Build a simple neural network to fit y = x^2 - 0.5 (plus noise).

import tensorflow as tf
import numpy as np
# Define the function that adds one neural-network layer
def add_layer(inputs, in_size, out_size, activation_function = None):
    Weights = tf.Variable(tf.random_normal([in_size,out_size])) # in_size rows = size of the input to this layer, out_size columns = number of neurons in this layer
    biases = tf.Variable(tf.zeros([1,out_size]) + 0.1) # biases start at 0.1 rather than 0
    Wx_plus_b = tf.matmul(inputs,Weights) + biases # Wx_plus_b stands for W*x + b

    if activation_function is None: # no activation function means the layer is purely linear, so output Wx_plus_b directly
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b) # pass the value through the (non-linear) activation function
    return outputs

x_data = np.linspace(-1, 1, 300, dtype = np.float32)[:, np.newaxis]  # input: linspace builds 300 evenly spaced numbers from -1 to 1 (the default count would be 50); [:, np.newaxis] turns the 1-D array into a column vector ([np.newaxis, :] would give a row vector)
noise = np.random.normal(0, 0.05, x_data.shape).astype(np.float32) # noise from a normal distribution with mean 0 and standard deviation 0.05, with the same shape as x_data, so the target looks more like real data; astype converts the dtype to float32
y_data = np.square(x_data) - 0.5 + noise # x squared, minus an arbitrary constant, plus the noise

# Placeholders hold the data that will be fed in later: dtype float32, shape [None, 1] means a column vector with 1 column and any number of rows.
# Unlike a Variable, a placeholder gets its values at Session run time, through the feed_dict dictionary passed to sess.run. Syntax: tf.placeholder(dtype, shape=None, name=None)
xs = tf.placeholder(tf.float32, [None, 1])
ys = tf.placeholder(tf.float32, [None, 1])


l1 = add_layer(xs, 1, 10, activation_function = tf.nn.relu) # hidden layer l1: input is xs, input size 1 (one input feature), output size 10 (ten neurons in the hidden layer)
prediction = add_layer(l1, 10, 1, activation_function = None) # output layer: input is l1 (the hidden layer's output), input size 10, output size 1 (usually a single output)
loss = tf.reduce_mean(tf.reduce_sum(tf.square(ys-prediction),reduction_indices = [1])) # error between prediction and the true ys: square the differences, sum over each row, then average; reduction_indices=[1] sums along the rows, [0] would sum along the columns (see the note below)

train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss) # what the network learns: a gradient-descent optimizer with learning rate 0.1 (< 1) that minimizes the error loss

init = tf.global_variables_initializer() # variables must be initialized before use
sess = tf.Session() # create a Session and use it to run the initialization step
sess.run(init)

for i in range(1000): # train for 1000 steps
    sess.run(train_step, feed_dict={xs:x_data, ys:y_data}) # feed the placeholders: x_data goes to xs, y_data goes to ys
    if i % 50 == 0: # print the training error every 50 steps
        print(sess.run(loss, feed_dict={xs:x_data, ys:y_data})) 

The printed error gets smaller and smaller as training goes on.

##About the reduce_sum() function:
tf.reduce_sum() needs reduction_indices to say which axis to sum over: [1] sums along the rows, [0] sums along the columns.
In the figure the row sums [3, 3] are drawn vertically, but that is only the layout of the drawing: without keep_dims the reduced dimension is dropped, so the result is the 1-D array [3, 3], not the column vector [[3], [3]]. Whether you sum rows or columns, the output comes back as a 1-D vector.

[Figure: reduce_sum with reduction_indices=[0] vs reduction_indices=[1]]
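
A quick standalone check of the two axes (TF 1.x; this small example is mine, not from the original post):

import tensorflow as tf

x = tf.constant([[1., 1., 1.],
                 [1., 1., 1.]])
with tf.Session() as sess:
    print(sess.run(tf.reduce_sum(x, reduction_indices=[1])))                  # row sums    -> [3. 3.]
    print(sess.run(tf.reduce_sum(x, reduction_indices=[0])))                  # column sums -> [2. 2. 2.]
    print(sess.run(tf.reduce_sum(x, reduction_indices=[1], keep_dims=True)))  # keep_dims=True keeps the reduced axis -> [[3.] [3.]]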

 

Reference: the Zhihu (知乎) question "What does TensorFlow's reduce_sum() function mean? Can someone explain it?"

#3.3 Example 3 Visualization of results

#import os
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # uncomment these two lines to silence TensorFlow's info/warning log messages
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Define the function that adds one neural-network layer
def add_layer(inputs, in_size, out_size, activation_function = None):
    Weights = tf.Variable(tf.random_normal([in_size,out_size])) # in_size rows = size of the input to this layer, out_size columns = number of neurons in this layer
    biases = tf.Variable(tf.zeros([1,out_size]) + 0.1) # biases start at 0.1 rather than 0
    Wx_plus_b = tf.matmul(inputs,Weights) + biases # Wx_plus_b stands for W*x + b

    if activation_function is None: # no activation function means the layer is purely linear, so output Wx_plus_b directly
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b) # pass the value through the (non-linear) activation function
    return outputs

x_data = np.linspace(-1, 1, 300, dtype = np.float32)[:, np.newaxis]  # input: linspace builds 300 evenly spaced numbers from -1 to 1 (the default count would be 50); [:, np.newaxis] turns the 1-D array into a column vector ([np.newaxis, :] would give a row vector)
noise = np.random.normal(0, 0.05, x_data.shape).astype(np.float32) # noise from a normal distribution with mean 0 and standard deviation 0.05, with the same shape as x_data, so the target looks more like real data; astype converts the dtype to float32
y_data = np.square(x_data) - 0.5 + noise # x squared, minus an arbitrary constant, plus the noise

# Placeholders hold the data that will be fed in later: dtype float32, shape [None, 1] means a column vector with 1 column and any number of rows.
# Unlike a Variable, a placeholder gets its values at Session run time, through the feed_dict dictionary passed to sess.run. Syntax: tf.placeholder(dtype, shape=None, name=None)
xs = tf.placeholder(tf.float32, [None, 1])
ys = tf.placeholder(tf.float32, [None, 1])


l1 = add_layer(xs, 1, 10, activation_function = tf.nn.relu) # hidden layer l1: input is xs, input size 1 (one input feature), output size 10 (ten neurons in the hidden layer)
prediction = add_layer(l1, 10, 1, activation_function = None) # output layer: input is l1 (the hidden layer's output), input size 10, output size 1 (usually a single output)
loss = tf.reduce_mean(tf.reduce_sum(tf.square(ys-prediction),reduction_indices = [1])) # error between prediction and the true ys: square the differences, sum over each row, then average; reduction_indices=[1] sums along the rows, [0] would sum along the columns (see the note above)

train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss) # what the network learns: a gradient-descent optimizer with learning rate 0.1 (< 1) that minimizes the error loss

init = tf.global_variables_initializer() # variables must be initialized before use
sess = tf.Session() # create a Session and use it to run the initialization step
sess.run(init)

# Plot the real data
fig = plt.figure() # create a figure (the drawing canvas)
ax = fig.add_subplot(1, 1, 1) # split the figure into a 1 x 1 grid and draw in the first (only) cell
ax.scatter(x_data, y_data) # scatter plot of the raw data
plt.ion() # interactive mode, so the plot can be updated continuously while training
plt.show() # show the figure; with plt.ion() this call does not block, so the training loop below keeps running (in some runs the author found these two lines had to be commented out, reason unresolved)
for i in range(1000): # train for 1000 steps
    sess.run(train_step, feed_dict={xs:x_data, ys:y_data}) # feed the placeholders: x_data goes to xs, y_data goes to ys
    if i % 50 == 0: # update the plot every 50 steps
        #print(sess.run(loss, feed_dict={xs:x_data, ys:y_data}))
        # Visualize the current result
        try:
            ax.lines.remove(lines[0]) # erase the previously drawn curve before drawing the new one; the try/except avoids an error on the first pass, when no curve exists yet
        except Exception:
            pass
        prediction_value = sess.run(prediction, feed_dict={xs:x_data})
        # Plot the prediction
        lines = ax.plot(x_data, prediction_value,'r-',lw=5) # x data, y data, red solid line, line width 5
        plt.pause(0.1) # pause 0.1 s between redraws
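
If the figure window closes as soon as the loop finishes, one common option (my addition, not part of the original tutorial) is to switch interactive mode back off and block on the final figure after the loop:

plt.ioff()   # turn interactive mode off again
plt.show()   # now show() blocks, keeping the final figure on screen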


[Figure: scatter plot of the training data]

[Figure: training results] (Why are my training results so bad? Unresolved.)

##About add_subplot(1, 1, 1): http://www.codeweblog.com/matplotlib-pyplot%E4%B8%ADadd_subplot%E6%96%B9%E6%B3%95%E5%8F%82%E6%95%B0111%E7%9A%84%E5%90%AB%E4%B9%89/
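
For comparison, a small standalone sketch of what the three numbers mean (a hypothetical example of mine, not from the linked page):

import matplotlib.pyplot as plt

fig = plt.figure()
ax1 = fig.add_subplot(2, 2, 1)   # 2 rows, 2 columns, 1st cell (top-left)
ax4 = fig.add_subplot(2, 2, 4)   # 4th cell (bottom-right)
ax1.plot([0, 1], [0, 1])         # draw something in each cell
ax4.scatter([0, 1], [1, 0])
plt.show()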

#3.4 Speed Up Training
The original tutorial explains this vividly: Speed Up Training - Tensorflow | Morvan Python (莫烦Python). It gives no formulas, though, so the derivations behind the methods are hard to follow from the prose alone.
Training on a large dataset is slow unless some form of acceleration is applied.
Several ways to speed up training:
###Stochastic Gradient Descent (SGD, the entry-level method)

Split the data into small batches and feed them into the NN batch by batch; this speeds up training considerably without losing much accuracy. A minimal sketch of the idea follows.
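
A minimal sketch of mini-batch feeding, reusing x_data, y_data, xs, ys, train_step and sess from the script in 3.2 (the batch size of 30 is my assumption, not from the tutorial):

batch_size = 30                                          # assumed batch size
for step in range(1000):
    start = (step * batch_size) % len(x_data)            # walk through the data in order, wrapping around
    batch_x = x_data[start:start + batch_size]
    batch_y = y_data[start:start + batch_size]
    sess.run(train_step, feed_dict={xs: batch_x, ys: batch_y})   # one parameter update per small batch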
###Momentum (commonly used)

The traditional update of the parameters W adds to them a negative learning rate times the correction value, i.e. the gradient dx: W += -learning_rate * dx.
A visual example: most of the other methods tinker with this parameter-update step. The plain update above can make the learning path very tortuous, like a drunk person wobbling home and taking lots of detours.
Momentum moves this person from flat ground onto a slope: as soon as he takes a small step downhill, inertia keeps carrying him down, and the detours become far fewer. That is the Momentum parameter update; a sketch of the rule follows.
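
A minimal NumPy sketch of the momentum update described above (the learning rate, the momentum coefficient b1 and the toy gradient are my own illustrative choices):

import numpy as np

lr, b1 = 0.1, 0.9              # assumed learning rate and momentum coefficient
W = np.array([5.0])            # a toy parameter
m = np.zeros_like(W)           # accumulated "velocity"

def dx(W):                     # toy gradient of the loss W**2
    return 2 * W

for _ in range(100):
    m = b1 * m - lr * dx(W)    # keep part of the previous motion (inertia) plus the new downhill step
    W = W + m                  # the parameter keeps sliding downhill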


###AdaGrad
AdaGrad works on the learning rate instead, so that every parameter update gets its own learning rate. In the walking analogy, it gives the person a pair of uncomfortable shoes: his feet hurt whenever he wanders, so the shoes act as resistance against detours and force him to walk straight ahead. A sketch of the rule follows.
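
A minimal NumPy sketch of the AdaGrad idea (learning rate, toy gradient and the small epsilon are my own illustrative choices):

import numpy as np

lr = 0.5                                    # assumed learning rate
W = np.array([5.0])                         # a toy parameter
v = np.zeros_like(W)                        # accumulated squared gradients

def dx(W):                                  # toy gradient of the loss W**2
    return 2 * W

for _ in range(100):
    g = dx(W)
    v = v + g ** 2                          # the "uncomfortable shoes": resistance grows with every step taken
    W = W - lr * g / (np.sqrt(v) + 1e-8)    # each parameter effectively gets its own learning rate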

###RMSProp (the optimizer AlphaGo used)
Combining Momentum's inertia with AdaGrad's resistance to wrong directions gives RMSProp, which has most of the advantages of both methods. RMSProp does not incorporate Momentum completely, though; part of the Momentum term is still missing. A sketch of the rule follows.
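
A minimal NumPy sketch of the RMSProp update (decay rate, learning rate and toy gradient are my own illustrative choices):

import numpy as np

lr, decay = 0.01, 0.9                       # assumed learning rate and decay rate
W = np.array([5.0])                         # a toy parameter
v = np.zeros_like(W)                        # decaying average of squared gradients

def dx(W):                                  # toy gradient of the loss W**2
    return 2 * W

for _ in range(100):
    g = dx(W)
    v = decay * v + (1 - decay) * g ** 2    # AdaGrad-style resistance, but with a decaying memory
    W = W - lr * g / (np.sqrt(v) + 1e-8)    # note: no explicit momentum term here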

###Adam (commonly used)
Adam has the downhill (Momentum) property when computing m and the AdaGrad resistance property when computing v, and then takes both m and v into account when updating the parameters. Experiments show that most of the time Adam reaches the goal quickly and converges fast. In short, to speed up training a neural network you want both the slope and the uncomfortable shoes. A sketch of the rule follows.
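
A minimal NumPy sketch of the Adam update described above (the hyper-parameter values and the toy gradient are my own illustrative choices; the bias-correction step of full Adam is omitted in this sketch):

import numpy as np

lr, b1, b2, eps = 0.01, 0.9, 0.999, 1e-8    # assumed hyper-parameters
W = np.array([5.0])                         # a toy parameter
m = np.zeros_like(W)                        # Momentum-style first moment ("downhill")
v = np.zeros_like(W)                        # AdaGrad/RMSProp-style second moment ("resistance")

def dx(W):                                  # toy gradient of the loss W**2
    return 2 * W

for _ in range(100):
    g = dx(W)
    m = b1 * m + (1 - b1) * g               # momentum part
    v = b2 * v + (1 - b2) * g ** 2          # resistance part
    W = W - lr * m / (np.sqrt(v) + eps)     # both m and v enter the parameter update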


#3.5 Optimizer
The most basic one is the linear gradient-descent optimizer (GradientDescentOptimizer) used in the examples above.
TensorFlow's optimizers come in seven kinds in total; for the commonly used ones, see the previous section.

tf.train provides a set of classes and functions that help train models.

The Optimizer base class provides methods for computing the gradients of a loss and applying those gradients to variables. A collection of subclasses implements classic optimization algorithms such as GradientDescent and Adagrad.

You never instantiate the Optimizer class itself; you instantiate one of its subclasses.
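
As a rough sketch of that base-class interface, the minimize(loss) call used in the examples above can be split into its two underlying steps (reusing the loss from section 3.2):

optimizer = tf.train.GradientDescentOptimizer(0.1)
grads_and_vars = optimizer.compute_gradients(loss)      # list of (gradient, variable) pairs
train_step = optimizer.apply_gradients(grads_and_vars)  # equivalent to optimizer.minimize(loss)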

tf.train.Optimizer
tf.train.GradientDescentOptimizer
tf.train.AdadeltaOptimizer
tf.train.AdagradOptimizer
tf.train.AdagradDAOptimizer
tf.train.MomentumOptimizer
tf.train.AdamOptimizer
tf.train.FtrlOptimizer
tf.train.ProximalGradientDescentOptimizer
tf.train.ProximalAdagradOptimizer
tf.train.RMSPropOptimizer
See tf.contrib.opt for more optimizers.
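
To try one of these in the script from section 3.2, only the train_step line needs to change; for example (the learning rates here are my own guesses, not tuned values):

train_step = tf.train.AdamOptimizer(0.01).minimize(loss)
# or, for instance:
# train_step = tf.train.MomentumOptimizer(0.1, momentum=0.9).minimize(loss)
# train_step = tf.train.RMSPropOptimizer(0.01).minimize(loss)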
 

Source: blog.csdn.net/nyist_yangguang/article/details/122756514