本文所述内容参考caffe官网教程:http://nbviewer.jupyter.org/github/BVLC/caffe/blob/master/examples/02-fine-tuning.ipynb。由于官网教程为英文,且略显深奥,我将学习教程后自己总结的资料和代码记录下来,供日后查看,也方便新手入门caffe这一机器学习框架。
本文主要按以下几个步骤讲述fine tune的意义和实现:
一、fine tune原理;二、fine tune程序实现的大致思路;三、代码实现和实验效果。
一、fine tune原理。
未在教材和网络上找到fine tune的权威定义,我的理解为——对于已经针对某一数据集A训练好的网络参数集合,使用另一数据集B的数据和标签重新训练,以使神经网络在B上达到较好分类性能的训练方法。fine tune的主要意义在于用较少数据达到好的分类效果。
二、fine tune程序实现大致思路。
官网教程实现了以下功能——对已经由IMAGENET数据集训练完成的参数,将之用另一个仅有5个标签、2000张图像的数据集fine tune ,实现了较好的分类效果(与随机参数的网络迭代训练相同次数相比较结果完胜)。
程序主要分为以下几个步骤:1.准备图像还原函数,获取fine tune的数据集的准备工作。2.编写网络结构代码,载入训练数据集和标签的数据。3.编写solver,设定solver的学习参数。4.编写迭代训练函数。5.相关效果的测试程序。
三、代码实现和实验效果。
为方便自己和新手理解,本文代码经过重构,与官网代码有所不同,本代码复制至jupyter notebook可直接运行(修改caffe_root即可)。
3.1.获取数据集、图像还原、设置路径等预备工作
这部分代码无技术可言,直接从官网复制。
获取数据集
caffe_root = '/your/caffe/root' # 这里是caffe的根目录
# Download just a small subset of the data for this exercise.
# (2000 of 80K images, 5 of 20 labels.)
# To download the entire dataset, set `full_dataset = True`.
full_dataset = False
if full_dataset:
NUM_STYLE_IMAGES = NUM_STYLE_LABELS = -1
else:
NUM_STYLE_IMAGES = 2000
NUM_STYLE_LABELS = 5
# This downloads the ilsvrc auxiliary data (mean file, etc),
# and a subset of 2000 images for the style recognition task.
import os
os.chdir(caffe_root) # run scripts from caffe root
!data/ilsvrc12/get_ilsvrc_aux.sh
!scripts/download_model_binary.py models/bvlc_reference_caffenet
!python examples/finetune_flickr_style/assemble_data.py \
--workers=-1 --seed=1701 \
--images=$NUM_STYLE_IMAGES --label=$NUM_STYLE_LABELS
# back to examples
os.chdir('examples')
图像还原
import sys
# Put <caffe_root>python first on the module search path so that the
# `import caffe` below can be resolved from there.
sys.path.insert(0, caffe_root + 'python')
import caffe
# Run Caffe in CPU mode.
caffe.set_mode_cpu()
import numpy as np
from pylab import *
%matplotlib inline
import tempfile
def deprocess_net_image(image):
    """Convert a network-format image back into a displayable one.

    The network-side representation cannot be shown directly, so this undoes
    the preprocessing: channel order is reversed (BGR -> RGB), the axes are
    moved from CHW to HWC, the per-channel mean is (approximately) added back,
    and the result is clamped to [0, 255] and rounded.

    Args:
        image: numpy array indexed as (channel, height, width).

    Returns:
        A new uint8 array indexed as (height, width, channel). The input
        array is left untouched.
    """
    image = image.copy()  # don't modify destructively
    if not np.issubdtype(image.dtype, np.floating):
        # The in-place mean addition below cannot be cast back into an
        # integer array, so integer input is promoted to float first.
        image = image.astype(np.float32)
    image = image[::-1]  # BGR -> RGB
    image = image.transpose(1, 2, 0)  # CHW -> HWC
    image += [123, 117, 104]  # (approximately) undo mean subtraction
    # Clamp to the valid pixel range, then turn the floats into integers.
    image = np.clip(image, 0, 255)
    image = np.round(image)
    return np.require(image, dtype=np.uint8)
设置路径加载模型
import os
# Pretrained CaffeNet weights that the fine-tuning solver is initialised from.
# NOTE(review): the download step earlier in this notebook is pointed at
# models/bvlc_reference_caffenet; confirm the .caffemodel has been placed
# under data/weight/ before running this cell.
net_weights = os.path.join(caffe_root, 'data/weight/bvlc_reference_caffenet.caffemodel')
# Fail early, and say which file is missing.
assert os.path.exists(net_weights), 'pretrained weights not found: %s' % net_weights
3.2.编写神经网络结构代码、获取数据集和标签数据
这部分内容不灵活,个别代码解释见注释。
编写神经网络结构代码
from caffe import layers as L
from caffe import params as P
# Per-layer learning settings, listed as [weights, biases].
# learned_param lets a layer train; frozen_param uses lr_mult=0 for both
# entries. In this example every layer except the last one picks between
# the two.
weight_param = {'lr_mult': 1, 'decay_mult': 1}
bias_param = {'lr_mult': 2, 'decay_mult': 0}
learned_param = [weight_param, bias_param]
frozen_param = 2 * [{'lr_mult': 0}]
def conv_relu(bottom, ks, nout, stride=1, pad=0, group=1,
              param=learned_param,
              weight_filler=None, std=0.01,
              bias_filler=None, value=0.1):
    """Build a convolution layer followed by an in-place ReLU.

    Args:
        bottom: layer whose output feeds the convolution.
        ks: kernel size.
        nout: number of output channels.
        stride, pad, group: passed straight to the convolution layer.
        param: per-blob learning settings ([weights, biases]).
        weight_filler: weight initialiser spec. When omitted, a gaussian
            filler with standard deviation `std` is used.
        std: standard deviation of the default weight filler. Ignored when
            `weight_filler` is given explicitly.
        bias_filler: bias initialiser spec. When omitted, a constant filler
            with value `value` is used.
        value: constant of the default bias filler. Ignored when
            `bias_filler` is given explicitly.

    Returns:
        Tuple (conv, relu) of the two layer tops.
    """
    # `std` and `value` used to be accepted but never reached the fillers,
    # so the default fillers carried no std/value at all. They are folded
    # into the default filler specs here.
    if weight_filler is None:
        weight_filler = dict(type='gaussian', std=std)
    if bias_filler is None:
        bias_filler = dict(type='constant', value=value)
    conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
                         num_output=nout, pad=pad, group=group,
                         param=param, weight_filler=weight_filler,
                         bias_filler=bias_filler)
    return conv, L.ReLU(conv, in_place=True)
def fc_relu(bottom, nout, param=learned_param,
            weight_filler=dict(type='gaussian', std=0.005),
            bias_filler=dict(type='constant', value=0.1)):
    """Build a fully connected layer followed by an in-place ReLU.

    Args:
        bottom: layer whose output feeds the fully connected layer.
        nout: number of outputs.
        param: per-blob learning settings ([weights, biases]).
        weight_filler: weight initialiser spec.
        bias_filler: bias initialiser spec.

    Returns:
        Tuple (fc, relu) of the two layer tops.
    """
    inner = L.InnerProduct(
        bottom,
        num_output=nout,
        param=param,
        weight_filler=weight_filler,
        bias_filler=bias_filler,
    )
    activated = L.ReLU(inner, in_place=True)
    return inner, activated
def max_pool(bottom, ks, stride=1):
    """Return a max-pooling layer over `bottom` with kernel `ks` and `stride`."""
    pooling_args = dict(pool=P.Pooling.MAX, kernel_size=ks, stride=stride)
    return L.Pooling(bottom, **pooling_args)
def caffenet(data, label=None, train=True, num_classes=1000,
             classifier_name='fc8', learn_all=False):
    """Describe the CaffeNet architecture and write it to a temporary file.

    Args:
        data: input layer top providing the images.
        label: optional layer top providing the labels. When given, loss and
            accuracy layers are appended.
        train: when true, dropout follows fc6 and fc7; when false, dropout
            is left out and a softmax `probs` output is added.
        num_classes: number of outputs of the final classifier layer.
        classifier_name: name under which the final layer is registered.
        learn_all: when true, every parameterised layer uses learned_param;
            otherwise all layers before the classifier use frozen_param
            (lr_mult=0) and only the classifier is trained.

    Returns:
        Path of the temporary file holding the network definition. The file
        is created with delete=False, so removing it is up to the caller.
    """
    n = caffe.NetSpec()
    n.data = data
    param = learned_param if learn_all else frozen_param
    n.conv1, n.relu1 = conv_relu(n.data, 11, 96, stride=4, param=param)
    n.pool1 = max_pool(n.relu1, 3, stride=2)
    n.norm1 = L.LRN(n.pool1, local_size=5, alpha=1e-4, beta=0.75)
    n.conv2, n.relu2 = conv_relu(n.norm1, 5, 256, pad=2, group=2, param=param)
    n.pool2 = max_pool(n.relu2, 3, stride=2)
    n.norm2 = L.LRN(n.pool2, local_size=5, alpha=1e-4, beta=0.75)
    n.conv3, n.relu3 = conv_relu(n.norm2, 3, 384, pad=1, param=param)
    n.conv4, n.relu4 = conv_relu(n.relu3, 3, 384, pad=1, group=2, param=param)
    n.conv5, n.relu5 = conv_relu(n.relu4, 3, 256, pad=1, group=2, param=param)
    n.pool5 = max_pool(n.relu5, 3, stride=2)
    n.fc6, n.relu6 = fc_relu(n.pool5, 4096, param=param)
    if train:
        n.drop6 = fc7input = L.Dropout(n.relu6, in_place=True)
    else:
        fc7input = n.relu6
    n.fc7, n.relu7 = fc_relu(fc7input, 4096, param=param)
    if train:
        n.drop7 = fc8input = L.Dropout(n.relu7, in_place=True)
    else:
        fc8input = n.relu7
    # The classifier always uses learned_param, independent of learn_all:
    # fine-tuning sometimes trains only this last layer to speed things up.
    fc8 = L.InnerProduct(fc8input, num_output=num_classes, param=learned_param)
    setattr(n, classifier_name, fc8)
    # Outside training, normalise the scores into probabilities.
    if not train:
        n.probs = L.Softmax(fc8)
    # Loss and accuracy are added whenever labels are supplied, regardless
    # of the `train` flag.
    if label is not None:
        n.label = label
        n.loss = L.SoftmaxWithLoss(fc8, n.label)
        n.acc = L.Accuracy(fc8, n.label)
    # Text mode: the definition is written as a str, which a binary-mode
    # temporary file would reject on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write(str(n.to_proto()))
    return f.name
获取数据集和标签数据
# Text file from which get_file_content() loads the style label names.
style_type_file = caffe_root + 'examples/finetune_flickr_style/style_names.txt'
def style_caffenet(data_input, labels_input, num_classes, train=False, learn_all=False):
    """Build the CaffeNet variant used for the style task.

    Forwards to caffenet() with the classifier layer registered as
    'fc8_flickr'.

    Returns:
        Whatever caffenet() returns (the path of the network definition file).
    """
    net_options = dict(
        data=data_input,
        label=labels_input,
        train=train,
        num_classes=num_classes,
        learn_all=learn_all,
        classifier_name='fc8_flickr',
    )
    return caffenet(**net_options)
def get_data_batch(subset='test', mirror=True, batch_size=50):
    """Create the image/label input layer for the style dataset.

    Args:
        subset: which list file under data/flickr_style/ to read; the source
            is '<caffe_root>data/flickr_style/<subset>.txt'.
        mirror: value of the `mirror` transform option.
        batch_size: number of images per batch.

    The defaults reproduce the previously hard-coded behaviour (test.txt,
    mirror=True, batches of 50), so calling get_data_batch() with no
    arguments is unchanged.

    Returns:
        Tuple (data_input, labels_input) of the two tops of the ImageData
        layer.
    """
    source = caffe_root + 'data/flickr_style/%s.txt' % subset
    transform_param = dict(mirror=mirror, crop_size=227,
                           mean_file=caffe_root + 'data/ilsvrc12/imagenet_mean.binaryproto')
    data_input, labels_input = L.ImageData(
        transform_param=transform_param, source=source,
        batch_size=batch_size, new_height=256, new_width=256, ntop=2)
    return data_input, labels_input
def get_file_content():
    """Return the style label names stored in `style_type_file`.

    The file is read one name per line; surrounding whitespace is stripped
    and blank lines are skipped. Names may contain spaces.

    Returns:
        List of label name strings, in file order.
    """
    # Plain line reading instead of np.loadtxt(..., delimiter='\n'): recent
    # NumPy releases reject a newline delimiter, and a one-line file makes
    # loadtxt return a 0-d array that list() cannot iterate.
    with open(style_type_file) as names_file:
        stripped = (line.strip() for line in names_file)
        return [name for name in stripped if name]
3.3.编写solver,设定solver参数
这部分代码直接从官网复制(仅将用GPU训练处改成了CPU训练)。设定参数需要对神经网络的运行机制有一定了解,本文不多叙述。
设定神经网络参数
from caffe.proto import caffe_pb2
def solver(train_net_path, test_net_path=None, base_lr=0.001):
    """Write a solver configuration to a temporary file.

    Args:
        train_net_path: path of the training network definition file.
        test_net_path: optional path of a test network definition file.
            When given, testing is scheduled as well.
        base_lr: initial learning rate.

    Returns:
        Path of the temporary file holding the solver definition. The file
        is created with delete=False, so removing it is up to the caller.
    """
    s = caffe_pb2.SolverParameter()

    # Specify locations of the train and (maybe) test networks.
    s.train_net = train_net_path
    if test_net_path is not None:
        s.test_net.append(test_net_path)
        s.test_interval = 1000  # Test after every 1000 training iterations.
        s.test_iter.append(100)  # Test on 100 batches each time we test.

    # The number of iterations over which to average the gradient.
    # Effectively boosts the training batch size by the given factor, without
    # affecting memory utilization.
    s.iter_size = 1

    s.max_iter = 100000  # # of times to update the net (training iterations)

    # Solve using the stochastic gradient descent (SGD) algorithm.
    # Other choices include 'Adam' and 'RMSProp'.
    s.type = 'SGD'

    # Set the initial learning rate for SGD.
    s.base_lr = base_lr

    # Set `lr_policy` to define how the learning rate changes during training.
    # Here, we 'step' the learning rate by multiplying it by a factor `gamma`
    # every `stepsize` iterations.
    s.lr_policy = 'step'
    s.gamma = 0.1
    s.stepsize = 20000

    # Set other SGD hyperparameters. Setting a non-zero `momentum` takes a
    # weighted average of the current gradient and previous gradients to make
    # learning more stable. L2 weight decay regularizes learning, to help prevent
    # the model from overfitting.
    s.momentum = 0.9
    s.weight_decay = 5e-4

    # Display the current training loss and accuracy every 1000 iterations.
    s.display = 1000

    # Snapshots are files used to store networks we've trained. Here, we'll
    # snapshot every 10K iterations -- ten times during training.
    s.snapshot = 10000
    s.snapshot_prefix = caffe_root + 'models/finetune_flickr_style/finetune_flickr_style'

    # Train on the CPU. Using the CPU to train large networks is very slow.
    s.solver_mode = caffe_pb2.SolverParameter.CPU

    # Write the solver to a temporary file and return its filename.
    # Text mode: the definition is written as a str, which a binary-mode
    # temporary file would reject on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write(str(s))
    return f.name
3.4.运行solver,迭代训练
运行solver的函数,进行特定次数的SGD过程,对参数进行调整;保存下每次的Loss和Accuracy数值;最后将所有迭代过solver的参数都以文件形式保存。
def run_solvers(solvers, niter):
    """Train each solver for `niter` iterations and save its weights.

    Args:
        solvers: sequence of (name, solver) pairs. It is iterated more than
            once, so pass a list rather than a one-shot iterator.
        niter: number of single-step iterations to run per solver.

    Returns:
        Tuple (loss, acc, weights). `loss` and `acc` map each solver name to
        an array of length `niter` holding the per-iteration values of the
        'loss' and 'acc' blobs; `weights` maps each name to the path its
        network was saved to.
    """
    # One zero-filled history array per solver for each tracked blob.
    loss = {name: np.zeros(niter) for name, _ in solvers}
    acc = {name: np.zeros(niter) for name, _ in solvers}
    for name, s in solvers:
        for it in range(niter):
            # Run a single SGD step, then record the loss and accuracy.
            s.step(1)
            loss[name][it] = s.net.blobs['loss'].data.copy()
            acc[name][it] = s.net.blobs['acc'].data.copy()
            if it % 10 == 0:
                # Parenthesised single-argument print behaves the same on
                # Python 2 and Python 3.
                print('(%d) loss is %5f,acc is %6f%%'
                      % (it + 1, loss[name][it], acc[name][it] * 100))
    weights = {}
    for name, s in solvers:
        # After training, store every solver's network parameters on disk.
        weights[name] = caffe_root + 'data/weight/%s-pretrained_model.caffemodel' % name
        s.net.save(weights[name])
    return loss, acc, weights
运行
def finetune_training(niter=200):
    """Fine-tune the style network starting from the pretrained weights.

    Builds the training network and its solver, loads `net_weights` into the
    solver's network, then runs the solver through run_solvers().

    Args:
        niter: number of training iterations (defaults to the previously
            hard-coded 200).

    Returns:
        Tuple (train_loss, train_acc, style_weights): the per-iteration loss
        and accuracy arrays and the path of the saved weights.
    """
    # NOTE(review): get_data_batch() reads data/flickr_style/test.txt, so
    # training here runs on that list -- confirm whether train.txt was
    # intended.
    data_input, labels_input = get_data_batch()
    train_net_path = style_caffenet(data_input=data_input,
                                    labels_input=labels_input,
                                    num_classes=5, train=True)
    style_solver_filename = solver(train_net_path)
    style_solver = caffe.get_solver(style_solver_filename)
    # Initialise the network from the pretrained weights file.
    style_solver.net.copy_from(net_weights)

    # Solvers are kept in a list so that several training jobs could be run
    # through the same call.
    print('Running solvers for %d iterations...' % niter)
    solvers = [('finetuned', style_solver)]
    loss, acc, weights = run_solvers(solvers, niter)
    print('Done.')

    train_loss = loss['finetuned']
    train_acc = acc['finetuned']
    style_weights = weights['finetuned']
    return train_loss, train_acc, style_weights
# Run the fine-tuning and keep the loss/accuracy histories and the path of
# the saved weights.
train_loss,train_acc,style_weights = finetune_training()
3.5.编写相关的测试程序
将训练好的权重数据输入网络,测试网络对于目前数据集的分类效果。
# Path of the saved weights to evaluate.
# NOTE(review): 'your_weight' is presumably a placeholder for the real file
# name -- replace it before running.
pretrained_weights = os.path.join(caffe_root,'data/weight/your_weight')
def eval_style_net(weights=None, niter=50):
    """Evaluate the classification accuracy of the style network.

    Args:
        weights: path of the weights file to load. Defaults to the
            module-level `pretrained_weights`.
        niter: number of forward passes (batches) to average over.

    Returns:
        Mean of the 'acc' output over `niter` forward passes.
    """
    if weights is None:
        weights = pretrained_weights
    # Build the input layer here: data_input / labels_input are locals of
    # finetune_training() and do not exist at module level, so relying on
    # them raised NameError.
    # NOTE(review): get_data_batch() enables mirroring; confirm that is
    # acceptable for evaluation.
    data_input, labels_input = get_data_batch()
    net_stru = style_caffenet(data_input=data_input, labels_input=labels_input,
                              num_classes=5)
    net = caffe.Net(net_stru, weights, caffe.TEST)
    accuracy = 0
    for i in range(niter):
        accuracy += net.forward()['acc']
        if i % 10 == 0:
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print('iterate number is %d current sum is %f' % (i, accuracy))
    return accuracy / niter
# Evaluate the network and report the mean accuracy as a percentage.
general_accuracy = eval_style_net()
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('general accuracy is %f%%' % (general_accuracy * 100))