基于pytorch1.2的FCN语义分割手提包数据集
前言
作为一只刚刚入门深度学习的菜鸟来说,这是第一次编写使用FCN的代码来做语义分割,过程还是挺头疼的,别人的代码一看就懂,自己一写就懵。这篇博客仅记录一下自己的体验。
关于手提包数据集在下述链接有着详细的阐释,这里仅仅是根据个人理解对原文代码的修改。
原文参考链接:https://blog.csdn.net/u014453898/article/details/92080859
代码
- 自定义数据集代码
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os
import cv2
import pdb
from onehot import onehot
import torch
class BagDataset(Dataset):
    """Handbag segmentation dataset.

    Reads images from ``last/`` and same-named binary masks from
    ``last_msk/``, resizes both to 160x160, and returns a pair
    ``(normalized image tensor, one-hot FloatTensor mask of shape (2, H, W))``.

    The sorted file list is split 60/20/20 into train/val/test according
    to ``mode`` ('train' / 'val' / anything else = test).
    """

    def __init__(self, mode):
        # ImageNet statistics; images are converted to RGB in __getitem__
        # so these per-channel values line up with the channels they were
        # computed on.  (Also fixes the original 'tranform' typo.)
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
        # sorted(): os.listdir order is platform-dependent, which would
        # make the train/val/test split differ between runs.
        self.img = sorted(os.listdir('last'))
        n = len(self.img)
        if mode == 'train':
            self.img = self.img[:int(0.6 * n)]
        elif mode == 'val':
            self.img = self.img[int(0.6 * n):int(0.8 * n)]
        else:
            self.img = self.img[int(0.8 * n):]

    def __len__(self):
        return len(self.img)

    def __getitem__(self, idx):
        img_name = self.img[idx]
        imgA = cv2.imread('last/' + img_name)
        # cv2 loads BGR; convert to RGB so the Normalize statistics above
        # are applied to the channels they were measured on.
        imgA = cv2.cvtColor(imgA, cv2.COLOR_BGR2RGB)
        imgA = cv2.resize(imgA, (160, 160))
        imgB = cv2.imread('last_msk/' + img_name, 0)  # grayscale mask
        imgB = cv2.resize(imgB, (160, 160))
        imgB = (imgB / 255).astype('uint8')  # {0, 255} -> class ids {0, 1}
        imgB = onehot(imgB, 2)                        # (H, W, 2)
        imgB = imgB.swapaxes(0, 2).swapaxes(1, 2)     # -> (2, H, W)
        imgB = torch.FloatTensor(imgB)
        imgA = self.transform(imgA)
        return imgA, imgB
# 60/20/20 train/val/test split is performed inside BagDataset via `mode`.
train_db = BagDataset(mode='train')
val_db = BagDataset(mode='val')
test_db = BagDataset(mode='test')
# NOTE(review): shuffle=True on val/test is harmless for the pixel-accuracy
# metric but unusual; num_workers=4 spawns worker processes at iteration time.
train_loader = DataLoader(train_db, batch_size=4, shuffle=True, num_workers=4)
val_loader = DataLoader(val_db, batch_size=4, shuffle=True, num_workers=4)
test_loader = DataLoader(test_db, batch_size=4, shuffle=True, num_workers=4)
if __name__ == '__main__':
    # Smoke test: fetch a single batch to make sure the pipeline works.
    for _sample in train_loader:
        break
- onehot函数
import numpy as np
def onehot(data, n):
    """One-hot encode an integer array.

    Parameters
    ----------
    data : ndarray of ints in ``[0, n)``, any shape.
    n : int, number of classes.

    Returns
    -------
    float ndarray of shape ``data.shape + (n,)`` with a 1 at each
    element's class index and 0 elsewhere.
    """
    buf = np.zeros(data.shape + (n,))
    # Flat index of the "hot" slot for every element.  Bug fix: the
    # original wrote to ``nmsk - 1``, shifting every mark one flat slot
    # left (class 0 of element i landed on class n-1 of element i-1, and
    # the very first mark wrapped to the end of the buffer).
    nmsk = np.arange(data.size) * n + data.ravel()
    buf.ravel()[nmsk] = 1
    return buf
- FCN模型代码
import torch
import torch.nn as nn
from torchvision import models
import time
import visdom
from BagData import train_loader, test_loader, val_loader
import torch.optim as optim
from torchvision.models.vgg import VGG
import numpy as np
class FCN32s(nn.Module):
    """FCN-32s: a single 32x upsampling of the coarsest VGG16 feature map.

    Output shape is (N, n_class, H, W), matching the input spatial size.
    """

    def __init__(self, n_class):
        super(FCN32s, self).__init__()
        self.n_class = n_class
        self.feature = models.vgg16(pretrained=True).features
        # padding=100 (as in the original FCN) keeps the feature map large
        # enough that the 7x7 fc6 convolution below stays valid for small inputs.
        self.feature[0] = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=100)
        # fc6/fc7 of VGG re-cast as convolutions.
        self.module = nn.Sequential(
            nn.Conv2d(512, 4096, kernel_size=7),
            nn.ReLU(inplace=True),
            nn.Dropout2d(),
            nn.Conv2d(4096, 4096, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Dropout2d(),
        )
        self.classifier = nn.Conv2d(4096, self.n_class, kernel_size=1)
        self.deconv1 = nn.ConvTranspose2d(self.n_class, self.n_class,
                                          kernel_size=64, stride=32, bias=False)

    def forward(self, x):
        h = x  # keep the input so we can crop back to its spatial size
        x = self.feature(x)
        x = self.module(x)
        score = self.classifier(x)
        upsample = self.deconv1(score)
        # Bug fix: the original cropped with x.size(), but x was reassigned
        # to the coarse fc7 feature map above, so the network returned a
        # feature-map-sized (e.g. 7x7) output.  Crop to the saved input
        # size instead; offset 19 compensates for the padding=100 in conv1_1.
        upsample = upsample[:, :, 19:19 + h.size()[2], 19:19 + h.size()[3]].contiguous()
        return upsample
class FCN16s(nn.Module):
    """FCN-16s: fuses the pool4 skip connection with the 2x-upsampled
    coarse score map, then upsamples 16x back to the input resolution."""

    def __init__(self, n_class):
        super(FCN16s, self).__init__()
        self.n_class = n_class
        # VGG16 backbone; padding=100 on conv1_1 (as in the original FCN)
        # keeps the feature map large enough for the 7x7 fc6 convolution.
        self.feature = list(models.vgg16(pretrained=True).features)
        self.feature[0] = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=100)
        self.feature1 = nn.Sequential(*self.feature[:24])  # up to pool4
        self.feature2 = nn.Sequential(*self.feature[24:])  # pool4 -> pool5
        # fc6/fc7 re-cast as convolutions.
        self.module = nn.Sequential(
            nn.Conv2d(512, 4096, kernel_size=7),
            nn.ReLU(inplace=True),
            nn.Dropout2d(),
            nn.Conv2d(4096, 4096, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Dropout2d()
        )
        self.classifier1 = nn.Conv2d(4096, self.n_class, kernel_size=1)  # fc7 scores
        self.classifier2 = nn.Conv2d(512, self.n_class, kernel_size=1)   # pool4 scores
        self.deconv1 = nn.ConvTranspose2d(self.n_class, self.n_class, kernel_size=4, stride=2, bias=False)
        self.deconv2 = nn.ConvTranspose2d(self.n_class, self.n_class, kernel_size=32, stride=16, bias=False)
        # NOTE(review): this single BatchNorm is applied after *both*
        # deconvolutions in forward(), so the two stages share running
        # statistics and affine parameters — looks unintentional, but
        # changing it would alter the checkpoint layout; confirm before fixing.
        self.bn = nn.BatchNorm2d(self.n_class)

    def forward(self, x):
        # Output after pool4 (kept for the skip connection below).
        num = self.feature1(x)
        num1 = self.feature2(num)  # pool5 features
        num2 = self.module(num1)   # fc7 features
        score1 = self.classifier1(num2)
        score2 = self.classifier2(num)
        # 2x upsample the coarse scores, crop the pool4 scores to match
        # (offset 5 compensates for the padding=100), and fuse by addition.
        upsample1 = self.bn(self.deconv1(score1))
        score2 = score2[:, :, 5:5 + upsample1.size()[2], 5:5 + upsample1.size()[3]]
        upsample1 += score2
        # 16x upsample the fused map and crop back to the input size.
        upsample = self.bn(self.deconv2(upsample1))
        upsample = upsample[:, :, 27:27 + x.size()[2], 27:27 + x.size()[3]].contiguous()
        return upsample
class FCN8s(nn.Module):
    """FCN-8s: fuses pool4 and pool3 skip connections with the upsampled
    coarse score map, then upsamples 8x back to the input resolution."""

    def __init__(self, n_class):
        super(FCN8s, self).__init__()
        self.n_class = n_class
        # VGG16 backbone; padding=100 on conv1_1 (as in the original FCN)
        # keeps the feature map large enough for the 7x7 fc6 convolution.
        self.feature = list(models.vgg16(pretrained=True).features)
        self.feature[0] = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=100)
        self.feature1 = nn.Sequential(*self.feature[:17])    # up to pool3
        self.feature2 = nn.Sequential(*self.feature[17:24])  # pool3 -> pool4
        self.feature3 = nn.Sequential(*self.feature[24:])    # pool4 -> pool5
        # fc6/fc7 re-cast as convolutions.
        self.module = nn.Sequential(
            nn.Conv2d(512, 4096, kernel_size=7),
            nn.ReLU(inplace=True),
            nn.Dropout2d(p=0.5),
            nn.Conv2d(4096, 4096, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Dropout2d(p=0.5)
        )
        self.classifier1 = nn.Conv2d(4096, self.n_class, kernel_size=1)  # fc7 scores
        self.classifier2 = nn.Conv2d(512, self.n_class, kernel_size=1)   # pool4 scores
        self.classifier3 = nn.Conv2d(256, self.n_class, kernel_size=1)   # pool3 scores
        self.deconv1 = nn.ConvTranspose2d(self.n_class, self.n_class, kernel_size=4, stride=2, bias=False)
        self.deconv2 = nn.ConvTranspose2d(self.n_class, self.n_class, kernel_size=4, stride=2, bias=False)
        self.deconv3 = nn.ConvTranspose2d(self.n_class, self.n_class, kernel_size=16, stride=8, bias=False)
        # NOTE(review): a single BatchNorm is reused after all three
        # deconvolutions in forward(), so the stages share running
        # statistics and affine parameters — looks unintentional, but
        # changing it would alter the checkpoint layout; confirm before fixing.
        self.bn = nn.BatchNorm2d(self.n_class)

    def forward(self, x):
        num = self.feature1(x)      # pool3 features
        num1 = self.feature2(num)   # pool4 features
        num2 = self.feature3(num1)  # pool5 features
        num3 = self.module(num2)    # fc7 features
        score1 = self.classifier1(num3)  # scores after pool5 (fc7)
        score2 = self.classifier2(num1)  # scores after pool4
        score3 = self.classifier3(num)   # scores after pool3
        # 2x upsample the pool5 scores and fuse with cropped pool4 scores
        # (the offsets compensate for the padding=100 in conv1_1).
        upsample1 = self.bn(self.deconv1(score1))
        score2 = score2[:, :, 5:5 + upsample1.size()[2], 5:5 + upsample1.size()[3]]
        upsample1 += score2
        # 2x upsample the fused map and fuse with cropped pool3 scores.
        upsample2 = self.bn(self.deconv2(upsample1))
        score3 = score3[:, :, 9:9 + upsample2.size()[2], 9:9 + upsample2.size()[3]]
        upsample2 += score3
        # 8x upsample the final fused map (the original comment said 2x,
        # but deconv3 has stride 16 kernel / stride 8) and crop to input size.
        upsample = self.bn(self.deconv3(upsample2))
        upsample = upsample[:, :, 31:31 + x.size()[2], 31:31 + x.size()[3]].contiguous()
        return upsample
if __name__ == '__main__':
    # Shape sanity check: a 224x224 batch must come back at full resolution.
    dummy = torch.rand(3, 3, 224, 224)
    model = FCN8s(20)
    result = model(dummy)
    print(result.shape)
- 主函数
import torch
import torch.nn as nn
import time
import visdom
from BagData import train_loader, val_loader, test_loader
import torch.optim as optim
import numpy as np
from Myfcn import FCN8s
# Use the GPU when one is available, but fall back to CPU so the script
# still runs on machines without CUDA (the original hard-coded 'cuda',
# which crashes at the first .to(device) on CPU-only hosts).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch.manual_seed(1234)
def evalute(model, loader):
    """Return the pixel accuracy of `model` over `loader`.

    Predictions and one-hot targets are both reduced with argmin along
    the channel dimension (consistent with how the training loop reports
    them), then compared pixel-wise.  Returns 0.0 for an empty loader.
    """
    model.eval()
    correct = 0
    total = 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        with torch.no_grad():
            logits = model(x)
            output = torch.sigmoid(logits)
        pred = output.argmin(dim=1)
        y = y.argmin(dim=1)
        correct += torch.eq(pred, y).sum().float().item()
        # Bug fix: the original computed total *after* the loop as
        # (pixels in the last batch) * num_batches, which over/undercounts
        # whenever the final batch is smaller than the rest; accumulate
        # the true pixel count per batch instead.
        total += y.numel()
    acc = correct / total if total else 0.0
    print('acc:', acc)
    return acc
def main():
    """Train FCN8s on the handbag dataset with live visdom monitoring.

    Trains for 100 epochs of SGD, validates every epoch, checkpoints the
    best model to 'best.mdl', and finally reports test accuracy with the
    best weights restored.
    """
    vis = visdom.Visdom()
    fcn_model = FCN8s(n_class=2).to(device)
    optimizer = optim.SGD(fcn_model.parameters(), lr=1e-2, momentum=0.7)
    # BCEWithLogitsLoss fuses the sigmoid into the loss, which is
    # numerically more stable than sigmoid + BCELoss (same objective).
    criteon = nn.BCEWithLogitsLoss().to(device)
    best_acc, best_epoch = 0, 0
    global_step = 0
    vis.line([0], [-1], win='loss', opts=dict(title='loss'))
    vis.line([0], [-1], win='val_acc', opts=dict(title='val_acc'))
    for epoch in range(100):
        for step, (x, y) in enumerate(train_loader):
            # (The original wrapped x/y in torch.autograd.Variable, a
            # deprecated no-op since PyTorch 0.4 — dropped.)
            x = x.to(device)
            y = y.to(device)
            fcn_model.train()
            logits = fcn_model(x)
            loss = criteon(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            output = torch.sigmoid(logits)  # probabilities, for display only
            output_np = output.cpu().data.numpy().copy()
            output_np = np.argmin(output_np, axis=1)
            y_np = y.cpu().data.numpy().copy()
            y_np = np.argmin(y_np, axis=1)
            if step % 20 == 0:
                # loss.item() replaces the deprecated loss.data accessor.
                print('epoch {}, {}/{}, loss is {}'.format(
                    epoch, step, len(train_loader), loss.item()))
                vis.close(win='pred')
                vis.close(win='label')
                vis.images(output_np[:, None, :, :], win='pred', opts=dict(title='pred'))
                vis.images(y_np[:, None, :, :], win='label', opts=dict(title='label'))
                vis.line([loss.item()], [global_step], win='loss', update='append')
                global_step += 1
        if epoch % 1 == 0:
            val_acc = evalute(fcn_model, val_loader)
            if val_acc > best_acc:
                best_epoch = epoch
                best_acc = val_acc
                torch.save(fcn_model.state_dict(), 'best.mdl')
            vis.line([val_acc], [global_step], win='val_acc', update='append')
    print('best acc:', best_acc, 'best epoch:', best_epoch)
    fcn_model.load_state_dict(torch.load('best.mdl'))
    print('loaded from ckpt!')
    test_acc = evalute(fcn_model, test_loader)
    print('test acc:', test_acc)


if __name__ == '__main__':
    main()
结果分析
根据最后的输出可以看出像素准确度(PA)为0.879546875,最好的epoch为第20个epoch,最后测试集的像素准确度(PA)为0.8733546875
loss损失函数图
像素准确度(PA)变化曲线图
标签与分割结果对比图
总结
FCN 采用跨层方法,既同时兼顾全局语义信息和局部位置信息,又能从抽象特征中恢复出像素所属的类别,把图像级别的分类进一步延伸到了像素级别的分类,成功地将原本用于图像分类的网络转变为用于图像分割的网络。
FCN 在分割过程中能够恢复像素所属的类别,但是仍然存在两个问题:
①图像经过池化操作后,特征图的分辨率不断降低,部分像素的空间位置信息丢失;
②分割过程未能有效地考虑图像上下文信息,无法充分利用丰富的空间位置信息,导致局部特征和全局特征的利用率失衡。
FCN 未能有效地解决这两个问题,致使分割结果粗糙、分割边界不连续。后续可以采用优化卷积结构,添加条件随机场(CRF)等方法改进。