Image segmentation
Unlike object (target) detection, semantic segmentation identifies and understands the content of every pixel in an image: both the annotation and the prediction of semantic regions are made at the pixel level. For example, when labelling the dogs, cats and background in an image, the pixel-level borders produced by semantic segmentation are clearly finer than the bounding boxes produced by object detection.
This article surveys image segmentation methods based on deep learning. Depending on the task, image segmentation can be divided into three categories: semantic segmentation, instance segmentation, and panoptic segmentation.
Semantic Segmentation: Semantic segmentation refers to classifying pixels in an image into semantic classes. Pixels belonging to a particular class are only classified into that class without regard to other information or context.
Instance Segmentation: Instance segmentation models classify pixels according to "instances" rather than categories.
Panoptic Segmentation: Panoptic segmentation is a newly developed segmentation task that can be expressed as a combination of semantic segmentation and instance segmentation, where each instance of an object in an image is separated and the identity of the object is predicted. The difference from instance segmentation is that the entire image is segmented.
1. Semantic Segmentation
1.1 Principle and Implementation of U-Net
The implementation can be understood in terms of four components: the data reader (DataLoader), the network (Network), the loss function (Loss Function), and the training method and optimizer (Train Setting). In the code in this section, the label is an RGB image, so the predicted image also appears in different colors. Another way to build labels is to map pixel values to category indices.
Together with the architecture diagram, this shows how a UNet network is constructed. After passing through c1, c2, c3, c4 and c5, the feature map gradually becomes smaller until its size is 16×16; this process is called the encoding (Encode) process. To classify at the pixel level, the encoded feature map is upsampled so that its size grows again, and it is concatenated along the channel dimension with the encoder feature map of the same size, as shown by the gray arrows. After several such concatenations, the result is finally mapped to a probability value for pixel-level classification.
1.1.1 DataLoader
# Shared preprocessing pipeline; it currently holds a single ToTensor step.
transform = transforms.Compose([transforms.ToTensor()])
class MyDataset(Dataset):
    """Dataset of (image, segmentation label) pairs read from one root folder.

    Label files are listed from ``<path>/SegmentationClass``; each label is
    paired with the file of the same stem in ``<path>/JPEGImages``.
    """

    def __init__(self, path):
        """Store the dataset root and list the available label files.

        Args:
            path: Root directory containing the ``SegmentationClass`` and
                ``JPEGImages`` sub-directories.
        """
        self.path = path
        self.name = os.listdir(os.path.join(path, 'SegmentationClass'))

    def __len__(self):
        """Return the number of samples (one per label file)."""
        return len(self.name)

    def _paths(self, index):
        """Return ``(segment_path, image_path)`` for the sample at ``index``."""
        segment_name = self.name[index]  # e.g. xx.png
        stem, ext = os.path.splitext(segment_name)
        # Swap only the extension: str.replace('png', 'jpg') would also
        # rewrite a "png" that happens to appear inside the file stem.
        image_name = stem + '.jpg' if ext.lower() == '.png' else segment_name
        segment_path = os.path.join(self.path, 'SegmentationClass', segment_name)
        image_path = os.path.join(self.path, 'JPEGImages', image_name)
        return segment_path, image_path

    def __getitem__(self, index):
        """Load one sample and return ``(image, label)`` after ``transform``.

        Raises:
            IndexError: If ``index`` is outside the list of label files.
        """
        segment_path, image_path = self._paths(index)
        # keep_image_size_open is defined elsewhere; presumably it opens the
        # file and brings it to a common size -- TODO confirm.
        segment_image = keep_image_size_open(segment_path)
        image = keep_image_size_open(image_path)
        return transform(image), transform(segment_image)
Pay attention to the following points: the images must be brought to a uniform size, with the pixels of each image and its label corresponding one-to-one; and the data type and shape of the image and the label must match, i.e. image shape = (n, c, h, w) and label shape = (n, c, h, w).
1.1.2 Network
class UNet(nn.Module):
    """Encoder/decoder network assembled from Conv_Block, DownSample and UpSample.

    The encoder alternates five convolution blocks (c1..c5) with four
    down-sampling steps (d1..d4); the decoder alternates four up-sampling
    steps (u1..u4) with four convolution blocks (c6..c9). Each up-sampling
    step also receives the encoder output of the matching stage. A final
    3x3 convolution maps 64 channels to 3, followed by a sigmoid.

    Shape notes in ``forward`` are the ones recorded by the original author
    for an input batch of 2 images of size 256x256.
    """

    def __init__(self):
        super().__init__()
        # Encoder path.
        self.c1 = Conv_Block(3, 64)  # convolution block
        self.d1 = DownSample(64)
        self.c2 = Conv_Block(64, 128)
        self.d2 = DownSample(128)
        self.c3 = Conv_Block(128, 256)
        self.d3 = DownSample(256)
        self.c4 = Conv_Block(256, 512)
        self.d4 = DownSample(512)
        self.c5 = Conv_Block(512, 1024)
        # Decoder path.
        self.u1 = UpSample(1024)
        self.c6 = Conv_Block(1024, 512)
        self.u2 = UpSample(512)
        self.c7 = Conv_Block(512, 256)
        self.u3 = UpSample(256)
        self.c8 = Conv_Block(256, 128)
        self.u4 = UpSample(128)
        self.c9 = Conv_Block(128, 64)
        # Output head: 64 -> 3 channels, spatial size preserved.
        self.out = nn.Conv2d(
            in_channels=64,
            out_channels=3,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        self.Th = nn.Sigmoid()

    def forward(self, x):
        """Run the encoder, then the decoder, and return sigmoid outputs."""
        # Encoder: keep every stage's output for reuse in the decoder.
        skip1 = self.c1(x)                  # 2*64*256*256
        skip2 = self.c2(self.d1(skip1))     # 2*128*128*128
        skip3 = self.c3(self.d2(skip2))     # 2*256*64*64
        skip4 = self.c4(self.d3(skip3))     # 2*512*32*32
        bottom = self.c5(self.d4(skip4))    # 2*1024*16*16

        # Decoder: per the original notes, each UpSample transforms its first
        # argument and concatenates ("cat") it with the second one.
        merged = self.u1(bottom, skip4)     # 2*1024*16*16 (transformed) cat 2*512*32*32
        up1 = self.c6(merged)               # -> 2*512*32*32
        merged = self.u2(up1, skip3)        # 2*512*32*32 (transformed) cat 2*256*64*64
        up2 = self.c7(merged)               # -> 2*256*64*64
        merged = self.u3(up2, skip2)        # 2*256*64*64 (transformed) cat 2*128*128*128
        up3 = self.c8(merged)               # -> 2*128*128*128
        merged = self.u4(up3, skip1)        # 2*128*128*128 (transformed) cat 2*64*256*256
        up4 = self.c9(merged)               # -> 2*64*256*256

        # 2*64*256*256 -> 2*3*256*256 -> sigmoid() yields a probability value
        scores = self.out(up4)
        return self.Th(scores)
1.1.3 Train
# Model, optimizer and loss.
# BCELoss compares element-wise probabilities, so the network output and the
# label must have the same shape (both [2, 3, 256, 256] in the original notes).
net = UNet().to(device)
opt = optim.Adam(net.parameters())
loss_fun = nn.BCELoss()

# Total number of epochs, as announced in the progress message below.
MAX_EPOCHS = 10000

# NOTE(review): `epoch`, `device`, `data_loader`, `weight_path`, `save_path`
# and `writer` are not defined in this excerpt; they must be set up before
# this point (presumably `epoch` starts at 1 -- confirm).
while epoch <= MAX_EPOCHS:
    running_loss = 0.0
    num_batches = 0
    print('Epoch {}/{}'.format(epoch, MAX_EPOCHS))

    for i, (image, segment_image) in enumerate(data_loader):
        image, segment_image = image.to(device), segment_image.to(device)

        out_image = net(image)  # out_image.shape = [2, 3, 256, 256]
        train_loss = loss_fun(out_image, segment_image)

        opt.zero_grad()
        train_loss.backward()
        opt.step()

        running_loss += train_loss.item()
        num_batches += 1

        if i % 5 == 0:
            print(f'{epoch}-{i}-train_loss===>>{train_loss.item()}')

        if i % 100 == 0:
            # Periodically persist the weights inside the epoch.
            torch.save(net.state_dict(), weight_path)

        # Save a preview stacking input, label and prediction of the first
        # sample. NOTE(review): the original indentation was lost; this is
        # assumed to run on every iteration -- confirm.
        _image = image[0]
        _segment_image = segment_image[0]
        _out_image = out_image[0]
        img = torch.stack([_image, _segment_image, _out_image], dim=0)
        save_image(img, f'{save_path}/{i}.png')

    # Mean loss per batch for this epoch (the original divided by the epoch
    # index, which is not an average and fails when epoch == 0).
    epoch_loss = running_loss / num_batches if num_batches else 0.0
    writer.add_scalar('data/trainloss', epoch_loss, epoch)

    if epoch % 1000 == 0:
        # Saves the whole module object; the 'checkpoints' directory must
        # already exist.
        torch.save(net, 'checkpoints/model_epoch_{}.pth'.format(epoch))
        print('checkpoints/model_epoch_{}.pth saved!'.format(epoch))

    epoch += 1
When computing the loss, pay attention to the shapes of the network output and of the label, because the loss modules provided by torch.nn impose requirements on the shapes of out_image and segment_image (for example, nn.BCELoss requires the two to have the same shape).
2. Instance Segmentation
2.1 RCNN
RCNN (Regions with CNN features) is a milestone in applying convolutional neural networks to object detection. It combines the strong feature extraction and classification performance of CNNs with a region proposal method to solve the detection problem. The algorithm can be divided into three steps: candidate region selection, CNN feature extraction, and classification with boundary regression.
- Candidate region selection: Region Proposal is a traditional, heuristic region extraction method. The method used is Selective Search (SS), which looks at the existing small regions and merges the two most likely (i.e. most similar) regions, repeating this step until the whole image has been merged into a single region, and finally outputs the candidate regions. The image patch extracted for each proposal is then normalized so that it can serve as the standard input of the CNN; this can be thought of as obtaining potential target images by sliding a window. In RCNN, the number of candidates is typically 1k~2k, which can be understood as dividing the image into 1k~2k grid cells and then performing feature extraction or convolution on each cell, depending on the particular variant of the RCNN algorithm.
- CNN feature extraction: A standard convolutional neural network applies operations such as convolution and pooling to the input to obtain a fixed-dimensional output. That is, the feature maps are convolved and pooled until the final feature vector is obtained.
- Classification and boundary regression: This actually consists of two sub-steps. The first classifies the output vector of the previous step (the classifier has to be trained on these features); the second obtains a precise region through bounding-box regression (abbreviated as bbox regression). The purpose is to localize the expected objects accurately and to merge overlapping results, so that classification is completed and multiple detections of the same object are avoided. Classifier choices include support vector machines (SVM), Softmax, etc.; boundary regression choices include bbox regression, box regression with a multi-task loss function, etc.
R-CNN has three major problems: (1) the image patches corresponding to the many candidate regions have to be extracted in advance, which takes up a lot of disk space; (2) a traditional CNN requires a fixed-size input, and resizing the patches during normalization deforms the image, which is harmful, even fatal, for the CNN's feature extraction; (3) every region proposal has to be passed through the CNN separately, so the same feature extraction is repeated many times, which wastes a great deal of computation.
2.2 Faster R-CNN
Faster R-CNN is an improved version of the R-CNN architecture with two stages:
The Region Proposal Network (RPN) uses anchors and a box regression mechanism to progressively move the candidate boxes toward the ground-truth boxes.
Fast R-CNN utilizes RoIPool (Region of Interest Pool) to extract features from each candidate box and perform classification and bounding box regression. RoIPool is an operation for extracting small feature maps from each RoI in detection.
The biggest difference from rcnn is the RPN module, which greatly reduces the amount of calculation.
2.3 Mask R-CNN
Mask R-CNN principle and implementation
Mask R-CNN is built on Faster R-CNN. Faster R-CNN has two outputs for each candidate object, a class label and a bounding-box offset; Mask R-CNN adds a third branch that outputs the object mask. The additional mask output differs from the class and box outputs in that it requires extracting a much finer spatial layout of the object.
In other words, Mask R-CNN is an extension of Faster R-CNN that adds a branch for predicting an object mask on each region of interest (RoI), in parallel with the existing branch for bounding box recognition.
DataLoader
import os
import numpy as np
import torch
from PIL import Image
class PennFudanDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(image, target)`` pairs for instance segmentation.

    Images are read from ``<root>/PNGImages`` and instance masks from
    ``<root>/PedMasks``. In a mask, value 0 is background and every other
    distinct value marks one object instance.
    """

    def __init__(self, root, transforms):
        """List the image and mask files under ``root``.

        Args:
            root: Dataset root containing ``PNGImages`` and ``PedMasks``.
            transforms: Optional callable ``(img, target) -> (img, target)``
                applied to every sample, or ``None``.
        """
        self.root = root
        self.transforms = transforms
        # Sort both listings so the i-th image lines up with the i-th mask.
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    @staticmethod
    def _decode_instances(mask):
        """Split a colour-encoded mask array into per-instance masks and boxes.

        Args:
            mask: Array in which 0 is background and every other distinct
                value identifies one instance.

        Returns:
            ``(masks, boxes)``: a boolean array with one binary mask per
            instance, and a float32 tensor of shape ``[N, 4]`` holding
            ``[xmin, ymin, xmax, ymax]`` for each instance.
        """
        obj_ids = np.unique(mask)
        # Drop the background id explicitly; slicing off the first id would
        # discard a real instance when the mask has no background pixel.
        obj_ids = obj_ids[obj_ids != 0]
        # Broadcast to one binary mask per instance id.
        masks = mask == obj_ids[:, None, None]

        boxes = []
        for instance in masks:
            rows, cols = np.where(instance)
            boxes.append(
                [int(cols.min()), int(rows.min()), int(cols.max()), int(rows.max())]
            )
        # Force [N, 4] so that an image without instances yields shape
        # (0, 4) instead of (0,), which would break column indexing.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(len(boxes), 4)
        return masks, boxes

    def __getitem__(self, idx):
        """Load sample ``idx`` and build its target dictionary.

        Returns:
            ``(img, target)`` where ``target`` has the keys ``boxes``,
            ``labels``, ``masks``, ``image_id``, ``area`` and ``iscrowd``.
        """
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # The mask is deliberately NOT converted to RGB: each colour
        # corresponds to a different instance, with 0 being background.
        mask = np.array(Image.open(mask_path))

        masks, boxes = self._decode_instances(mask)
        num_objs = masks.shape[0]

        target = {
            "boxes": boxes,
            # there is only one class
            "labels": torch.ones((num_objs,), dtype=torch.int64),
            "masks": torch.as_tensor(masks, dtype=torch.uint8),
            "image_id": torch.tensor([idx]),
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            # suppose all instances are not crowd
            "iscrowd": torch.zeros((num_objs,), dtype=torch.int64),
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.imgs)
- image: a PIL Image of size (H, W)
- target: a dict containing the following fields
  - boxes (FloatTensor[N, 4]): the coordinates of the N bounding boxes in [x0, y0, x1, y1] format, ranging from 0 to W and 0 to H
  - labels (Int64Tensor[N]): the label for each bounding box. 0 always represents the background class.
  - image_id (Int64Tensor[1]): an image identifier. It should be unique between all the images in the dataset, and is used during evaluation
  - area (Tensor[N]): The area of the bounding box. This is used during evaluation with the COCO metric, to separate the metric scores between small, medium and large boxes.
  - iscrowd (UInt8Tensor[N]): instances with iscrowd=True will be ignored during evaluation.
  - (optionally) masks (UInt8Tensor[N, H, W]): The segmentation masks for each one of the objects
Network
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
def get_model_instance_segmentation(num_classes):
    """Build a Mask R-CNN (ResNet-50 FPN) whose heads predict ``num_classes``.

    The model is created with ``weights="DEFAULT"`` (COCO pre-training, per
    the original note); its box predictor and mask predictor are then
    replaced by new heads sized for ``num_classes``.

    Args:
        num_classes: Number of output classes for the new heads.

    Returns:
        The model with both predictors replaced.
    """
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")
    heads = model.roi_heads

    # Box head: reuse the input width of the existing classifier.
    box_in_features = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # Mask head: reuse the input channels of the existing mask classifier,
    # with a 256-channel hidden layer.
    mask_in_channels = heads.mask_predictor.conv5_mask.in_channels
    mask_hidden = 256
    heads.mask_predictor = MaskRCNNPredictor(mask_in_channels, mask_hidden, num_classes)

    return model
Train
from engine import train_one_epoch, evaluate
import utils
def main():
    """Train and evaluate the instance segmentation model on PennFudanPed.

    Builds train/test splits (the last 50 shuffled indices are held out),
    fine-tunes the model for 10 epochs with SGD and a step learning-rate
    schedule, and evaluates on the test split after every epoch.
    """
    # Prefer the GPU, fall back to the CPU when none is available.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # Two classes only: background and person.
    num_classes = 2

    # Same files, different transforms for training and testing.
    train_full = PennFudanDataset('PennFudanPed', get_transform(train=True))
    test_full = PennFudanDataset('PennFudanPed', get_transform(train=False))

    # One shared permutation keeps the two splits disjoint.
    shuffled = torch.randperm(len(train_full)).tolist()
    dataset = torch.utils.data.Subset(train_full, shuffled[:-50])
    dataset_test = torch.utils.data.Subset(test_full, shuffled[-50:])

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=2,
        shuffle=True,
        num_workers=4,
        collate_fn=utils.collate_fn,
    )
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=1,
        shuffle=False,
        num_workers=4,
        collate_fn=utils.collate_fn,
    )

    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    # Optimise only the parameters that require gradients.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        trainable, lr=0.005, momentum=0.9, weight_decay=0.0005
    )
    # Multiply the learning rate by 0.1 every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=3, gamma=0.1
    )

    num_epochs = 10
    for epoch in range(num_epochs):
        # Train for one epoch, printing every 10 iterations.
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        lr_scheduler.step()
        # Evaluate on the held-out split.
        evaluate(model, data_loader_test, device=device)

    print("That's it!")
Refer to this link for the full version of the above code
difference detection
Difference detection (also called change detection) is similar to semantic segmentation in that classification is performed at the pixel level; the difference is that it has only two classes: pixels that have changed and pixels that have not changed.