项目目录:
制作自己的数据集
运行&解析小训练脚本
解析Mask-RCNN:
了解mask-rcnn
上一节我们已经介绍了如何使用代码进行训练和验证,接下来进一步了解代码的结构。
进入mask-rcnn\libraries\mrcnn\model.py:
这里的代码量极大,差不多有3000行,全部看懂相当费劲。
我们先从主类Mask-RCNN看起
参数集成在config
model_dir 是保存训练日志和训练权重的目录;set_log_dir() 在构造函数中被调用,用于设置日志保存路径。
class MaskRCNN():
"""Encapsulates the Mask RCNN model functionality.
The actual Keras model is in the keras_model property.
"""
def __init__(self, mode, config, model_dir):
    """Store the run settings and build the underlying Keras model.

    mode: Either "training" or "inference"; any other value fails the
        assertion below.
    config: A sub-class of the Config class. It is kept on the instance
        and handed to build(), which reads the model parameters from it.
    model_dir: Directory to save training logs and trained weights.
    """
    # Reject anything other than the two supported modes.
    assert mode in ('training', 'inference')
    self.mode, self.config, self.model_dir = mode, config, model_dir
    # set_log_dir() runs after model_dir is stored and before the network
    # is built, the same order as the individual statements it replaces.
    self.set_log_dir()
    self.keras_model = self.build(mode=mode, config=config)
接主:
判断输入图片维度是否符合
def build(self, mode, config):
"""Build Mask R-CNN architecture.
构建蒙版R-CNN架构。
input_shape: The shape of the input image.
输入图像的形状。
mode: Either "training" or "inference". The inputs and
outputs of the model differ accordingly.
取值为“训练”或“推理”;模型的输入和输出会随之不同。
"""
# 如果不是训练或者测试抛出异常
assert mode in ['training', 'inference']
# Image size must be dividable by 2 multiple times
# 要求图片的高和宽都能被 2^6=64 整除(并不要求是 2 的幂),保证多次下采样/上采样时不产生小数
h, w = config.IMAGE_SHAPE[:2]
if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
raise Exception("Image size must be dividable by 2 at least 6 times "
"to avoid fractions when downscaling and upscaling."
"For example, use 256, 320, 384, 448, 512, ... etc. ")
接主:
初始化输入张量;如果是 training 模式,还要初始化训练相关的输入张量
# Inputs
'''
下面是keras的初始化张量
注意keras的习惯不同于placeholder,上面代码的shape没有包含batch
'''
# input_image 输入图片维度
# input_image_meta 图片的信息(包含形状、预处理信息等) 16
input_image = KL.Input(
shape=[None, None, 3], name="input_image")
input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
name="input_image_meta")
if mode == "training":
# input_rpn_match、input_rpn_bbox 计算loss时用到
input_rpn_match = KL.Input(
shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
input_rpn_bbox = KL.Input(
shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)
# 检测GT(类id、锚框和掩码)
input_gt_class_ids = KL.Input(
shape=[None], name="input_gt_class_ids", dtype=tf.int32)
# 2. GT box(像素)(零填充)占位
# [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates MAX_GT_INSTANCES=100
input_gt_boxes = KL.Input(
shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
# 正常化坐标 (零填充)占位
gt_boxes = KL.Lambda(lambda x: norm_boxes_graph(
x, K.shape(input_image)[1:3]))(input_gt_boxes)
# 3. GT 掩码(零填充)
# 当 USE_MINI_MASK 是True时用的是 [56,56] 否 [512, 512]图片原大小 这个是最小的描边框
if config.USE_MINI_MASK:
input_gt_masks = KL.Input(
shape=[config.MINI_MASK_SHAPE[0],
config.MINI_MASK_SHAPE[1], None],
name="input_gt_masks", dtype=bool)
else:
input_gt_masks = KL.Input(
shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
name="input_gt_masks", dtype=bool)
接主:
如果是inference则执行这段代码
# 如果不是训练执行测试
elif mode == "inference":
# input_anchors 锚框,[batch, None, 4]
# Anchors in normalized coordinates
# 归一化坐标中的锚
input_anchors = KL.Input(shape=[None, 4], name="input_anchors")
接主:
ResNet 系列常见的有 ResNet50、ResNet101、ResNet152、ResNet200,但这里的 resnet_graph 只支持 resnet50 和 resnet101(见下方函数里的 assert)
BACKBONE就是用来选择网络的
下面是网络部分我不做网络详解
# Build the shared convolutional layers.
# 构建共享的卷积层。
# Bottom-up Layers
# 自底向上的层次
# Returns a list of the last layers of each stage, 5 in total.
# 返回每个阶段的最后一层的列表,总共5层。
# Don't create the thead (stage 5), so we pick the 4th item in the list.
# 深度残差网络ResNet,分别有50,101,152,200层
# 接下来以 config.BACKBONE = resnet101 为例,进入 resnet_graph 查看网络结构
_, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE,
stage5=True, train_bn=config.TRAIN_BN)
# Top-down Layers
# 自顶向下的层次
# TODO: add assert to varify feature map sizes match what's in config
# 待办:添加断言,验证特征图的大小与配置中的设置一致
P5 = KL.Conv2D(256, (1, 1), name='fpn_c5p5')(C5)
P4 = KL.Add(name="fpn_p4add")([
KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
KL.Conv2D(256, (1, 1), name='fpn_c4p4')(C4)])
P3 = KL.Add(name="fpn_p3add")([
KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
KL.Conv2D(256, (1, 1), name='fpn_c3p3')(C3)])
P2 = KL.Add(name="fpn_p2add")([
KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
KL.Conv2D(256, (1, 1), name='fpn_c2p2')(C2)])
# Attach 3x3 conv to all P layers to get the final feature maps.
# 将3x3 conv附加到所有P层上,得到最终的特征图。
P2 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p2")(P2)
P3 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p3")(P3)
P4 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p4")(P4)
P5 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p5")(P5)
# P6 is used for the 5th anchor scale in RPN. Generated by
# P6是RPN中第5个锚标。生成的
# subsampling from P5 with stride of 2.
# 步长为2的P5子采样。
P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)
# Note that P6 is used in RPN, but not in the classifier heads.
# 注意,P6在RPN中使用,但不在分类器头部中使用。
# 其中rpn_feature_maps对应图中的实线输出,送入RPN网络分类/回归得到锚框的前景/背景鉴别结果
# 而mrcnn_feature_maps则是后面进行ROI Align时的切割目标
rpn_feature_maps = [P2, P3, P4, P5, P6]
mrcnn_feature_maps = [P2, P3, P4, P5]
跳转 resnet_graph :
def resnet_graph(input_image, architecture, stage5=False, train_bn=True):
    """Assemble the ResNet backbone and collect the output of every stage.

    input_image: Tensor fed into the first (zero-padding) layer.
    architecture: "resnet50" or "resnet101"; any other value fails the
        assertion below. It only changes how many identity blocks stage 4
        gets (5 vs. 22).
    stage5: Boolean. If False, stage 5 is not created and None is returned
        in its place.
    train_bn: Boolean. Train or freeze Batch Norm layers; forwarded as
        `training` to BatchNorm and as `train_bn` to every conv_block /
        identity_block call.

    Returns the list [C1, C2, C3, C4, C5] holding the tensor produced at
    the end of each stage.
    """
    assert architecture in ["resnet50", "resnet101"]

    # Stage 1: pad -> 7x7/2 conv -> batch norm -> relu -> 3x3/2 max pool.
    net = KL.ZeroPadding2D((3, 3))(input_image)
    net = KL.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(net)
    net = BatchNorm(name='bn_conv1')(net, training=train_bn)
    net = KL.Activation('relu')(net)
    net = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(net)
    C1 = net

    # Stage 2: conv block 'a' (stride 1) followed by identity blocks 'b', 'c'.
    net = conv_block(net, 3, [64, 64, 256], stage=2, block='a',
                     strides=(1, 1), train_bn=train_bn)
    for suffix in ('b', 'c'):
        net = identity_block(net, 3, [64, 64, 256], stage=2, block=suffix,
                             train_bn=train_bn)
    C2 = net

    # Stage 3: conv block 'a' followed by identity blocks 'b'..'d'.
    net = conv_block(net, 3, [128, 128, 512], stage=3, block='a',
                     train_bn=train_bn)
    for suffix in ('b', 'c', 'd'):
        net = identity_block(net, 3, [128, 128, 512], stage=3, block=suffix,
                             train_bn=train_bn)
    C3 = net

    # Stage 4: the only stage whose depth depends on the architecture.
    net = conv_block(net, 3, [256, 256, 1024], stage=4, block='a',
                     train_bn=train_bn)
    identity_count = {"resnet50": 5, "resnet101": 22}[architecture]
    for offset in range(identity_count):
        # Block suffixes continue from 'b' (chr(98)) onwards.
        net = identity_block(net, 3, [256, 256, 1024], stage=4,
                             block=chr(ord('b') + offset), train_bn=train_bn)
    C4 = net

    # Stage 5 is optional; C5 stays None when it is skipped.
    C5 = None
    if stage5:
        net = conv_block(net, 3, [512, 512, 2048], stage=5, block='a',
                         train_bn=train_bn)
        for suffix in ('b', 'c'):
            net = identity_block(net, 3, [512, 512, 2048], stage=5,
                                 block=suffix, train_bn=train_bn)
        C5 = net

    return [C1, C2, C3, C4, C5]
回到主函数:
我们看到有这两个 rpn_feature_maps mrcnn_feature_maps
rpn_feature_maps把网络聚集在一起,对应这个:
mrcnn_feature_maps把网络聚集在一起,对应这个:
接主:
# Anchors
if mode == "training":
anchors = self.get_anchors(config.IMAGE_SHAPE)
# Duplicate across the batch dimension because Keras requires it
# TODO: can this be optimized to avoid duplicating the anchors?
# np.broadcast_to 批量复制维度 因为Keras需要它
anchors = np.broadcast_to(anchors, (config.BATCH_SIZE,) + anchors.shape)
# A hack to get around Keras's bad support for constants
'''
keras的Module不能接收tf的Tensor作为数据流,所有需要使用KL.Lambda将之转化为keras的数据流,
如下这样将tf写好的函数输出直接转换为keras的Module可以接收的类型
'''
anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
else:
anchors = input_anchors
接主:
对应的层级,输出对应的结果
'''
拿到三种值:
rpn_logits: [batch, H, W, 2] Anchor classifier logits (before softmax)
背景和前景的二分类的log
rpn_probs: [batch, W, W, 2] Anchor classifier probabilities.
背景和前景的二分类的prob
rpn_bbox: [batch, H, W, (dy, dx, log(dh), log(dw))] Deltas to beapplied to anchors.
回归结果锚框的位置
'''
rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE, # 1 3 256
len(config.RPN_ANCHOR_RATIOS), 256)
# Loop through pyramid layers
layer_outputs = [] # list of lists
for p in rpn_feature_maps:
layer_outputs.append(rpn([p]))
# Concatenate layer outputs
# 连接层的输出
# Convert from list of lists of level outputs to list of lists
# 将级别输出列表列表转换为列表列表
# of outputs across levels.
# 跨级别的输出。
# e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
outputs = list(zip(*layer_outputs))
outputs = [KL.Concatenate(axis=1, name=n)(list(o))
for o, n in zip(outputs, output_names)]
# # 保存各pyramid特征经过RPN之后的结果
rpn_class_logits, rpn_class, rpn_bbox = outputs
跳转build_rpn_model:
# 输入的值 1 3 256
def build_rpn_model(anchor_stride, anchors_per_location, depth):
    """Build a Keras model of the Region Proposal Network.

    Wrapping rpn_graph in a Model lets the same RPN be applied several
    times with shared weights.

    anchor_stride: Controls the density of anchors. Typically 1 (anchors
        for every pixel in the feature map) or 2 (every other pixel).
        Passed straight through to rpn_graph.
    anchors_per_location: Number of anchors per pixel in the feature map.
        Passed straight through to rpn_graph.
    depth: Depth (channel count) of the backbone feature map; used as the
        last dimension of the model's input placeholder.

    Returns a Keras Model named "rpn_model". Its outputs are whatever
    rpn_graph returns for the placeholder input, i.e. the list
    [rpn_class_logits, rpn_probs, rpn_bbox].
    """
    # Height and width are left as None in the placeholder shape.
    feature_map_in = KL.Input(name="input_rpn_feature_map",
                              shape=[None, None, depth])
    rpn_outputs = rpn_graph(feature_map_in, anchors_per_location, anchor_stride)
    return KM.Model([feature_map_in], rpn_outputs, name="rpn_model")
进一步跳转 rpn_graph:
def rpn_graph(feature_map, anchors_per_location, anchor_stride):
    """Build the computation graph of the Region Proposal Network.

    feature_map: Backbone features [batch, height, width, depth].
    anchors_per_location: Number of anchors per pixel in the feature map.
    anchor_stride: Controls the density of anchors. Typically 1 (anchors
        for every pixel in the feature map) or 2 (every other pixel). Used
        as the stride of the shared 3x3 convolution.

    Returns a list of three tensors:
        rpn_class_logits: [batch, anchors, 2] anchor classifier logits
            (before softmax), background / foreground.
        rpn_probs: [batch, anchors, 2] anchor classifier probabilities.
        rpn_bbox: [batch, anchors, 4] box refinement values. The original
            notes describe them as (dy, dx, log(dh), log(dw)) deltas to be
            applied to anchors; the order is not visible in this function.
    """
    # TODO: check if stride of 2 causes alignment issues if the feature map
    # is not even.

    # Shared convolutional base of the RPN.
    shared_conv = KL.Conv2D(512, (3, 3), padding='same', activation='relu',
                            strides=anchor_stride, name='rpn_conv_shared')
    shared = shared_conv(feature_map)

    # Anchor scores: [batch, height, width, anchors per location * 2].
    class_conv = KL.Conv2D(2 * anchors_per_location, (1, 1), padding='valid',
                           activation='linear', name='rpn_class_raw')
    class_raw = class_conv(shared)

    # Flatten the spatial and anchor axes: [batch, anchors, 2].
    to_class_pairs = KL.Lambda(
        lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))
    rpn_class_logits = to_class_pairs(class_raw)

    # Softmax over the BG/FG pair.
    softmax = KL.Activation("softmax", name="rpn_class_xxx")
    rpn_probs = softmax(rpn_class_logits)

    # Box refinement: [batch, height, width, anchors per location * 4].
    bbox_conv = KL.Conv2D(anchors_per_location * 4, (1, 1), padding="valid",
                          activation='linear', name='rpn_bbox_pred')
    bbox_raw = bbox_conv(shared)

    # Flatten the spatial and anchor axes: [batch, anchors, 4].
    to_box_rows = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))
    rpn_bbox = to_box_rows(bbox_raw)

    return [rpn_class_logits, rpn_probs, rpn_bbox]