Mask R-CNN Explained in Detail (Annotated Code)

I. Before studying Mask R-CNN, it is recommended to first review Faster R-CNN (see the faster_rcnn code walkthrough).

Key techniques in Mask R-CNN:

1. Multi-scale detection built on FPN (Feature Pyramid Networks); the FPN idea predates YOLOv3, which later adopted a similar multi-scale scheme

2. RPN (Region Proposal Network)

3. ROI Align (a bilinear-sampling sketch follows this list)
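The core of ROI Align is bilinear sampling: each sampling point keeps fractional coordinates instead of being quantized to the feature-map grid the way ROI Pooling does. Below is a minimal numpy sketch of that interpolation (an illustration only, not the repo's implementation, which lives in roialign/roi_align):

import numpy as np

def bilinear_sample(feature, y, x):
    # sample feature at fractional (y, x) with bilinear interpolation
    y0, x0 = int(np.floor(y)), int(np.floor(x))
    y1, x1 = min(y0 + 1, feature.shape[0] - 1), min(x0 + 1, feature.shape[1] - 1)
    dy, dx = y - y0, x - x0
    return (feature[y0, x0] * (1 - dy) * (1 - dx) +
            feature[y0, x1] * (1 - dy) * dx +
            feature[y1, x0] * dy * (1 - dx) +
            feature[y1, x1] * dy * dx)

feat = np.arange(25, dtype=np.float32).reshape(5, 5)
print(bilinear_sample(feat, 1.5, 2.25))  # 9.75, between the neighboring grid values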

II. For a systematic walkthrough of Mask R-CNN, see the video explanation on Bilibili.

III. Annotated source code

model.py

"""
Mask R-CNN
The main Mask R-CNN model implemenetation.
Copyright (c) 2017 Matterport, Inc.
Licensed under the MIT License (see LICENSE for details)
Written by Waleed Abdulla
"""import datetime
import math
import os
import random
import reimport numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variableimport utils
import visualize
#from nms.nms_wrapper import nms
from roialign.roi_align.crop_and_resize import CropAndResizeFunction############################################################
# nms
############################################################
# boxes=np.array([[100,100,210,210,0.72],# [250,250,420,420,0.8],# [220,220,320,330,0.92],# [100,100,210,210,0.72],# [230,240,325,330,0.81],# [220,230,315,340,0.9]]) 
def nms(dets, thresh):
    # dets: (m, 5) boxes with scores; thresh: scalar IoU threshold
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    areas = (y2 - y1 + 1) * (x2 - x1 + 1)
    scores = dets[:, 4]
    keep = []
    index = scores.argsort()[::-1]  # indices sorted by score, descending
    while index.size > 0:
        i = index[0]  # the first is always the biggest score, so keep it directly
        keep.append(i)
        x11 = np.maximum(x1[i], x1[index[1:]])  # corners of the overlap region
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22 - x11 + 1)  # width of the overlap
        h = np.maximum(0, y22 - y11 + 1)  # height of the overlap
        overlaps = w * h
        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
        idx = np.where(ious <= thresh)[0]
        index = index[idx + 1]  # +1 because the comparisons start at index[1:]
    return keep
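# A minimal usage sketch (added for illustration; not part of the original
# model.py): running _demo_nms() checks nms() on the example boxes commented
# out above, assuming the [x1, y1, x2, y2, score] layout used there.
def _demo_nms():
    boxes = np.array([[100, 100, 210, 210, 0.72],
                      [250, 250, 420, 420, 0.80],
                      [220, 220, 320, 330, 0.92],
                      [100, 100, 210, 210, 0.72],
                      [230, 240, 325, 330, 0.81],
                      [220, 230, 315, 340, 0.90]])
    print(nms(boxes, 0.7))  # -> [2, 1, 3]: the duplicate and heavy overlaps are suppressed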
# import matplotlib.pyplot as plt
# def plot_bbox(dets, c='k'):
#     x1 = dets[:,0]
#     y1 = dets[:,1]
#     x2 = dets[:,2]
#     y2 = dets[:,3]
#     plt.plot([x1,x2], [y1,y1], c)
#     plt.plot([x1,x1], [y1,y2], c)
#     plt.plot([x1,x2], [y2,y2], c)
#     plt.plot([x2,x2], [y1,y2], c)
#     plt.title("after nms")

############################################################
#  Logging Utility Functions
############################################################

def log(text, array=None):
    """Prints a text message. And, optionally, if a Numpy array is provided
    it prints its shape, min, and max values.
    """
    if array is not None:
        text = text.ljust(25)
        # ljust returns the string left-justified, padded with spaces to the given
        # length; if the string is already longer, it is returned unchanged
        text += ("shape: {:20}  min: {:10.5f}  max: {:10.5f}".format(
            str(array.shape),                   # turn the shape tuple into a string, (m,n) -> '(m,n)'
            array.min() if array.size else "",  # array.size is the element count m*n
            array.max() if array.size else ""))
    print(text)


def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    """Call in a loop to create a terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\n')
    # Print a new line on completion
    if iteration == total:
        print()
############################################################
#  Pytorch Utility Functions
############################################################

def unique1d(tensor):
    if tensor.size()[0] == 0 or tensor.size()[0] == 1:
        return tensor
    tensor = tensor.sort()[0]                # sort the 1-D tensor
    unique_bool = tensor[1:] != tensor[:-1]  # True where an element differs from its predecessor
    first_element = Variable(torch.ByteTensor([True]), requires_grad=False).bool()
    if tensor.is_cuda:
        first_element = first_element.cuda()
    unique_bool = torch.cat((first_element, unique_bool), dim=0)
    return tensor[unique_bool.data]


# Intersection of two 1-D row vectors
def intersect1d(tensor1, tensor2):
    assert len(tensor1.shape) == 1 and len(tensor2.shape) == 1 and \
        len(tensor1) > 1 and len(tensor2) > 1, "inputs must be 1-D with length > 1"
    aux = torch.cat((tensor1, tensor2), dim=0)
    aux = aux.sort()[0]
    return aux[:-1][(aux[1:] == aux[:-1]).data]


def log2(x):
    """Implementation of log2. PyTorch doesn't have a native implementation."""
    ln2 = Variable(torch.log(torch.FloatTensor([2.0])), requires_grad=False)
    if x.is_cuda:
        ln2 = ln2.cuda()
    return torch.log(x) / ln2


class SamePad2d(nn.Module):
    """Mimics tensorflow's 'SAME' padding."""

    def __init__(self, kernel_size, stride):
        super(SamePad2d, self).__init__()
        self.kernel_size = torch.nn.modules.utils._pair(kernel_size)  # turn the input into a pair tuple
        self.stride = torch.nn.modules.utils._pair(stride)

    def forward(self, input):
        # input: (batch, c, h, w)
        in_width = input.size()[3]
        in_height = input.size()[2]
        out_width = math.ceil(float(in_width) / float(self.stride[0]))   # round up
        out_height = math.ceil(float(in_height) / float(self.stride[1]))
        pad_along_width = ((out_width - 1) * self.stride[0] +
                           self.kernel_size[0] - in_width)
        pad_along_height = ((out_height - 1) * self.stride[1] +
                            self.kernel_size[1] - in_height)
        # derived by solving out_w = (in_w + pad - k + s) / s for pad
        pad_left = math.floor(pad_along_width / 2)  # round down
        pad_top = math.floor(pad_along_height / 2)
        pad_right = pad_along_width - pad_left
        pad_bottom = pad_along_height - pad_top
        return F.pad(input, (pad_left, pad_right, pad_top, pad_bottom), 'constant', 0)

    def __repr__(self):
        return self.__class__.__name__
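# A minimal check (added for illustration, not part of the original file):
# 'SAME' padding plus a 3x3 / stride-1 convolution preserves the spatial size,
# which is exactly what the FPN and head layers below rely on.
def _demo_same_pad():
    x = torch.randn(1, 256, 64, 64)
    pad = SamePad2d(kernel_size=3, stride=1)
    conv = nn.Conv2d(256, 256, kernel_size=3, stride=1)
    print(conv(pad(x)).shape)  # torch.Size([1, 256, 64, 64])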
############################################################
#  FPN Graph
############################################################

class TopDownLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(TopDownLayer, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)  # compress channels
        self.padding2 = SamePad2d(kernel_size=3, stride=1)  # zero-pad around the feature map
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1)

    def forward(self, x, y):
        y = F.upsample(y, scale_factor=2)        # double y's height and width
        x = self.conv1(x)                        # adjust x's channel count
        return self.conv2(self.padding2(x + y))  # fuse x and y, pad, then convolve so the size is unchanged


class FPN(nn.Module):
    def __init__(self, C1, C2, C3, C4, C5, out_channels):
        super(FPN, self).__init__()
        self.out_channels = out_channels  # every level outputs the same channel count
        self.C1 = C1  # backbone stages
        self.C2 = C2
        self.C3 = C3
        self.C4 = C4
        self.C5 = C5
        self.P6 = nn.MaxPool2d(kernel_size=1, stride=2)
        self.P5_conv1 = nn.Conv2d(2048, self.out_channels, kernel_size=1, stride=1)
        self.P5_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),  # pad first so the 3x3 conv below keeps the size
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),  # removes upsampling aliasing
        )
        self.P4_conv1 = nn.Conv2d(1024, self.out_channels, kernel_size=1, stride=1)
        self.P4_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P3_conv1 = nn.Conv2d(512, self.out_channels, kernel_size=1, stride=1)
        self.P3_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P2_conv1 = nn.Conv2d(256, self.out_channels, kernel_size=1, stride=1)
        self.P2_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )

    def forward(self, x):
        x = self.C1(x)
        x = self.C2(x)
        c2_out = x
        x = self.C3(x)
        c3_out = x
        x = self.C4(x)
        c4_out = x
        x = self.C5(x)
        p5_out = self.P5_conv1(x)
        p4_out = self.P4_conv1(c4_out) + F.upsample(p5_out, scale_factor=2)
        p3_out = self.P3_conv1(c3_out) + F.upsample(p4_out, scale_factor=2)
        p2_out = self.P2_conv1(c2_out) + F.upsample(p3_out, scale_factor=2)
        p5_out = self.P5_conv2(p5_out)
        p4_out = self.P4_conv2(p4_out)
        p3_out = self.P3_conv2(p3_out)
        p2_out = self.P2_conv2(p2_out)
        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with a stride of 2.
        p6_out = self.P6(p5_out)
        return [p2_out, p3_out, p4_out, p5_out, p6_out]
############################################################
#  Resnet Graph
############################################################

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride)
        self.bn1 = nn.BatchNorm2d(planes, eps=0.001, momentum=0.01)
        self.padding2 = SamePad2d(kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(planes, eps=0.001, momentum=0.01)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1)
        self.bn3 = nn.BatchNorm2d(planes * 4, eps=0.001, momentum=0.01)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        # if x is (n, inplanes, h, w):
        out = self.conv1(x)       # compress channels        (n, planes, h, w)
        out = self.bn1(out)       # normalize                (n, planes, h, w)
        out = self.relu(out)      # non-linearity            (n, planes, h, w)
        out = self.padding2(out)  # add padding              (n, planes, h+pad, w+pad)
        out = self.conv2(out)     # conv restores the size   (n, planes, h, w)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)     # (n, 4*planes, h, w)
        out = self.bn3(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, architecture, stage5=False):
        super(ResNet, self).__init__()
        assert architecture in ["resnet50", "resnet101"]
        self.inplanes = 64
        self.layers = [3, 4, {"resnet50": 6, "resnet101": 23}[architecture], 3]
        self.block = Bottleneck
        self.stage5 = stage5
        self.C1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64, eps=0.001, momentum=0.01),
            nn.ReLU(inplace=True),
            SamePad2d(kernel_size=3, stride=2),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.C2 = self.make_layer(self.block, 64, self.layers[0])
        self.C3 = self.make_layer(self.block, 128, self.layers[1], stride=2)
        self.C4 = self.make_layer(self.block, 256, self.layers[2], stride=2)
        if self.stage5:
            self.C5 = self.make_layer(self.block, 512, self.layers[3], stride=2)
        else:
            self.C5 = None

    def forward(self, x):
        x = self.C1(x)
        x = self.C2(x)
        x = self.C3(x)
        x = self.C4(x)
        x = self.C5(x)
        return x

    def stages(self):
        return [self.C1, self.C2, self.C3, self.C4, self.C5]

    def make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes * block.expansion, eps=0.001, momentum=0.01),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)
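# A shape-check sketch (added for illustration; assumes a 1024x1024 input as in
# the shape comments later in this post): wire the ResNet-101 stages into the
# FPN above and print each pyramid level's size.
def _demo_fpn_shapes():
    resnet = ResNet("resnet101", stage5=True)
    fpn = FPN(*resnet.stages(), out_channels=256)
    outs = fpn(Variable(torch.randn(1, 3, 1024, 1024), volatile=True))
    for name, o in zip(["P2", "P3", "P4", "P5", "P6"], outs):
        print(name, o.size())  # P2 (1,256,256,256) ... P6 (1,256,16,16)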
############################################################
#  Proposal Layer
############################################################

def apply_box_deltas(boxes, deltas):
    """Applies the given deltas to the given boxes.
    boxes: [N, 4] where each row is y1, x1, y2, x2
    deltas: [N, 4] where each row is [dy, dx, log(dh), log(dw)]
    """
    # Convert to y, x, h, w
    height = boxes[:, 2] - boxes[:, 0]
    width = boxes[:, 3] - boxes[:, 1]
    center_y = boxes[:, 0] + 0.5 * height
    center_x = boxes[:, 1] + 0.5 * width
    # Apply deltas
    center_y += deltas[:, 0] * height
    center_x += deltas[:, 1] * width
    height *= torch.exp(deltas[:, 2])
    width *= torch.exp(deltas[:, 3])
    # Convert back to y1, x1, y2, x2
    y1 = center_y - 0.5 * height
    x1 = center_x - 0.5 * width
    y2 = y1 + height
    x2 = x1 + width
    result = torch.stack([y1, x1, y2, x2], dim=1)
    return result


def clip_boxes(boxes, window):
    """boxes: [N, 4] each col is y1, x1, y2, x2
    window: [4] in the form y1, x1, y2, x2
    """
    boxes = torch.stack(
        [boxes[:, 0].clamp(float(window[0]), float(window[2])),
         boxes[:, 1].clamp(float(window[1]), float(window[3])),
         boxes[:, 2].clamp(float(window[0]), float(window[2])),
         boxes[:, 3].clamp(float(window[1]), float(window[3]))], 1)
    return boxes


def proposal_layer(inputs, proposal_count, nms_threshold, anchors, config=None):
    """Receives anchor scores and selects a subset to pass as proposals
    to the second stage. Filtering is done based on anchor scores and
    non-max suppression to remove overlaps. It also applies bounding
    box refinement deltas to anchors.

    anchors: (anchors_num, 4), in image-pixel coordinates

    Inputs:
        rpn_probs: [batch, anchors_num, (bg prob, fg prob)]
        rpn_bbox: [batch, anchors_num, (dy, dx, log(dh), log(dw))]

    Returns:
        Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
    """
    # Currently only supports batchsize 1
    # inputs is a list [rpn_probs, rpn_bbox]:
    # inputs[0] is rpn_probs (batch, anchors_num, 2)
    # inputs[1] is rpn_bbox  (batch, anchors_num, 4)
    inputs[0] = inputs[0].squeeze(0)  # drop the batch dimension; only one image per batch is supported
    inputs[1] = inputs[1].squeeze(0)

    # Box scores. Use the foreground class confidence. [num_rois]
    scores = inputs[0][:, 1]

    # Box deltas [num_rois, 4]: the first-stage predictions for this image
    deltas = inputs[1]
    # RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
    std_dev = Variable(torch.from_numpy(np.reshape(config.RPN_BBOX_STD_DEV, [1, 4])).float(), requires_grad=False)
    if config.GPU_COUNT:  # GPU_COUNT=1 means use the GPU, 0 means use the CPU
        std_dev = std_dev.cuda()
    deltas = deltas * std_dev

    # Improve performance by trimming to top anchors by score
    # and doing the rest on the smaller subset.
    pre_nms_limit = min(6000, anchors.size()[0])  # how many anchors to keep before NMS
    scores, order = scores.sort(descending=True)  # sort scores descending; order holds the original indices
    order = order[:pre_nms_limit]                 # top pre_nms_limit indices
    scores = scores[:pre_nms_limit]               # (pre_nms_limit,)
    deltas = deltas[order.data, :]  # TODO: Support batch size > 1 ff.  # deltas of the top-scoring anchors
    anchors = anchors[order.data, :]              # likewise for the anchors

    # Apply deltas to anchors to get refined anchors: [N, (y1, x1, y2, x2)]
    boxes = apply_box_deltas(anchors, deltas)  # correct the anchors with the predicted offsets;
    # the result holds corner coordinates, (pre_nms_limit, (y1, x1, y2, x2))

    # Clip to image boundaries. [N, (y1, x1, y2, x2)]
    height, width = config.IMAGE_SHAPE[:2]  # image bounds
    window = np.array([0, 0, height, width]).astype(np.float32)
    boxes = clip_boxes(boxes, window)  # constrain the boxes to lie inside the image

    # Filter out small boxes
    # According to Xinlei Chen's paper, this reduces detection accuracy
    # for small objects, so we're skipping it.

    # Non-max suppression
    # torch.cat((boxes, scores.unsqueeze(1)), 1) expands scores to (pre_nms_limit, 1)
    # and concatenates with boxes (pre_nms_limit, 4) along dim=1, giving (pre_nms_limit, 5)
    keep = nms(torch.cat((boxes, scores.unsqueeze(1)), 1).data, nms_threshold)
    # keep is a list of the box indices that survive NMS
    if len(keep) > proposal_count:
        keep = keep[:proposal_count]
    boxes = boxes[keep, :]

    # Normalize dimensions to range of 0 to 1.
    norm = Variable(torch.from_numpy(np.array([height, width, height, width])).float(), requires_grad=False)
    if config.GPU_COUNT:
        norm = norm.cuda()
    normalized_boxes = boxes / norm  # normalize the boxes

    # Add back batch dimension: (proposal_count, 4) -> (1, proposal_count, 4)
    normalized_boxes = normalized_boxes.unsqueeze(0)

    return normalized_boxes
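# A tiny numeric check of apply_box_deltas (added for illustration): a 100x100
# box given (dy, dx, log(dh), log(dw)) = (0.5, 0, 0, 0) keeps its size and
# moves down by half its height.
def _demo_apply_box_deltas():
    box = torch.FloatTensor([[0., 0., 100., 100.]])
    delta = torch.FloatTensor([[0.5, 0.0, 0.0, 0.0]])
    print(apply_box_deltas(box, delta))  # [[ 50.,   0., 150., 100.]]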
############################################################
#  ROIAlign Layer
############################################################

def pyramid_roi_align(inputs, pool_size, image_shape):
    """Implements ROI Pooling on multiple levels of the feature pyramid.

    Params:
    - pool_size: [height, width] of the output pooled regions. Usually [7, 7]
    - image_shape: [channels, height, width]. Shape of input image in pixels

    Inputs:
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized coordinates.
    - Feature maps: list of feature maps from different levels of the pyramid.
      Each is [batch, channels, height, width]

    Output:
    Pooled regions in the shape: [num_boxes, channels, height, width].
    The width and height are those specified in pool_size.
    """
    # Currently only supports batchsize 1
    for i in range(len(inputs)):
        inputs[i] = inputs[i].squeeze(0)  # drop the batch dimension

    # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
    boxes = inputs[0]  # (num_boxes, 4)

    # Feature maps: list of feature maps from different levels of the
    # feature pyramid. Each is [batch, channels, height, width]
    feature_maps = inputs[1:]  # [p2, p3, p4, p5]

    # Assign each ROI to a level in the pyramid based on the ROI area.
    y1, x1, y2, x2 = boxes.chunk(4, dim=1)  # split boxes into four column blocks
    h = y2 - y1
    w = x2 - x1

    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here:
    # a 224x224 ROI (in pixels) maps to P4.
    image_area = Variable(torch.FloatTensor([float(image_shape[1]*image_shape[2])]), requires_grad=False)
    if boxes.is_cuda:
        image_area = image_area.cuda()
    roi_level = 4 + log2(torch.sqrt(h*w)/(224.0/torch.sqrt(image_area)))
    # which pyramid level each ROI pools from
    roi_level = roi_level.round().int()  # round to the nearest integer
    roi_level = roi_level.clamp(2, 5)    # the FPN produces [P2..P6] but only [P2..P5] are pooled

    # Loop through levels and apply ROI pooling to each. P2 to P5.
    pooled = []
    box_to_level = []
    for i, level in enumerate(range(2, 6)):  # i in [0..3], level in [2..5]
        ix = roi_level == level  # bool mask
        if not ix.any():  # any() is True if at least one element is True
            continue
        assert len(ix.shape) > 1, "ix must have more than one dimension"
        ix = torch.nonzero(ix)[:, 0]
        # torch.nonzero() returns the indices of the non-zero elements, shape (m, d),
        # where m is the number of non-zero elements and d is ix's number of dims:
        #   ix=tensor([0,1,0,1])             -> tensor([[1],[3]])                  shape (2,1)
        #   ix=tensor([[0,1,0,1],[0,1,0,1]]) -> tensor([[0,1],[0,3],[1,0],[1,3]])  shape (4,2)
        # since ix here is [n,1], torch.nonzero(ix) is (m,2) and [:,0] picks the row indices
        level_boxes = boxes[ix.data, :]  # the ROIs assigned to this level

        # Keep track of which box is mapped to which level
        box_to_level.append(ix.data)

        # Stop gradient propagation to ROI proposals
        level_boxes = level_boxes.detach()  # detached from the graph: no gradients flow back here

        # Crop and Resize
        # From Mask R-CNN paper: "We sample four regular locations, so
        # that we can evaluate either max or average pooling. In fact,
        # interpolating only a single value at each bin center (without
        # pooling) is nearly as effective."
        #
        # Here we use the simplified approach of a single value per bin,
        # which is how it's done in tf.crop_and_resize()
        # Result: [batch * num_boxes, pool_height, pool_width, channels]
        per_level_boxes_num = level_boxes.size()[0]  # how many boxes this level has
        ind = Variable(torch.zeros(per_level_boxes_num), requires_grad=False).int()
        # a row vector with one entry per ROI at this level
        if level_boxes.is_cuda:
            ind = ind.cuda()
        feature_maps[i] = feature_maps[i].unsqueeze(0)
        # add back a batch dimension [1, c, h, w]; CropAndResizeFunction needs one
        pooled_features = CropAndResizeFunction(pool_size, pool_size, 0)(feature_maps[i], level_boxes, ind)
        # output shape: (per_level_boxes_num, c, pool_size, pool_size)
        pooled.append(pooled_features)

    # Pack pooled features into one tensor: (num_rois, c, pool_size, pool_size)
    pooled = torch.cat(pooled, dim=0)

    # Pack box_to_level mapping into one array and add another
    # column representing the order of pooled boxes
    box_to_level = torch.cat(box_to_level, dim=0)

    # Rearrange pooled features to match the order of the original boxes.
    # Pooling processed the boxes level by level, which shuffled the original
    # score-sorted order of the ROIs; sort them back into the original order.
    _, box_to_level = torch.sort(box_to_level)
    pooled = pooled[box_to_level, :, :, :]  # first dim is the box count: (boxes_num, c, 7, 7)

    return pooled
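# A sanity check of the level-assignment rule above (FPN paper, Eq. 1), added
# for illustration and assuming a 1024x1024 image with normalized box sides:
# a 224-pixel ROI lands on P4, with roughly one level per octave around it.
def _demo_roi_level(image_dim=1024.0):
    for side_px in [56, 112, 224, 448, 896]:
        h = w = side_px / image_dim  # normalized height/width
        k = 4 + math.log(math.sqrt(h * w) / (224.0 / image_dim), 2)
        print(side_px, '->', min(5, max(2, int(round(k)))))  # 56->2, 112->3, 224->4, 448->5, 896->5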
############################################################
#  Detection Target Layer
############################################################

def bbox_overlaps(boxes1, boxes2):
    """Computes IoU overlaps between two sets of boxes.
    boxes1, boxes2: [N, (y1, x1, y2, x2)].
    """
    # 1. Tile boxes2 and repeat boxes1. This allows us to compare
    # every boxes1 against every boxes2 without loops.
    # TF doesn't have an equivalent to np.repeat() so simulate it
    # using tf.tile() and tf.reshape.
    boxes1_repeat = boxes2.size()[0]  # number of boxes in boxes2
    boxes2_repeat = boxes1.size()[0]  # number of boxes in boxes1
    boxes1 = boxes1.repeat(1, boxes1_repeat).view(-1, 4)
    # repeat row by row: the first row boxes1_repeat times, then the second row, and so on
    boxes2 = boxes2.repeat(boxes2_repeat, 1)  # repeat the whole tensor boxes2_repeat times

    # 2. Compute intersections
    b1_y1, b1_x1, b1_y2, b1_x2 = boxes1.chunk(4, dim=1)  # chunking keeps the number of dims
    b2_y1, b2_x1, b2_y2, b2_x2 = boxes2.chunk(4, dim=1)
    y1 = torch.max(b1_y1, b2_y1)[:, 0]  # element-wise max; [:, 0] flattens to a 1-D vector
    x1 = torch.max(b1_x1, b2_x1)[:, 0]
    y2 = torch.min(b1_y2, b2_y2)[:, 0]
    x2 = torch.min(b1_x2, b2_x2)[:, 0]
    zeros = Variable(torch.zeros(y1.size()[0]), requires_grad=False)
    if y1.is_cuda:
        zeros = zeros.cuda()
    intersection = torch.max(x2 - x1, zeros) * torch.max(y2 - y1, zeros)

    # 3. Compute unions
    b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
    b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
    union = b1_area[:, 0] + b2_area[:, 0] - intersection

    # 4. Compute IoU and reshape to [boxes1, boxes2]
    iou = intersection / union
    overlaps = iou.view(boxes2_repeat, boxes1_repeat)
    return overlaps


def ious(boxes1, boxes2):
    '''
    inputs:
        boxes1: [n, 4]
        boxes2: [m, 4]
    outputs:
        [n, m]
    '''
    b1_y1, b1_x1, b1_y2, b1_x2 = boxes1[:, 0], boxes1[:, 1], boxes1[:, 2], boxes1[:, 3]
    b2_y1, b2_x1, b2_y2, b2_x2 = boxes2[:, 0], boxes2[:, 1], boxes2[:, 2], boxes2[:, 3]
    y1 = torch.max(b1_y1.unsqueeze(1), b2_y1)
    x1 = torch.max(b1_x1.unsqueeze(1), b2_x1)
    y2 = torch.min(b1_y2.unsqueeze(1), b2_y2)
    x2 = torch.min(b1_x2.unsqueeze(1), b2_x2)
    intersection = torch.max(x2 - x1, torch.zeros(x1.size())) * torch.max(y2 - y1, torch.zeros(y1.size()))
    b1_area = ((b1_y2 - b1_y1) * (b1_x2 - b1_x1))
    b2_area = ((b2_y2 - b2_y1) * (b2_x2 - b2_x1))
    areas = b1_area.unsqueeze(1) + b2_area
    ious = intersection / (areas - intersection)
    return ious


def detection_target_layer(proposals, gt_class_ids, gt_boxes, gt_masks, config):
    """Subsamples proposals and generates target box refinement, class_ids,
    and masks for each.

    Inputs:
    proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might
               be zero padded if there are not enough proposals.  (rpn_rois)
    gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs.
    gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
    gt_masks: [batch, MAX_GT_INSTANCES, height, width] of boolean type

    Returns: Target ROIs and corresponding class IDs, bounding box shifts, and masks.
    rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
    target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
    target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (dy, dx, log(dh), log(dw), class_id)]
                   Class-specific bbox refinements.
    target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width]
                 Masks cropped to bbox boundaries and resized to neural network output size.
    """
    # Currently only supports batchsize 1
    proposals = proposals.squeeze(0)  # drop the batch dimension
    gt_class_ids = gt_class_ids.squeeze(0)
    gt_boxes = gt_boxes.squeeze(0)
    gt_masks = gt_masks.squeeze(0)

    # Handle COCO crowds
    # A crowd box in COCO is a bounding box around several instances. Exclude
    # them from training. A crowd box is given a negative class ID.
    # gt_class_ids < 0 marks crowd/background regions;
    # gt_class_ids > 0 marks foreground instances.
    if torch.nonzero(gt_class_ids < 0).size():  # some class IDs are negative
        crowd_ix = torch.nonzero(gt_class_ids < 0)[:, 0]      # indices with gt_class_ids < 0
        non_crowd_ix = torch.nonzero(gt_class_ids > 0)[:, 0]  # indices with gt_class_ids > 0
        crowd_boxes = gt_boxes[crowd_ix.data, :]              # crowd (background) boxes
        crowd_masks = gt_masks[:, :, crowd_ix.data]           # crowd masks
        gt_class_ids = gt_class_ids[non_crowd_ix.data]
        gt_boxes = gt_boxes[non_crowd_ix.data, :]             # ground-truth boxes of real instances
        gt_masks = gt_masks[:, :, non_crowd_ix.data]

        # Compute overlaps with crowd boxes [anchors, crowds]
        crowd_overlaps = bbox_overlaps(proposals, crowd_boxes)  # IoUs of proposals vs. crowd boxes
        crowd_iou_max = torch.max(crowd_overlaps, dim=1)[0]     # per-row max
        no_crowd_bool = crowd_iou_max < 0.001
        # True for rows whose max IoU against every crowd box is below 0.001
    else:
        # all gt_class_ids are positive, so every proposal is a non-crowd candidate
        no_crowd_bool = Variable(torch.ByteTensor(proposals.size()[0]*[True]), requires_grad=False)
        if config.GPU_COUNT:
            no_crowd_bool = no_crowd_bool.cuda()

    # Compute overlaps matrix [proposals, gt_boxes]
    overlaps = bbox_overlaps(proposals, gt_boxes)  # IoUs of proposals vs. all ground-truth boxes

    # Determine positive and negative ROIs
    roi_iou_max = torch.max(overlaps, dim=1)[0]

    # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
    positive_roi_bool = roi_iou_max >= 0.5  # proposals whose best IoU with any gt_box is >= 0.5

    # Subsample ROIs. Aim for 33% positive
    # Positive ROIs
    if torch.nonzero(positive_roi_bool).size():
        positive_indices = torch.nonzero(positive_roi_bool)[:, 0]  # indices of positive samples

        positive_count = int(config.TRAIN_ROIS_PER_IMAGE *
                             config.ROI_POSITIVE_RATIO)  # desired positive count, int(200*0.33)
        rand_idx = torch.randperm(positive_indices.size()[0])
        # a random permutation of 0..n-1; rand_idx is a shuffled index sequence
        # into the 1-D vector positive_indices
        rand_idx = rand_idx[:positive_count]  # take a subset of the indices
        if config.GPU_COUNT:
            rand_idx = rand_idx.cuda()
        positive_indices = positive_indices[rand_idx]
        # the chosen values are themselves indices into proposals
        positive_count = positive_indices.size()[0]  # final number of positives
        positive_rois = proposals[positive_indices.data, :]  # the chosen positive ROIs

        # Assign positive ROIs to GT boxes.
        positive_overlaps = overlaps[positive_indices.data, :]  # IoUs of positives vs. gt_boxes
        roi_gt_box_assignment = torch.max(positive_overlaps, dim=1)[1]
        # column index of each row's max, i.e. which ground-truth box best matches each positive ROI
        roi_gt_boxes = gt_boxes[roi_gt_box_assignment.data, :]
        # the gt box with the highest IoU supplies the label (box and object class)
        # for each positive ROI
        roi_gt_class_ids = gt_class_ids[roi_gt_box_assignment.data]

        # Compute bbox refinement targets for positive ROIs:
        # the offset between each positive ROI and its assigned ground-truth box
        deltas = Variable(utils.box_refinement(positive_rois.data, roi_gt_boxes.data), requires_grad=False)
        # BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
        std_dev = Variable(torch.from_numpy(config.BBOX_STD_DEV).float(), requires_grad=False)
        if config.GPU_COUNT:
            std_dev = std_dev.cuda()
        deltas /= std_dev
        # deltas are divided by tensor([0.1, 0.1, 0.2, 0.2]) before being fed to the
        # network, which is why the deltas in proposal_layer are multiplied by the
        # same factors to recover the network's true output

        # Assign positive ROIs to GT masks
        roi_masks = gt_masks[roi_gt_box_assignment.data, :, :]  # masks of the positive samples

        # Compute mask targets
        boxes = positive_rois
        if config.USE_MINI_MASK:
            # Transform ROI coordinates from normalized image space
            # to normalized mini-mask space.
            y1, x1, y2, x2 = positive_rois.chunk(4, dim=1)  # corners of the positive ROIs
            gt_y1, gt_x1, gt_y2, gt_x2 = roi_gt_boxes.chunk(4, dim=1)
            # corners of the gt boxes assigned to these positives
            gt_h = gt_y2 - gt_y1
            gt_w = gt_x2 - gt_x1
            dy1 = (y1 - gt_y1) / gt_h
            dx1 = (x1 - gt_x1) / gt_w
            dy2 = (y2 - gt_y1) / gt_h
            dx2 = (x2 - gt_x1) / gt_w
            boxes = torch.cat([dy1, dx1, dy2, dx2], dim=1)  # ROI coordinates relative to the gt box
        box_ids = Variable(torch.arange(roi_masks.size()[0]), requires_grad=False).int()
        if config.GPU_COUNT:
            box_ids = box_ids.cuda()
        masks = Variable(CropAndResizeFunction(config.MASK_SHAPE[0], config.MASK_SHAPE[1], 0)(roi_masks.unsqueeze(1), boxes, box_ids).data, requires_grad=False)
        # align the masks to the ROIs, effectively resizing them from (56,56) to (28,28)
        masks = masks.squeeze(1)

        # Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
        # binary cross entropy loss.
        masks = torch.round(masks)
    else:
        positive_count = 0

    # 2. Negative ROIs are those with < 0.5 IoU with every GT box. Skip crowds.
    negative_roi_bool = roi_iou_max < 0.5  # proposals whose best IoU with any gt_box is < 0.5
    negative_roi_bool = negative_roi_bool & no_crowd_bool
    # a negative must satisfy both: IoU < 0.5 with every gt box, and IoU < 0.001 with crowd boxes
    # Negative ROIs. Add enough to maintain positive:negative ratio.
    if torch.nonzero(negative_roi_bool).size() and positive_count > 0:
        negative_indices = torch.nonzero(negative_roi_bool)[:, 0]  # indices of negative samples
        r = 1.0 / config.ROI_POSITIVE_RATIO
        negative_count = int(r * positive_count - positive_count)
        rand_idx = torch.randperm(negative_indices.size()[0])
        rand_idx = rand_idx[:negative_count]
        if config.GPU_COUNT:
            rand_idx = rand_idx.cuda()
        negative_indices = negative_indices[rand_idx]
        negative_count = negative_indices.size()[0]
        negative_rois = proposals[negative_indices.data, :]
    else:
        negative_count = 0

    # Append negative ROIs and pad bbox deltas and masks that
    # are not used for negative ROIs with zeros.
    if positive_count > 0 and negative_count > 0:
        rois = torch.cat((positive_rois, negative_rois), dim=0)  # stack positives and negatives
        zeros = Variable(torch.zeros(negative_count), requires_grad=False).int()
        # the chosen negatives get class label 0
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        roi_gt_class_ids = torch.cat([roi_gt_class_ids, zeros], dim=0)  # stack the class labels
        zeros = Variable(torch.zeros(negative_count, 4), requires_grad=False)
        # negatives get zero deltas (dy1, dx1, dy2, dx2)
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        deltas = torch.cat([deltas, zeros], dim=0)  # stack the deltas
        zeros = Variable(torch.zeros(negative_count, config.MASK_SHAPE[0], config.MASK_SHAPE[1]), requires_grad=False)
        # negatives get all-zero masks
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        masks = torch.cat([masks, zeros], dim=0)  # stack the masks
    elif positive_count > 0:  # only positives, no negatives
        rois = positive_rois
    elif negative_count > 0:  # only negatives, no positives
        rois = negative_rois
        zeros = Variable(torch.zeros(negative_count), requires_grad=False)
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        roi_gt_class_ids = zeros
        zeros = Variable(torch.zeros(negative_count, 4), requires_grad=False).int()
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        deltas = zeros
        zeros = Variable(torch.zeros(negative_count, config.MASK_SHAPE[0], config.MASK_SHAPE[1]), requires_grad=False)
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        masks = zeros
    else:  # neither positives nor negatives
        rois = Variable(torch.FloatTensor(), requires_grad=False)
        roi_gt_class_ids = Variable(torch.IntTensor(), requires_grad=False)
        deltas = Variable(torch.FloatTensor(), requires_grad=False)
        masks = Variable(torch.FloatTensor(), requires_grad=False)
        if config.GPU_COUNT:
            rois = rois.cuda()
            roi_gt_class_ids = roi_gt_class_ids.cuda()
            deltas = deltas.cuda()
            masks = masks.cuda()

    return rois, roi_gt_class_ids, deltas, masks
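# A hand-checkable example of the IoU helpers above (added for illustration):
# a box against itself has IoU 1.0, and this half-overlapping pair gives 1/3.
def _demo_overlaps():
    a = torch.FloatTensor([[0., 0., 2., 2.]])
    b = torch.FloatTensor([[0., 0., 2., 2.], [0., 1., 2., 3.]])
    print(bbox_overlaps(a, b))  # [[1.0000, 0.3333]]
    print(ious(a, b))           # same values, computed by broadcasting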
############################################################
#  Detection Layer
############################################################

def clip_to_window(window, boxes):
    """window: (y1, x1, y2, x2). The window in the image we want to clip to.
    boxes: [N, (y1, x1, y2, x2)]
    """
    boxes[:, 0] = boxes[:, 0].clamp(float(window[0]), float(window[2]))
    boxes[:, 1] = boxes[:, 1].clamp(float(window[1]), float(window[3]))
    boxes[:, 2] = boxes[:, 2].clamp(float(window[0]), float(window[2]))
    boxes[:, 3] = boxes[:, 3].clamp(float(window[1]), float(window[3]))
    return boxes


def refine_detections(rois, probs, deltas, window, config):
    """Refine classified proposals, filter overlaps, and return final detections.

    Inputs:
        rois: [N, (y1, x1, y2, x2)] in normalized coordinates
        probs: [N, num_classes]. Class probabilities.
        deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific bounding box deltas.
        window: (y1, x1, y2, x2) in image coordinates. The part of the image
                that contains the image excluding the padding.

    Returns detections shaped: [N, (y1, x1, y2, x2, class_id, score)]
    """
    # Class IDs per ROI
    class_scores, class_ids = torch.max(probs, dim=1)
    # per-row max and its column index; each row holds one ROI's scores over all
    # classes, so the max gives the ROI's most likely class and its score

    # Class probability of the top class of each ROI
    # Class-specific bounding box deltas
    idx = torch.arange(class_ids.size()[0]).long()
    if config.GPU_COUNT:
        idx = idx.cuda()
    #class_scores = probs[idx, class_ids.data]
    deltas_specific = deltas[idx, class_ids.data, :]
    # idx guarantees exactly one box delta per sample: the delta belonging
    # to the highest-scoring class

    # Apply bounding box deltas
    # Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
    std_dev = Variable(torch.from_numpy(np.reshape(config.RPN_BBOX_STD_DEV, [1, 4])).float(), requires_grad=False)
    if config.GPU_COUNT:
        std_dev = std_dev.cuda()
    refined_rois = apply_box_deltas(rois, deltas_specific * std_dev)
    # correct the rois with the deltas of each sample's most likely class

    # Convert coordinates to image domain
    height, width = config.IMAGE_SHAPE[:2]  # image shape
    scale = Variable(torch.from_numpy(np.array([height, width, height, width])).float(), requires_grad=False)
    if config.GPU_COUNT:
        scale = scale.cuda()
    refined_rois *= scale  # scale the refined rois back up to the original image

    # Clip boxes to image window
    refined_rois = clip_to_window(window, refined_rois)  # constrain the boxes to the image

    # Round and cast to int since we're dealing with pixels now
    refined_rois = torch.round(refined_rois)

    # TODO: Filter out boxes with zero area

    # Filter out background boxes (class_ids == 0 is background)
    keep_bool = class_ids > 0

    # Filter out low confidence boxes
    if config.DETECTION_MIN_CONFIDENCE:
        keep_bool = keep_bool & (class_scores >= config.DETECTION_MIN_CONFIDENCE)
    keep = torch.nonzero(keep_bool)[:, 0]
    # indices of foreground samples whose class score exceeds the minimum confidence

    # Apply per-class NMS
    pre_nms_class_ids = class_ids[keep.data]  # (m,)
    pre_nms_scores = class_scores[keep.data]  # (m,)
    pre_nms_rois = refined_rois[keep.data]    # (m, 4)

    # Run NMS separately for each class present in the image.
    # pre_nms_class_ids contains many duplicates; unique1d acts like set(),
    # keeping one representative per class.
    for i, class_id in enumerate(unique1d(pre_nms_class_ids)):
        # Pick detections of this class
        ixs = torch.nonzero(pre_nms_class_ids == class_id)[:, 0]

        # Sort
        ix_rois = pre_nms_rois[ixs.data]
        ix_scores = pre_nms_scores[ixs]
        ix_scores, order = ix_scores.sort(descending=True)
        ix_rois = ix_rois[order.data, :]  # this class's rois, sorted by score

        class_keep = nms(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1).data, config.DETECTION_NMS_THRESHOLD)
        # class_keep holds indices into ix_rois

        # Map indices:
        #   keep       - foreground samples above the minimum confidence
        #   class_keep - this class's indices surviving NMS
        #   order      - this class's indices sorted by score, descending
        #   ixs        - this class's indices (like order, but unsorted)
        class_keep = keep[ixs[order[class_keep].data].data]

        if i == 0:
            nms_keep = class_keep
        else:
            nms_keep = unique1d(torch.cat((nms_keep, class_keep)))
            # somewhat redundant: each class_keep holds only one class's post-NMS
            # indices, and indices never repeat across classes
    keep = intersect1d(keep, nms_keep)
    # intersection of two tensors without duplicates; also largely redundant,
    # since every class_keep was drawn from keep. Note: if the loop above never
    # runs, nms_keep is never assigned and this line raises an error.

    # Keep top detections
    roi_count = config.DETECTION_MAX_INSTANCES  # at most 100 final detections
    rest_sample = len(class_scores[keep.data])
    if roi_count < rest_sample:
        # ... (the remainder of refine_detections, along with detection_layer,
        # the RPN / Classifier / Mask head classes, and the RPN and classifier
        # losses, is missing from the source post; the text resumes below
        # inside compute_mrcnn_bbox_loss)
        pass


def compute_mrcnn_bbox_loss(target_bbox, target_class_ids, pred_bbox):
    """Loss for Mask R-CNN bounding box refinement. (Signature reconstructed
    from compute_losses below; the body resumes the original text.)"""
    if target_class_ids.size():
        positive_roi_ix = torch.nonzero(target_class_ids > 0)[:, 0]
        # Gather the deltas (predicted and true) that contribute to loss
        target_bbox = target_bbox[positive_roi_ix, :]
        pred_bbox = pred_bbox[positive_roi_ix, :, :]
        # Smooth L1 loss
        loss = F.smooth_l1_loss(pred_bbox, target_bbox)
    else:
        loss = Variable(torch.FloatTensor([0]), requires_grad=False)
        if target_class_ids.is_cuda:
            loss = loss.cuda()
    return loss


def compute_mrcnn_mask_loss(target_masks, target_class_ids, pred_masks):
    """Mask binary cross-entropy loss for the masks head.

    target_masks: [batch, num_rois, height, width].
        A float32 tensor of values 0 or 1. Uses zero padding to fill array.
    target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded.
    pred_masks: [batch, proposals, height, width, num_classes] float32 tensor
                with values from 0 to 1.
    """
    if target_class_ids.size():
        # Only positive ROIs contribute to the loss. And only
        # the class-specific mask of each ROI.
        positive_ix = torch.nonzero(target_class_ids > 0)[:, 0]
        positive_class_ids = target_class_ids[positive_ix.data].long()
        indices = torch.stack((positive_ix, positive_class_ids), dim=1)

        # Gather the masks (predicted and true) that contribute to loss
        y_true = target_masks[indices[:, 0].data, :, :]
        y_pred = pred_masks[indices[:, 0].data, indices[:, 1].data, :, :]

        # Binary cross entropy
        loss = F.binary_cross_entropy(y_pred, y_true)
    else:
        loss = Variable(torch.FloatTensor([0]), requires_grad=False)
        if target_class_ids.is_cuda:
            loss = loss.cuda()
    return loss


# An alternative, flattened implementation of the mask loss, kept commented out
# as the author left it:
#def compute_mrcnn_mask_loss(target_masks, target_class_ids, pred_masks):
#    """Mask binary cross-entropy loss for the masks head.
#
#    target_masks: [batch, num_rois, height, width].
#        A float32 tensor of values 0 or 1. Uses zero padding to fill array.
#    target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded.
#    pred_masks: [batch, num_rois, height, width, num_classes] float32 tensor
#                with values from 0 to 1.
#    """
#    if target_class_ids.size():
#        target_masks = target_masks.view(-1, target_masks.size(2), target_masks.size()[3])  # (m, height, width)
#        target_class_ids = target_class_ids.view(-1)  # (m,)
#        pred_masks = pred_masks.view(-1, pred_masks.size()[2], pred_masks.size()[3], pred_masks.size()[4])
#        # (N, height, width, num_classes)
#        #pred_masks = pred_masks.permute(0, 3, 1, 2)  # (N, num_classes, height, width)
#        positive_ix = torch.nonzero(target_class_ids > 0)[:, 0]
#        y_true = target_masks[positive_ix, :, :].view(-1)
#        y_pred = pred_masks[positive_ix, :, :, :].view(-1, pred_masks.size()[-1])
#        print(y_true.shape)
#        print(y_pred.shape)
#        loss = F.binary_cross_entropy(y_pred, y_true)
#    else:
#        loss = Variable(torch.FloatTensor([0]), requires_grad=False)
#        if target_class_ids.is_cuda:
#            loss = loss.cuda()
#    return loss


def compute_losses(rpn_match, rpn_class_logits, rpn_target_bbox, rpn_pred_bbox,
                   target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox,
                   target_mask, mrcnn_mask):
    # rpn_match: each anchor's label (-1 negative, 0 neutral/ignored, 1 positive)
    # rpn_target_bbox: target offsets for the RPN
    # rpn_class_logits: RPN scores (b, num_anchor, 2)
    # rpn_pred_bbox: RPN box outputs (b, num_anchor, 4)
    # target_class_ids: (m, num_rois)
    # mrcnn_class_logits: (batch, num_rois, num_classes)
    # target_deltas: (batch, num_rois, 4)
    # mrcnn_bbox: (batch, num_rois, num_classes, 4)
    rpn_class_loss = compute_rpn_class_loss(rpn_match, rpn_class_logits)
    rpn_bbox_loss = compute_rpn_bbox_loss(rpn_target_bbox, rpn_match, rpn_pred_bbox)
    mrcnn_class_loss = compute_mrcnn_class_loss(target_class_ids, mrcnn_class_logits)
    mrcnn_bbox_loss = compute_mrcnn_bbox_loss(target_deltas, target_class_ids, mrcnn_bbox)
    mrcnn_mask_loss = compute_mrcnn_mask_loss(target_mask, target_class_ids, mrcnn_mask)  # still unclear to the author

    return [rpn_class_loss, rpn_bbox_loss, mrcnn_class_loss, mrcnn_bbox_loss, mrcnn_mask_loss]
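# An illustration (hypothetical tensors, not real network outputs) of the two
# loss primitives used above: smooth L1 for box deltas and binary cross-entropy
# for the per-pixel masks.
def _demo_loss_primitives():
    pred_deltas = torch.FloatTensor([[0.1, 0.0, 0.2, 0.0]])
    true_deltas = torch.zeros(1, 4)
    print(F.smooth_l1_loss(pred_deltas, true_deltas))  # 0.5*x^2 per element since |x| < 1
    pred_mask = torch.sigmoid(torch.randn(2, 28, 28))  # probabilities in (0, 1)
    true_mask = torch.round(torch.rand(2, 28, 28))     # binary targets, like the rounded GT masks
    print(F.binary_cross_entropy(pred_mask, true_mask))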
############################################################
#  Data Generator
############################################################

def load_image_gt(dataset, config, image_id, augment=False, use_mini_mask=False):
    """Load and return ground truth data for an image (image, mask, bounding boxes).

    augment: If true, apply random image augmentation. Currently, only
        horizontal flipping is offered.
    use_mini_mask: If False, returns full-size masks that are the same height
        and width as the original image. These can be big, for example
        1024x1024x100 (for 100 instances). Mini masks are smaller, typically
        224x224, and are generated by extracting the bounding box of the
        object and resizing it to MINI_MASK_SHAPE.

    Returns:
    image: [3, height, width]
    shape: the original shape of the image before resizing and cropping.
    class_ids: [instance_count] Integer class IDs
    bbox: [instance_count, (y1, x1, y2, x2)]
    mask: [height, width, instance_count]. The height and width are those
        of the image unless use_mini_mask is True, in which case they are
        defined in MINI_MASK_SHAPE.
    """
    # Load image and mask
    image = dataset.load_image(image_id)
    # image_id is an index; this returns one BGR image as a numpy array.
    # During training all images are registered in the dataset object, which
    # acts as a container; here one image is taken out of it.
    mask, class_ids = dataset.load_mask(image_id)
    # load_mask is a stub that you are expected to override for your own data.
    # It returns the mask regions of the objects to detect (the regions are
    # rectangular and slightly larger than the objects themselves):
    # (num_instances, height, width), where num_instances is the number of
    # objects in this image. How the mask is produced matters less than
    # knowing its layout.
    shape = image.shape  # the original image shape
    image, window, scale, padding = utils.resize_image(
        image,
        min_dim=config.IMAGE_MIN_DIM,  # 800
        max_dim=config.IMAGE_MAX_DIM,  # 1024
        padding=config.IMAGE_PADDING)  # True
    # image:   the resized, padded image
    # window:  (y1, x1, y2, x2) of the actual image inside the padded one
    # scale:   the resize factor
    # padding: the padding added to each dimension
    mask = utils.resize_mask(mask, scale, padding)  # apply the same transform to the mask
    # mask shape: (height, width, num_instances); the first two dims are the
    # size of the rectangle enclosing each object

    # Random horizontal flips.
    if augment:
        if random.randint(0, 1):
            image = np.fliplr(image)
            mask = np.fliplr(mask)  # flip the mask the same way

    # Bounding boxes. Note that some boxes might be all zeros
    # if the corresponding mask got cropped out.
    # bbox: [num_instances, (y1, x1, y2, x2)]
    bbox = utils.extract_bboxes(mask)
    # Intuitively, a mask is the image region containing an object to detect.
    # Its outline is the object's bounding rectangle: inside it, object pixels
    # are 1 and background pixels are 0. Every object has its own mask, and
    # extract_bboxes recovers the rectangle from it.

    # Active classes
    # Different datasets have different classes, so track the
    # classes supported in the dataset of this image.
    active_class_ids = np.zeros([dataset.num_classes], dtype=np.int32)
    # num_classes counts the classes over ALL datasets used for training
    source_class_ids = dataset.source_class_ids[dataset.image_info[image_id]["source"]]
    # source_class_ids groups class IDs by their source dataset, e.g.
    # source_class_ids = {'coco': [0,1,3,6], 'dfa': [2,4,5]};
    # image_info[image_id]["source"] says which source this image belongs to,
    # and this line fetches that source's class-ID list
    active_class_ids[source_class_ids] = 1  # mark this image's source classes as active
    # Example: training uses three datasets:
    #   set 1: person, car, house
    #   set 2: apple, banana, person, dog
    #   set 3: plane, phone, rice
    # then dataset.num_classes = 3+4+3 = 10 and active_class_ids starts all zero.
    # If the current image comes from set 2, only its four classes are needed,
    # so positions 3..6 are set to 1:
    #   active_class_ids = np.array([0,0,0,1,1,1,1,0,0,0])

    # Resize masks to smaller size to reduce memory usage
    if use_mini_mask:
        mask = utils.minimize_mask(bbox, mask, config.MINI_MASK_SHAPE)  # MINI_MASK_SHAPE is (56, 56)

    # Image meta data
    image_meta = compose_image_meta(image_id, shape, window, active_class_ids)

    # class_ids: [instance_count,] all class IDs present in this image;
    # (active_class_ids == 1).sum() is the class count of this image's source dataset
    return image, image_meta, class_ids, bbox, mask


def build_rpn_targets(image_shape, anchors, gt_class_ids, gt_boxes, config):
    """Given the anchors and GT boxes, compute overlaps and identify positive
    anchors and deltas to refine them to match their corresponding GT boxes.

    anchors: [num_anchors, (y1, x1, y2, x2)]
    gt_class_ids: [num_gt_boxes] Integer class IDs.
    gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]

    Returns:
    rpn_match: [N] (int32) matches between anchors and GT boxes.
               1 = positive anchor, -1 = negative anchor, 0 = neutral
    rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
    """
    # RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
    rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
    # RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
    rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4))

    # Handle COCO crowds
    # A crowd box in COCO is a bounding box around several instances. Exclude
    # them from training. A crowd box is given a negative class ID.
    crowd_ix = np.where(gt_class_ids < 0)[0]  # crowd-box indices
    if crowd_ix.shape[0] > 0:  # crowds exist
        # Filter out crowds from ground truth class IDs and boxes
        non_crowd_ix = np.where(gt_class_ids > 0)[0]  # indices with class IDs > 0
        crowd_boxes = gt_boxes[crowd_ix]              # the crowd boxes
        gt_class_ids = gt_class_ids[non_crowd_ix]
        gt_boxes = gt_boxes[non_crowd_ix]
        # Compute overlaps with crowd boxes [anchors, crowds]
        crowd_overlaps = utils.compute_overlaps(anchors, crowd_boxes)  # IoUs of anchors vs. crowd boxes
        crowd_iou_max = np.amax(crowd_overlaps, axis=1)  # (was np.argmax in the post, a bug)
        no_crowd_bool = (crowd_iou_max < 0.001)
        # anchors whose IoU against every crowd box is below 0.001 may become negatives
    else:
        # All anchors don't intersect a crowd
        no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)

    # Compute overlaps [num_anchors, num_gt_boxes]
    overlaps = utils.compute_overlaps(anchors, gt_boxes)

    # Match anchors to GT Boxes
    # If an anchor overlaps a GT box with IoU >= 0.7 then it's positive.
    # If an anchor overlaps a GT box with IoU < 0.3 then it's negative.
    # Neutral anchors are those that don't match the conditions above,
    # and they don't influence the loss function.
    # However, don't keep any GT box unmatched (rare, but happens). Instead,
    # match it to the closest anchor (even if its max IoU is < 0.3).
    #
    # 1. Set negative anchors first. They get overwritten below if a GT box is
    # matched to them. Skip boxes in crowd areas.
    anchor_iou_argmax = np.argmax(overlaps, axis=1)  # column index of each row's max
    anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
    rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
    # negatives: anchors whose max IoU < 0.3, excluding those near crowd boxes
    # 2. Set an anchor for each GT box (regardless of IoU value).
    # TODO: If multiple anchors have the same IoU match all of them
    gt_iou_argmax = np.argmax(overlaps, axis=0)  # per-column argmax of the IoU matrix
    rpn_match[gt_iou_argmax] = 1                 # the best anchor for each GT box is positive
    # 3. Set anchors with high overlap as positive.
    rpn_match[anchor_iou_max >= 0.7] = 1         # anchors with IoU >= 0.7 are positive

    # Subsample to balance positive and negative anchors
    # Don't let positives be more than half the anchors
    ids = np.where(rpn_match == 1)[0]  # positive-anchor indices
    # RPN_TRAIN_ANCHORS_PER_IMAGE = 256
    extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2)
    # the surplus of qualifying positives over the number actually needed
    if extra > 0:
        # Reset the extra ones to neutral
        ids = np.random.choice(ids, extra, replace=False)  # randomly demote the surplus
        rpn_match[ids] = 0
    # Same for negative proposals
    ids = np.where(rpn_match == -1)[0]
    extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE -
                        np.sum(rpn_match == 1))
    if extra > 0:
        # Reset the extra ones to neutral
        ids = np.random.choice(ids, extra, replace=False)
        rpn_match[ids] = 0

    # For positive anchors, compute shift and scale needed to transform them
    # to match the corresponding GT boxes.
    ids = np.where(rpn_match == 1)[0]
    ix = 0  # index into rpn_bbox
    # TODO: use box_refinement() rather than duplicating the code here
    gt = gt_boxes[ids]
    a = anchors[ids]
    gt_h = gt[:, 2] - gt[:, 0]
    gt_w = gt[:, 3] - gt[:, 1]
    gt_center_y = gt[:, 0] + 0.5 * gt_h  # (was gt[0] in the post, a bug)
    gt_center_x = gt[:, 1] + 0.5 * gt_w  # (was gt[1] in the post, a bug)
    a_h = a[:, 2] - a[:, 0]
    a_w = a[:, 3] - a[:, 1]
    a_center_y = a[:, 0] + 0.5 * a_h
    a_center_x = a[:, 1] + 0.5 * a_w
    rpn_bbox[:len(ids), 0] = (gt_center_y - a_center_y) / a_h
    # only the first len(ids) rows are filled because only the positives get
    # deltas against their GT boxes; the remaining rows stay 0
    rpn_bbox[:len(ids), 1] = (gt_center_x - a_center_x) / a_w
    rpn_bbox[:len(ids), 2] = np.log(gt_h / a_h)
    rpn_bbox[:len(ids), 3] = np.log(gt_w / a_w)
    # Normalize
    rpn_bbox /= config.RPN_BBOX_STD_DEV
    # rpn_match holds (-1, 0, 1): which anchors are negative, neutral, positive.
    # rpn_bbox: the first rows hold the positives' offsets to their GT boxes,
    # the rest are all zeros.
    return rpn_match, rpn_bbox

    # for i, a in zip(ids, anchors[ids]):
    #     # Closest gt box (it might have IoU < 0.7)
    #     gt = gt_boxes[anchor_iou_argmax[i]]  # anchor_iou_argmax[i] is row i's argmax,
    #     # i.e. which GT box matches this anchor best
    #     # Convert coordinates to center plus width/height.
    #     # GT Box
    #     gt_h = gt[2] - gt[0]
    #     gt_w = gt[3] - gt[1]
    #     gt_center_y = gt[0] + 0.5 * gt_h
    #     gt_center_x = gt[1] + 0.5 * gt_w
    #     # Anchor
    #     a_h = a[2] - a[0]
    #     a_w = a[3] - a[1]
    #     a_center_y = a[0] + 0.5 * a_h
    #     a_center_x = a[1] + 0.5 * a_w
    #     # Compute the bbox refinement that the RPN should predict.
    #     rpn_bbox[ix] = [
    #         (gt_center_y - a_center_y) / a_h,
    #         (gt_center_x - a_center_x) / a_w,
    #         np.log(gt_h / a_h),
    #         np.log(gt_w / a_w),
    #     ]
    #     # Normalize
    #     rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV
    #     ix += 1


class Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset, config, augment=True):
        """A generator that returns images and corresponding target class ids,
        bounding box deltas, and masks.

        dataset: The Dataset object to pick data from
        config: The model config object
        shuffle: If True, shuffles the samples before every epoch
        augment: If True, applies image augmentation to images (currently only
                 horizontal flips are supported)

        Returns a Python generator. Upon calling next() on it, the
        generator returns two lists, inputs and outputs. The contents
        of the lists differ depending on the received arguments:
        inputs list:
        - images: [batch, H, W, C]
        - image_metas: [batch, size of image meta]
        - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral)
        - rpn_bbox: [batch, M, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
        - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs
        - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]
        - gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width
          are those of the image unless use_mini_mask is True, in which
          case they are defined in MINI_MASK_SHAPE.

        outputs list: Usually empty in regular training. But if detection_targets
            is True then the outputs list contains target class_ids, bbox deltas,
            and masks.
        """
        self.b = 0  # batch item index
        self.image_index = -1
        self.image_ids = np.copy(dataset.image_ids)
        self.error_count = 0
        self.dataset = dataset
        self.config = config
        self.augment = augment  # augmentation switch

        # Anchors
        # [anchor_count, (y1, x1, y2, x2)]
        self.anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                                      config.RPN_ANCHOR_RATIOS,
                                                      config.BACKBONE_SHAPES,
                                                      config.BACKBONE_STRIDES,
                                                      config.RPN_ANCHOR_STRIDE)
        # one set of anchors is generated per pyramid feature map,
        # all mapped back onto the original image

    def __getitem__(self, image_index):  # image_index: which image of this epoch
        # Get GT bounding boxes and masks for image.
        image_id = self.image_ids[image_index]
        image, image_metas, gt_class_ids, gt_boxes, gt_masks = \
            load_image_gt(self.dataset, self.config, image_id,
                          augment=self.augment, use_mini_mask=self.config.USE_MINI_MASK)

        # Skip images that have no instances. This can happen in cases
        # where we train on a subset of classes and the image doesn't
        # have any of the classes we care about.
        if not np.any(gt_class_ids > 0):
            return None

        # RPN Targets
        rpn_match, rpn_bbox = build_rpn_targets(image.shape, self.anchors,
                                                gt_class_ids, gt_boxes, self.config)

        # If more instances than fit in the array, sub-sample from them.
        if gt_boxes.shape[0] > self.config.MAX_GT_INSTANCES:
            # if the image has more than MAX_GT_INSTANCES (e.g. 100) boxes, keep
            # only that many; an image shouldn't carry too many objects
            ids = np.random.choice(np.arange(gt_boxes.shape[0]), self.config.MAX_GT_INSTANCES, replace=False)
            gt_class_ids = gt_class_ids[ids]
            gt_boxes = gt_boxes[ids]
            gt_masks = gt_masks[:, :, ids]

        # Add to batch
        rpn_match = rpn_match[:, np.newaxis]  # make it 2-D, (N, 1)
        images = mold_image(image.astype(np.float32), self.config)  # subtract the mean pixel

        # Convert
        images = torch.from_numpy(images.transpose(2, 0, 1)).float()
        image_metas = torch.from_numpy(image_metas)
        rpn_match = torch.from_numpy(rpn_match)
        rpn_bbox = torch.from_numpy(rpn_bbox).float()
        gt_class_ids = torch.from_numpy(gt_class_ids)
        gt_boxes = torch.from_numpy(gt_boxes).float()
        gt_masks = torch.from_numpy(gt_masks.astype(int).transpose(2, 0, 1)).float()
        # gt_masks comes in as (height, width, num_instances)

        return images, image_metas, rpn_match, rpn_bbox, gt_class_ids, gt_boxes, gt_masks

    def __len__(self):
        return self.image_ids.shape[0]
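# A hand-worked example (hypothetical numbers, added for illustration) of the
# RPN delta encoding that build_rpn_targets fills into rpn_bbox: the positive
# anchor must shift and scale onto its assigned GT box.
def _demo_rpn_deltas():
    anchor = np.array([0., 0., 100., 100.])  # (y1, x1, y2, x2)
    gt = np.array([10., 10., 110., 130.])
    a_h, a_w = anchor[2] - anchor[0], anchor[3] - anchor[1]
    gt_h, gt_w = gt[2] - gt[0], gt[3] - gt[1]
    dy = ((gt[0] + 0.5 * gt_h) - (anchor[0] + 0.5 * a_h)) / a_h
    dx = ((gt[1] + 0.5 * gt_w) - (anchor[1] + 0.5 * a_w)) / a_w
    print(dy, dx, np.log(gt_h / a_h), np.log(gt_w / a_w))  # 0.1 0.2 0.0 ~0.182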
############################################################
#  MaskRCNN Class
############################################################

class MaskRCNN(nn.Module):
    """Encapsulates the Mask RCNN model functionality."""

    def __init__(self, config, model_dir):
        """config: A Sub-class of the Config class
        model_dir: Directory to save training logs and trained weights
        """
        super(MaskRCNN, self).__init__()
        self.config = config
        self.model_dir = model_dir
        self.set_log_dir()  # set the log path and the per-epoch checkpoint path; also sets self.epoch = 0
        # Example: 'C://Desktop/coco20171029T2315\\mask_rcnn_coco_{:04d}.pth'
        #   'C://Desktop'               is model_dir
        #   'coco'                      is config.NAME, the dataset name
        #   '20171029T2315'             is the time this line ran
        #   'mask_rcnn_coco_{:04d}.pth' is a fixed pattern
        # If set_log_dir is given a model_path, the time part of the generated
        # path comes from model_path, and self.epoch is set from the number in
        # the '.../**.pth' file name. Derived attributes:
        #   self.log_dir = 'C://Desktop/coco20171029T2315/'
        #   self.checkpoint_path = 'C://Desktop/coco20171029T2315\\mask_rcnn_coco_{:04d}.pth'
        self.build(config=config)  # build the network
        self.initialize_weights()  # initialize the parameters
        self.loss_history = []
        self.val_loss_history = []

    def build(self, config):
        """Build Mask R-CNN architecture."""
        # Image size must be dividable by 2 multiple times
        h, w = config.IMAGE_SHAPE[:2]
        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):  # h, w must be multiples of 64
            raise Exception("Image size must be dividable by 2 at least 6 times "
                            "to avoid fractions when downscaling and upscaling."
                            "For example, use 256, 320, 384, 448, 512, ... etc. ")

        # Build the shared convolutional layers.
        # Bottom-up Layers
        # Returns a list of the last layers of each stage, 5 in total.
        resnet = ResNet("resnet101", stage5=True)
        C1, C2, C3, C4, C5 = resnet.stages()

        # Top-down Layers
        # TODO: add assert to verify feature map sizes match what's in config
        self.fpn = FPN(C1, C2, C3, C4, C5, out_channels=256)
        # outputs [p2_out, p3_out, p4_out, p5_out, p6_out] after feature fusion:
        # p2_out (1,256,256,256)  stride 4
        # p3_out (1,256,128,128)  stride 8
        # p4_out (1,256,64,64)    stride 16
        # p5_out (1,256,32,32)    stride 32
        # p6_out (1,256,16,16)

        # Generate Anchors
        self.anchors = Variable(torch.from_numpy(utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                                                                config.RPN_ANCHOR_RATIOS,
                                                                                config.BACKBONE_SHAPES,
                                                                                config.BACKBONE_STRIDES,
                                                                                config.RPN_ANCHOR_STRIDE)).float(), requires_grad=False)
        if self.config.GPU_COUNT:
            self.anchors = self.anchors.cuda()

        # RPN
        # len(config.RPN_ANCHOR_RATIOS) = 3, config.RPN_ANCHOR_STRIDE = 1
        # output: [rpn_class_logits, rpn_probs, rpn_bbox]
        #         (1,256*256*3,2) (1,256*256*3,2) (1,256*256*3,4)
        #         (1,128*128*3,2) (1,128*128*3,2) (1,128*128*3,4)
        #         (1,64*64*3,2)   (1,64*64*3,2)   (1,64*64*3,4)
        #         (1,32*32*3,2)   (1,32*32*3,2)   (1,32*32*3,4)
        self.rpn = RPN(len(config.RPN_ANCHOR_RATIOS), config.RPN_ANCHOR_STRIDE, 256)

        # FPN Classifier
        self.classifier = Classifier(256, config.POOL_SIZE, config.IMAGE_SHAPE, config.NUM_CLASSES)

        # FPN Mask
        # config.MASK_POOL_SIZE = 14, config.MASK_SHAPE = [28, 28]
        self.mask = Mask(256, config.MASK_POOL_SIZE, config.IMAGE_SHAPE, config.NUM_CLASSES)

        # Fix batch norm layers: freeze the BatchNorm parameters
        def set_bn_fix(m):
            classname = m.__class__.__name__
            if classname.find('BatchNorm') != -1:
                for p in m.parameters():
                    p.requires_grad = False

        self.apply(set_bn_fix)

    def initialize_weights(self):
        """Initialize model weights."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform(m.weight)  # Xavier-uniform init for conv weights
                if m.bias is not None:
                    m.bias.data.zero_()           # conv biases start at 0
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)            # BatchNorm weights start at 1
                m.bias.data.zero_()               # BatchNorm biases start at 0
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)    # linear weights ~ N(0, 0.01)
                m.bias.data.zero_()

    def set_trainable(self, layer_regex, model=None, indent=0, verbose=1):
        """Sets model layers as trainable if their names match
        the given regular expression.
        """
        for param in self.named_parameters():
            layer_name = param[0]  # each parameter's name
            trainable = bool(re.fullmatch(layer_regex, layer_name))
            # re.fullmatch(pattern, string) matches only when the pattern covers
            # the whole string, in which case bool(...) is True
            if not trainable:
                param[1].requires_grad = False  # frozen: no gradient updates

    def set_log_dir(self, model_path=None):
        """Sets the model log directory and epoch counter.

        model_path: If None, or a format different from what this code uses
            then set a new log directory and start epochs from 0. Otherwise,
            extract the log directory and the epoch counter from the file name.
        """
        # Set date and epoch counter as if starting a new model
        self.epoch = 0
        now = datetime.datetime.now()  # current date and time, to the second

        # If we have a model path with date and epochs use them
        if model_path:  # distinguish checkpoint paths from other paths
            # Continue from where we left off. Get epoch and date from the file name.
            # A sample model path might look like:
            # /path/to/logs/coco20171029T2315/mask_rcnn_coco_0001.h5
            regex = r".*/\w+(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})/mask\_rcnn\_\w+(\d{4})\.pth"
            m = re.match(regex, model_path)  # does model_path follow this pattern?
            if m:
                now = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                        int(m.group(4)), int(m.group(5)))
                # recomputing 'now' keeps the original directory name intact below
                self.epoch = int(m.group(6))  # which epoch the checkpoint came from

        # Directory for training logs
        self.log_dir = os.path.join(self.model_dir, "{}{:%Y%m%dT%H%M}".format(
            self.config.NAME.lower(), now))

        # Path to save after each epoch. Include placeholders that get filled by Keras.
        self.checkpoint_path = os.path.join(self.log_dir, "mask_rcnn_{}_*epoch*.pth".format(
            self.config.NAME.lower()))
        self.checkpoint_path = self.checkpoint_path.replace("*epoch*", "{:04d}")

    def find_last(self):
        """Finds the last checkpoint file of the last trained model in the
        model directory.

        Returns:
            log_dir: The directory where events and weights are saved
            checkpoint_path: the path to the last checkpoint file
        """
        # Get directory names. Each directory corresponds to a model
        dir_names = next(os.walk(self.model_dir))[1]  # subdirectory names under self.model_dir
        key = self.config.NAME.lower()
        dir_names = filter(lambda f: f.startswith(key), dir_names)  # keep only dirs starting with key
        dir_names = sorted(dir_names)
        if not dir_names:
            return None, None
        # Pick last directory
        dir_name = os.path.join(self.model_dir, dir_names[-1])  # the most recent run
        # Find the last checkpoint
        checkpoints = next(os.walk(dir_name))[2]  # files inside dir_name
        checkpoints = filter(lambda f: f.startswith("mask_rcnn"), checkpoints)
        checkpoints = sorted(checkpoints)
        if not checkpoints:
            return dir_name, None
        checkpoint = os.path.join(dir_name, checkpoints[-1])  # the latest checkpoint file
        return dir_name, checkpoint

    def load_weights(self, filepath):
        """Modified version of the corresponding Keras function with
        the addition of multi-GPU support and the ability to exclude
        some layers from loading.
        exclude: list of layer names to exclude
        """
        if os.path.exists(filepath):
            state_dict = torch.load(filepath)
            self.load_state_dict(state_dict, strict=False)
        else:
            print("Weight file not found ...")

        # Update the log directory
        self.set_log_dir(filepath)
        if not os.path.exists(self.log_dir):  # create log_dir if it doesn't exist
            os.makedirs(self.log_dir)

    def detect(self, images):
        """Runs the detection pipeline.

        images: List of images, potentially of different sizes.

        Returns a list of dicts, one dict per image. The dict contains:
        rois: [N, (y1, x1, y2, x2)] detection bounding boxes
        class_ids: [N] int class IDs
        scores: [N] float probability scores for the class IDs
        masks: [H, W, N] instance binary masks
        """
        '''
        meta = np.array([image_id] +            # size=1
                        list(image_shape) +     # size=3
                        list(window) +          # size=4 (y1, x1, y2, x2) in image coordinates
                        list(active_class_ids)  # size=num_classes
                        )
        '''
        # Mold inputs to format expected by the neural network.
        # Each image in the list (possibly of a different size) is resized to
        # (1024, 1024, 3), normalized, and stacked into one 4-D array:
        # molded_images (N, 1024, 1024, 3), where N = len(images).
        # Each image's meta is (image_id, image_shape, window, active_class_ids):
        #   image_id is 0 for every image here,
        #   image_shape is (1024, 1024),
        #   window is the corner coordinates of the original image inside the padded one,
        #   active_class_ids marks which classes this image activates.
        molded_images, image_metas, windows = self.mold_inputs(images)

        # Convert images to torch tensor
        molded_images = torch.from_numpy(molded_images.transpose(0, 3, 1, 2)).float()

        # To GPU
        if self.config.GPU_COUNT:
            molded_images = molded_images.cuda()

        # Wrap in variable (Tensor and Variable are merged now, so this could be dropped)
        molded_images = Variable(molded_images, volatile=True)

        # Run object detection
        # detections (1, num_detections, 6)
        # mrcnn_mask (1, num_detections, num_class, 28, 28)
        detections, mrcnn_mask = self.predict([molded_images, image_metas], mode='inference')

        # Convert to numpy
        detections = detections.data.cpu().numpy()
        mrcnn_mask = mrcnn_mask.permute(0, 1, 3, 4, 2).data.cpu().numpy()
        # (1, num_detections, 28, 28, num_class)

        # Process detections
        results = []
        # images is a list, but only a single image is supported here.
        # unmold_detections maps the detection boxes and mrcnn_mask back onto
        # the original image (undoing the resize and padding).
        for i, image in enumerate(images):
            final_rois, final_class_ids, final_scores, final_masks = \
                self.unmold_detections(detections[i], mrcnn_mask[i],
                                       image.shape, windows[i])
            results.append({
                "rois": final_rois,
                "class_ids": final_class_ids,
                "scores": final_scores,
                "masks": final_masks,
            })
        return results

    def predict(self, input, mode):
        molded_images = input[0]
        image_metas = input[1]

        if mode == 'inference':
            self.eval()
        elif mode == 'training':
            self.train()

            # Set batchnorm always in eval mode during training
            # (freeze BatchNorm while training)
            def set_bn_eval(m):
                classname = m.__class__.__name__
                if classname.find('BatchNorm') != -1:
                    m.eval()

            self.apply(set_bn_eval)

        # Feature extraction
        [p2_out, p3_out, p4_out, p5_out, p6_out] = self.fpn(molded_images)
        # p2_out (N,256,256,256)
        # p3_out (N,256,128,128)
        # p4_out (N,256,64,64)
        # p5_out (N,256,32,32)
        # p6_out (N,256,16,16)

        # Note that P6 is used in RPN, but not in the classifier heads.
        rpn_feature_maps = [p2_out, p3_out, p4_out, p5_out, p6_out]
        mrcnn_feature_maps = [p2_out, p3_out, p4_out, p5_out]

        # Loop through pyramid layers, collecting the RPN outputs
        layer_outputs = []  # list of lists
        for p in rpn_feature_maps:
            layer_outputs.append(self.rpn(p))

        # Concatenate layer outputs
        # Convert from list of lists of level outputs to list of lists
        # of outputs across levels.
        # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        outputs = list(zip(*layer_outputs))
        outputs = [torch.cat(list(o), dim=1) for o in outputs]
        # each o is 3-D; dim=1 concatenates over the anchor dimension
        rpn_class_logits, rpn_class, rpn_bbox = outputs

        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
        # and zero padded.
        # config.POST_NMS_ROIS_TRAINING = 2000, config.POST_NMS_ROIS_INFERENCE = 1000
        proposal_count = self.config.POST_NMS_ROIS_TRAINING if mode == "training" \
            else self.config.POST_NMS_ROIS_INFERENCE
        rpn_rois = proposal_layer([rpn_class, rpn_bbox],
                                  proposal_count=proposal_count,
                                  nms_threshold=self.config.RPN_NMS_THRESHOLD,
                                  anchors=self.anchors,
                                  config=self.config)

        if mode == 'inference':
            # Network Heads
            # Proposal classifier and BBox regressor heads
            # self.classifier ROI-aligns the rpn_rois to (N, c, 7, 7) and feeds
            # them to the mrcnn classification and regression heads:
            # mrcnn_class_logits (num_rpn_rois, num_class)
            # mrcnn_class        (num_rpn_rois, num_class)
            # mrcnn_bbox         (num_rpn_rois, num_class, 4)
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = self.classifier(mrcnn_feature_maps, rpn_rois)

            # Detections
            # detections: [num_detections, (y1, x1, y2, x2, class_id, score)] in image coordinates.
            # The rpn_rois are refined by the predictions, and the top 100
            # best-matching detections are kept: detections (100, 6)
            detections = detection_layer(self.config, rpn_rois, mrcnn_class, mrcnn_bbox, image_metas)

            # Convert boxes to normalized coordinates
            # TODO: let DetectionLayer return normalized coordinates to avoid
            #       unnecessary conversions
            h, w = self.config.IMAGE_SHAPE[:2]
            scale = Variable(torch.from_numpy(np.array([h, w, h, w])).float(), requires_grad=False)
            if self.config.GPU_COUNT:
                scale = scale.cuda()
            detection_boxes = detections[:, :4] / scale  # normalize the coordinates

            # Add back batch dimension
            detection_boxes = detection_boxes.unsqueeze(0)  # [1, num_detections, 4]

            # Create masks for detections
            # mrcnn_feature_maps = [p2_out, p3_out, p4_out, p5_out]
            # mrcnn_mask (rois_num, num_classes, 28, 28)
            mrcnn_mask = self.mask(mrcnn_feature_maps, detection_boxes)

            # Add back batch dimension
            detections = detections.unsqueeze(0)  # (1, num_detections, 6)
            mrcnn_mask = mrcnn_mask.unsqueeze(0)  # (1, rois_num, num_classes, 28, 28)

            return [detections, mrcnn_mask]

        elif mode == 'training':
            gt_class_ids = input[2]
            gt_boxes = input[3]
            gt_masks = input[4]

            # Normalize coordinates
            h, w = self.config.IMAGE_SHAPE[:2]
            scale = Variable(torch.from_numpy(np.array([h, w, h, w])).float(), requires_grad=False)
            if self.config.GPU_COUNT:
                scale = scale.cuda()
            gt_boxes = gt_boxes / scale  # normalize the gt_boxes

            # Generate detection targets
            # Subsamples proposals and generates target outputs for training.
            # Note that proposal class IDs, gt_boxes, and gt_masks are zero
            # padded. Equally, returned rois and targets are zero padded.
            # rois: 128 positive and 128 negative samples;
            # target_class_ids: a positive's class is that of its best-matching
            #   gt box, a negative's class is 0;
            # target_deltas and target_mask follow the same convention.
            rois, target_class_ids, target_deltas, target_mask = \
                detection_target_layer(rpn_rois, gt_class_ids, gt_boxes, gt_masks, self.config)

            if not rois.size():
                mrcnn_class_logits = Variable(torch.FloatTensor())
                mrcnn_class = Variable(torch.IntTensor())
                mrcnn_bbox = Variable(torch.FloatTensor())
                mrcnn_mask = Variable(torch.FloatTensor())
                if self.config.GPU_COUNT:
                    mrcnn_class_logits = mrcnn_class_logits.cuda()
                    mrcnn_class = mrcnn_class.cuda()
                    mrcnn_bbox = mrcnn_bbox.cuda()
                    mrcnn_mask = mrcnn_mask.cuda()
            else:
                # Network Heads
                # Proposal classifier and BBox regressor heads
                mrcnn_class_logits, mrcnn_class, mrcnn_bbox = self.classifier(mrcnn_feature_maps, rois)

                # Create masks for detections
                mrcnn_mask = self.mask(mrcnn_feature_maps, rois)

            return [rpn_class_logits, rpn_bbox, target_class_ids, mrcnn_class_logits,
                    target_deltas, mrcnn_bbox, target_mask, mrcnn_mask]

    def train_model(self, train_dataset, val_dataset, learning_rate, epochs, layers):
        """Train the model.

        train_dataset, val_dataset: Training and validation Dataset objects.
        learning_rate: The learning rate to train with
        epochs: Number of training epochs. Note that previous training epochs
                are considered to be done already, so this actually determines
                the epochs to train in total rather than in this particular call.
        layers: Allows selecting which layers to train. It can be:
            - A regular expression to match layer names to train
            - One of these predefined values:
              heads: The RPN, classifier and mask heads of the network
              all: All the layers
              3+: Train Resnet stage 3 and up
              4+: Train Resnet stage 4 and up
              5+: Train Resnet stage 5 and up
        """
        # Pre-defined layer regular expressions
        layer_regex = {
            # all layers but the backbone
            "heads": r"(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            # From a specific Resnet stage and up
            "3+": r"(fpn.C3.*)|(fpn.C4.*)|(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            "4+": r"(fpn.C4.*)|(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            "5+": r"(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            # All layers
            "all": ".*",
        }
        if layers in layer_regex.keys():
            layers = layer_regex[layers]

        # Data generators
        # Dataset yields: images, image_metas, rpn_match, rpn_bbox, gt_class_ids, gt_boxes, gt_masks
        train_set = Dataset(train_dataset, self.config, augment=True)
        train_generator = torch.utils.data.DataLoader(train_set, batch_size=1, shuffle=True, num_workers=4)
        val_set = Dataset(val_dataset, self.config, augment=True)
        val_generator = torch.utils.data.DataLoader(val_set, batch_size=1, shuffle=True, num_workers=4)

        # Train
        # self.epoch starts at 0
        log("\nStarting at epoch {}. 
LR={}\n".format(self.epoch+1, learning_rate))#Checkpoint Path: C://desktop/coco20171029T2315\mask_rcnn_coco_{:04d}.pthlog("Checkpoint Path: {}".format(self.checkpoint_path))self.set_trainable(layers)# Optimizer object# Add L2 Regularization# Skip gamma and beta weights of batch normalization layers.# trainables_wo_bn 非BatchNorm层的参数# trainables_only_bn BatchNorm层的参数trainables_wo_bn = [param for name, param in self.named_parameters() if param.requires_grad and not 'bn' in name]trainables_only_bn = [param for name, param in self.named_parameters() if param.requires_grad and 'bn' in name]optimizer = optim.SGD([{'params': trainables_wo_bn, 'weight_decay': self.config.WEIGHT_DECAY}, #在batchnorm层中加入权重衰减{'params': trainables_only_bn} #其他层不使用权重衰减], lr=learning_rate, momentum=self.config.LEARNING_MOMENTUM)#self.epoch初始轮数,它在初始化的时候会改变#没次执行set_log_dir(model_path)是时候都会随着model_path的变化而变化#当在调节参数的时候,你想从第几轮开始调训练,你就导入第几层的模型参数,#如当你将这个地址的模型参数'.../mask_rcnn_coco_0003.pth'导入作为模型初始化参数#那么当你训练的时候相当于是第4轮进行训练了,这个是不是很方便for epoch in range(self.epoch+1, epochs+1):log("Epoch {}/{}.".format(epoch,epochs))# Training# 里面进行了正向传播求loss和打印loss,并进行反向传播更新参数,且执行1000次loss, loss_rpn_class, loss_rpn_bbox, loss_mrcnn_class, loss_mrcnn_bbox, loss_mrcnn_mask = self.train_epoch(train_generator, optimizer, self.config.STEPS_PER_EPOCH)# Validation# 里面进行了正向传播求loss和打印loss,且执行50次val_loss, val_loss_rpn_class, val_loss_rpn_bbox, val_loss_mrcnn_class, val_loss_mrcnn_bbox, val_loss_mrcnn_mask = self.valid_epoch(val_generator, self.config.VALIDATION_STEPS)# Statisticsself.loss_history.append([loss, loss_rpn_class, loss_rpn_bbox, loss_mrcnn_class, loss_mrcnn_bbox, loss_mrcnn_mask])self.val_loss_history.append([val_loss, val_loss_rpn_class, val_loss_rpn_bbox, val_loss_mrcnn_class, val_loss_mrcnn_bbox, val_loss_mrcnn_mask])visualize.plot_loss(self.loss_history, self.val_loss_history, save=True, log_dir=self.log_dir)# Save modeltorch.save(self.state_dict(), self.checkpoint_path.format(epoch))self.epoch = epochsdef train_epoch(self, datagenerator, optimizer, steps):batch_count = 0loss_sum = 0loss_rpn_class_sum = 0loss_rpn_bbox_sum = 0loss_mrcnn_class_sum = 0loss_mrcnn_bbox_sum = 0loss_mrcnn_mask_sum = 0step = 0optimizer.zero_grad()   #将梯度初始化为零#(因为一个batch的loss关于weight的导数是所有sample的loss关于weight的导数的累加和)for inputs in datagenerator:batch_count += 1images = inputs[0]image_metas = inputs[1]rpn_match = inputs[2]rpn_bbox = inputs[3]gt_class_ids = inputs[4]gt_boxes = inputs[5]gt_masks = inputs[6]# image_metas as numpy arrayimage_metas = image_metas.numpy()# Wrap in variablesimages = Variable(images)rpn_match = Variable(rpn_match)rpn_bbox = Variable(rpn_bbox)gt_class_ids = Variable(gt_class_ids)gt_boxes = Variable(gt_boxes)gt_masks = Variable(gt_masks)# To GPUif self.config.GPU_COUNT:images = images.cuda()rpn_match = rpn_match.cuda()rpn_bbox = rpn_bbox.cuda()gt_class_ids = gt_class_ids.cuda()gt_boxes = gt_boxes.cuda()gt_masks = gt_masks.cuda()# Run object detectionrpn_class_logits, rpn_pred_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask = \self.predict([images, image_metas, gt_class_ids, gt_boxes, gt_masks], mode='training')# Compute lossesrpn_class_loss, rpn_bbox_loss, mrcnn_class_loss, mrcnn_bbox_loss, mrcnn_mask_loss = compute_losses(rpn_match, rpn_bbox, rpn_class_logits, rpn_pred_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask)loss = rpn_class_loss + rpn_bbox_loss + mrcnn_class_loss + mrcnn_bbox_loss + mrcnn_mask_loss# Backpropagationloss.backward()   
#loss反向传播torch.nn.utils.clip_grad_norm(self.parameters(), 5.0) #梯度裁剪if (batch_count % self.config.BATCH_SIZE) == 0:  #每隔config.BATCH_SIZE个batch进行一次参数更新optimizer.step()   #参数更新optimizer.zero_grad()  #梯度清零batch_count = 0  # ProgressprintProgressBar(step + 1, steps, prefix="\t{}/{}".format(step + 1, steps),suffix="Complete - loss: {:.5f} - rpn_class_loss: {:.5f} - rpn_bbox_loss: {:.5f} - mrcnn_class_loss: {:.5f} - mrcnn_bbox_loss: {:.5f} - mrcnn_mask_loss: {:.5f}".format(loss.data.cpu()[0], rpn_class_loss.data.cpu()[0], rpn_bbox_loss.data.cpu()[0],mrcnn_class_loss.data.cpu()[0], mrcnn_bbox_loss.data.cpu()[0],mrcnn_mask_loss.data.cpu()[0]), length=10)# Statisticsloss_sum += loss.data.cpu()[0]/stepsloss_rpn_class_sum += rpn_class_loss.data.cpu()[0]/stepsloss_rpn_bbox_sum += rpn_bbox_loss.data.cpu()[0]/stepsloss_mrcnn_class_sum += mrcnn_class_loss.data.cpu()[0]/stepsloss_mrcnn_bbox_sum += mrcnn_bbox_loss.data.cpu()[0]/stepsloss_mrcnn_mask_sum += mrcnn_mask_loss.data.cpu()[0]/steps# Break after 'steps' stepsif step==steps-1:breakstep += 1 #训练步数加1return loss_sum, loss_rpn_class_sum, loss_rpn_bbox_sum, loss_mrcnn_class_sum, loss_mrcnn_bbox_sum, loss_mrcnn_mask_sumdef valid_epoch(self, datagenerator, steps):step = 0loss_sum = 0loss_rpn_class_sum = 0loss_rpn_bbox_sum = 0loss_mrcnn_class_sum = 0loss_mrcnn_bbox_sum = 0loss_mrcnn_mask_sum = 0for inputs in datagenerator:images = inputs[0]image_metas = inputs[1]rpn_match = inputs[2]rpn_bbox = inputs[3]gt_class_ids = inputs[4]gt_boxes = inputs[5]gt_masks = inputs[6]# image_metas as numpy arrayimage_metas = image_metas.numpy()# Wrap in variablesimages = Variable(images, volatile=True)rpn_match = Variable(rpn_match, volatile=True)rpn_bbox = Variable(rpn_bbox, volatile=True)gt_class_ids = Variable(gt_class_ids, volatile=True)gt_boxes = Variable(gt_boxes, volatile=True)gt_masks = Variable(gt_masks, volatile=True)# To GPUif self.config.GPU_COUNT:images = images.cuda()rpn_match = rpn_match.cuda()rpn_bbox = rpn_bbox.cuda()gt_class_ids = gt_class_ids.cuda()gt_boxes = gt_boxes.cuda()gt_masks = gt_masks.cuda()# Run object detectionrpn_class_logits, rpn_pred_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask = \self.predict([images, image_metas, gt_class_ids, gt_boxes, gt_masks], mode='training')if not target_class_ids.size():continue# Compute lossesrpn_class_loss, rpn_bbox_loss, mrcnn_class_loss, mrcnn_bbox_loss, mrcnn_mask_loss = compute_losses(rpn_match, rpn_bbox, rpn_class_logits, rpn_pred_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask)loss = rpn_class_loss + rpn_bbox_loss + mrcnn_class_loss + mrcnn_bbox_loss + mrcnn_mask_loss# Progress# 花里胡哨# 只要知道下面这条代码是打印每个step的lossprintProgressBar(step + 1, steps, prefix="\t{}/{}".format(step + 1, steps),suffix="Complete - loss: {:.5f} - rpn_class_loss: {:.5f} - rpn_bbox_loss: {:.5f} - mrcnn_class_loss: {:.5f} - mrcnn_bbox_loss: {:.5f} - mrcnn_mask_loss: {:.5f}".format(loss.data.cpu()[0], rpn_class_loss.data.cpu()[0], rpn_bbox_loss.data.cpu()[0],mrcnn_class_loss.data.cpu()[0], mrcnn_bbox_loss.data.cpu()[0],mrcnn_mask_loss.data.cpu()[0]), length=10)# Statisticsloss_sum += loss.data.cpu()[0]/stepsloss_rpn_class_sum += rpn_class_loss.data.cpu()[0]/stepsloss_rpn_bbox_sum += rpn_bbox_loss.data.cpu()[0]/stepsloss_mrcnn_class_sum += mrcnn_class_loss.data.cpu()[0]/stepsloss_mrcnn_bbox_sum += mrcnn_bbox_loss.data.cpu()[0]/stepsloss_mrcnn_mask_sum += mrcnn_mask_loss.data.cpu()[0]/steps# Break after 'steps' stepsif step==steps-1:breakstep += 
1return loss_sum, loss_rpn_class_sum, loss_rpn_bbox_sum, loss_mrcnn_class_sum, loss_mrcnn_bbox_sum, loss_mrcnn_mask_sumdef mold_inputs(self, images):"""Takes a list of images and modifies them to the format expectedas an input to the neural network.images: List of image matricies [height,width,depth]. Images can havedifferent sizes.Returns 3 Numpy matricies:molded_images: [N, h, w, 3]. Images resized and normalized.image_metas: [N, length of meta data]. Details about each image.windows: [N, (y1, x1, y2, x2)]. The portion of the image that has theoriginal image (padding excluded)."""molded_images = []image_metas = []windows = []for image in images:'''依次将每张图片进行尺寸处理,然后做平均值平移'''# Resize image to fit the model expected size# TODO: move resizing to mold_image()molded_image, window, scale, padding = utils.resize_image(image,min_dim=self.config.IMAGE_MIN_DIM,max_dim=self.config.IMAGE_MAX_DIM,padding=self.config.IMAGE_PADDING)molded_image = mold_image(molded_image, self.config)#将图片减去平均值# Build image_metaimage_meta = compose_image_meta(0, image.shape, window,np.zeros([self.config.NUM_CLASSES], dtype=np.int32))#compose_image_meta()将括号内的数据打包成一个元组# Appendmolded_images.append(molded_image)windows.append(window)image_metas.append(image_meta)# Pack into arraysmolded_images = np.stack(molded_images) #将每张图片拼成一个大矩阵image_metas = np.stack(image_metas)windows = np.stack(windows)return molded_images, image_metas, windowsdef unmold_detections(self, detections, mrcnn_mask, image_shape, window):"""Reformats the detections of one image from the format of the neuralnetwork output to a format suitable for use in the rest of theapplication.detections: [N, (y1, x1, y2, x2, class_id, score)]  #(N,6)mrcnn_mask: [N, height, width, num_classes]         image_shape: [height, width, depth] Original size of the image before resizingwindow: [y1, x1, y2, x2] Box in the image where the real image isexcluding the padding.Returns:boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixelsclass_ids: [N] Integer class IDs for each bounding boxscores: [N] Float probability scores of the class_idmasks: [height, width, num_instances] Instance masks"""# How many detections do we have?# Detections array is padded with zeros. 
Find the first class_id == 0.zero_ix = np.where(detections[:, 4] == 0)[0]   #找出类别为0的index#如果存检测类别为0的,则将检测为0的分数最高的那个index赋值给N(因为detections的中的class_ids是按照score从高到低的顺序排的)#如果不存在检测为0的,则将检测框的个数赋值给NN = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0]# Extract boxes, class_ids, scores, and class-specific masksboxes = detections[:N, :4]    #取出前N个框class_ids = detections[:N, 4].astype(np.int32)  #取出前N个类别scores = detections[:N, 5]   #取出钱N个分数#masks(N,hight,width,num_classes)#每一个mask预测一个物体#这里有N个mask就会预测N个物体#那么mask预测的物体它是什么类别呢?#事实上每个mask会把所有的类别都预测并且给他的预测结果打分,这也是一个优点消除了类间竞争#至于最终mask会把它判断成什么类别是由这对应的预测框的class_ids来决定的#所以我们只要把masks中队class_ids进行打分的那个mask取出来就行#比如在苹果,香蕉,梨子,已知一个box预测的是苹果,对应的mask相对于有3个预测器#每个预测器分别对苹果,香蕉,梨子进行打分,而我们只关心对苹果进行打分的那个预测器打出的分数masks = mrcnn_mask[np.arange(N), :, :, class_ids]  #从每一张图中选择一个出前N个class_ids的mask#(N,hight,width)# Compute scale and shift to translate coordinates to image domain.# img1:原始图像# img2: 经过放缩之后的图像# img3: 经过放缩之后的图像,并且还进行了边缘填充,window是将img2贴的到img3上按照左下角对齐后img2在img3的坐标#仔细品品window的坐标(top_pad, left_pad, h + top_pad, w + left_pad),#img3的宽高是在img2的基础上加上pad后的长宽#通过上面的描述img2和window的其实是一个东西,只是他们的坐标的表示做了一个平移#window的长宽与img2的长宽相等h_scale = image_shape[0] / (window[2] - window[0]) #这里是原图在高度上进行放缩的比例因子w_scale = image_shape[1] / (window[3] - window[1]) #这里是原图在宽度上进行放缩的比例因子  scale = min(h_scale, w_scale) #比较两个放缩因子谁更小shift = window[:2]  # y, x    #获取top_pad,left_padscales = np.array([scale, scale, scale, scale])shifts = np.array([shift[0], shift[1], shift[0], shift[1]])# Translate bounding boxes to image domain# 将box以相反的缩放和平移到元素图像上去boxes = np.multiply(boxes - shifts, scales).astype(np.int32)# Filter out detections with zero area. Often only happens in early# stages of training when the network weights are still a bit random.# 进一步筛选取出合格的边框exclude_ix = np.where((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]if exclude_ix.shape[0] > 0:boxes = np.delete(boxes, exclude_ix, axis=0)class_ids = np.delete(class_ids, exclude_ix, axis=0)scores = np.delete(scores, exclude_ix, axis=0)masks = np.delete(masks, exclude_ix, axis=0)N = class_ids.shape[0]# Resize masks to original image size and set boundary threshold.full_masks = []for i in range(N):# Convert neural network mask to full size maskfull_mask = utils.unmold_mask(masks[i], boxes[i], image_shape)full_masks.append(full_mask)if full_masks:full_masks = np.stack(full_masks, axis=-1)else:full_masks = np.empty((0,) + masks.shape[1:3])return boxes, class_ids, scores, full_masks############################################################
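The advanced indexing in unmold_detections (masks = mrcnn_mask[np.arange(N), :, :, class_ids]) is easy to misread. A minimal numpy sketch, with made-up shapes and class ids, shows that it pairs detection i with the mask channel of its predicted class:

import numpy as np

# Hypothetical head output: 3 detections, 28x28 masks, 5 classes
N, H, W, num_classes = 3, 28, 28, 5
mrcnn_mask = np.random.rand(N, H, W, num_classes)
class_ids = np.array([2, 4, 1])  # class predicted for each detection box

# Pair detection i with mask channel class_ids[i]
masks = mrcnn_mask[np.arange(N), :, :, class_ids]
print(masks.shape)  # (3, 28, 28)

# Equivalent explicit loop, for comparison
masks_loop = np.stack([mrcnn_mask[i, :, :, class_ids[i]] for i in range(N)])
assert np.allclose(masks, masks_loop)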
############################################################
#  Data Formatting
############################################################

def compose_image_meta(image_id, image_shape, window, active_class_ids):
    """Takes attributes of an image and puts them in one 1D array. Use
    parse_image_meta() to parse the values back.

    image_id: An int ID of the image. Useful for debugging.
    image_shape: [height, width, channels]
    window: (y1, x1, y2, x2) in pixels. The area of the image where the real
            image is (excluding the padding)
    active_class_ids: List of class_ids available in the dataset from which
        the image came. Useful if training on images from multiple datasets
        where not all classes are present in all datasets.
    """
    meta = np.array(
        [image_id] +            # size=1
        list(image_shape) +     # size=3
        list(window) +          # size=4 (y1, x1, y2, x2) in image coordinates
        list(active_class_ids)  # size=num_classes
    )
    return meta


# Two functions (for Numpy and TF) to parse image_meta tensors.
def parse_image_meta(meta):
    """Parses an image info Numpy array to its components.
    See compose_image_meta() for more details.
    """
    image_id = meta[:, 0]
    image_shape = meta[:, 1:4]
    window = meta[:, 4:8]   # (y1, x1, y2, x2) window of image in pixels
    active_class_ids = meta[:, 8:]
    return image_id, image_shape, window, active_class_ids


def parse_image_meta_graph(meta):
    """Parses a tensor that contains image attributes to its components.
    See compose_image_meta() for more details.

    meta: [batch, meta length] where meta length depends on NUM_CLASSES
    """
    image_id = meta[:, 0]
    image_shape = meta[:, 1:4]
    window = meta[:, 4:8]
    active_class_ids = meta[:, 8:]
    return [image_id, image_shape, window, active_class_ids]


def mold_image(images, config):
    """Takes RGB images with 0-255 values and subtracts
    the mean pixel and converts it to float. Expects image
    colors in RGB order.
    """
    return images.astype(np.float32) - config.MEAN_PIXEL


def unmold_image(normalized_images, config):
    """Takes a image normalized with mold() and returns the original."""
    return (normalized_images + config.MEAN_PIXEL).astype(np.uint8)
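As a sanity check on the meta layout, here is a small round trip through compose_image_meta and parse_image_meta. The window and class count are hypothetical values (81 classes, COCO's 80 plus background), and the two functions above are assumed to be in scope:

import numpy as np

meta = compose_image_meta(
    0,                                # image_id
    (1024, 1024, 3),                  # image_shape
    (128, 0, 896, 1024),              # window: 128px of vertical padding
    np.zeros([81], dtype=np.int32))   # active_class_ids
print(meta.shape)  # (89,) = 1 + 3 + 4 + 81

# parse_image_meta expects a batch dimension
image_id, image_shape, window, active_class_ids = parse_image_meta(meta[np.newaxis, :])
print(window)  # the (y1, x1, y2, x2) window of the single image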

 utils.py

"""
Mask R-CNN
Common utility functions and classes.

Copyright (c) 2017 Matterport, Inc.
Licensed under the MIT License (see LICENSE for details)
Written by Waleed Abdulla
"""import sys
import os
import math
import random
import numpy as np
import scipy.misc
import scipy.ndimage
import skimage.color
import skimage.io
import torch
import cv2

############################################################
#  Bounding Boxes
############################################################

def extract_bboxes(mask):
    """Compute bounding boxes from masks.
    mask: [height, width, num_instances]. Mask pixels are either 1 or 0.
    num_instances is the number of instances actually in the image; a mask
    element is 1 where the instance covers that pixel and 0 elsewhere.

    Returns: bbox array [num_instances, (y1, x1, y2, x2)].
    """
    # each instance mask is expected to be a filled block of 1s surrounded by 0s
    boxes = np.zeros([mask.shape[-1], 4], dtype=np.int32)  # (num_instances, 4)
    for i in range(mask.shape[-1]):
        m = mask[:, :, i]  # [height, width]
        # Bounding box.
        horizontal_indicies = np.where(np.any(m, axis=0))[0]  # indices of the columns of m that are not all zero
        vertical_indicies = np.where(np.any(m, axis=1))[0]    # indices of the rows of m that are not all zero
        if horizontal_indicies.shape[0]:
            x1, x2 = horizontal_indicies[[0, -1]]  # first and last entries of horizontal_indicies
            y1, y2 = vertical_indicies[[0, -1]]    # first and last entries of vertical_indicies (pixel positions)
            # x2 and y2 should not be part of the box. Increment by 1.
            x2 += 1
            y2 += 1  # the steps above locate the rectangular block of 1s in the mask
        else:
            # No mask for this instance. Might happen due to
            # resizing or cropping. Set bbox to zeros
            x1, x2, y1, y2 = 0, 0, 0, 0
        boxes[i] = np.array([y1, x1, y2, x2])
    return boxes.astype(np.int32)


def compute_iou(box, boxes, box_area, boxes_area):
    """Calculates IoU of the given box with the array of the given boxes.
    box: 1D vector [y1, x1, y2, x2]
    boxes: [boxes_count, (y1, x1, y2, x2)]
    box_area: float. the area of 'box'
    boxes_area: array of length boxes_count.

    Note: the areas are passed in rather than calculated here for
          efficiency. Calculate once in the caller to avoid duplicate work.
    """
    # Calculate intersection areas
    y1 = np.maximum(box[0], boxes[:, 0])
    y2 = np.minimum(box[2], boxes[:, 2])
    x1 = np.maximum(box[1], boxes[:, 1])
    x2 = np.minimum(box[3], boxes[:, 3])
    intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
    union = box_area + boxes_area[:] - intersection[:]
    iou = intersection / union
    return iou


def compute_overlaps(boxes1, boxes2):
    """Computes IoU overlaps between two sets of boxes.
    boxes1, boxes2: [N, (y1, x1, y2, x2)].

    For better performance, pass the largest set first and the smaller second.
    """
    # Areas of anchors and GT boxes
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    # Compute overlaps to generate matrix [boxes1 count, boxes2 count]
    # Each cell contains the IoU value.
    overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0]))
    for i in range(overlaps.shape[1]):
        box2 = boxes2[i]
        overlaps[:, i] = compute_iou(box2, boxes1, area2[i], area1)
    return overlaps


def box_refinement(box, gt_box):
    """Compute refinement needed to transform box to gt_box.
    box and gt_box are [N, (y1, x1, y2, x2)]
    """
    height = box[:, 2] - box[:, 0]
    width = box[:, 3] - box[:, 1]
    center_y = box[:, 0] + 0.5 * height
    center_x = box[:, 1] + 0.5 * width

    gt_height = gt_box[:, 2] - gt_box[:, 0]
    gt_width = gt_box[:, 3] - gt_box[:, 1]
    gt_center_y = gt_box[:, 0] + 0.5 * gt_height
    gt_center_x = gt_box[:, 1] + 0.5 * gt_width

    dy = (gt_center_y - center_y) / height
    dx = (gt_center_x - center_x) / width
    dh = torch.log(gt_height / height)
    dw = torch.log(gt_width / width)

    result = torch.stack([dy, dx, dh, dw], dim=1)
    return result
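A quick hand-checked toy example (values chosen for illustration, assuming the helpers above are in scope) ties extract_bboxes and compute_overlaps together:

import numpy as np

# One instance mask on an 8x8 canvas: a 3x4 block of ones
mask = np.zeros((8, 8, 1), dtype=np.uint8)
mask[2:5, 1:5, 0] = 1
print(extract_bboxes(mask))  # [[2 1 5 5]], with exclusive y2/x2

# IoU of the recovered box against itself and a shifted copy
boxes1 = np.array([[2, 1, 5, 5]], dtype=np.float32)
boxes2 = np.array([[2, 1, 5, 5], [3, 2, 6, 6]], dtype=np.float32)
print(compute_overlaps(boxes1, boxes2))  # [[1.0, 1/3]]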
############################################################
#  Dataset
############################################################

class Dataset(object):
    """The base class for dataset classes.
    To use it, create a new class that adds functions specific to the dataset
    you want to use. For example:

    class CatsAndDogsDataset(Dataset):
        def load_cats_and_dogs(self):
            ...
        def load_mask(self, image_id):
            ...
        def image_reference(self, image_id):
            ...

    See COCODataset and ShapesDataset as examples.
    """

    def __init__(self, class_map=None):
        self._image_ids = []
        self.image_info = []
        # Background is always the first class
        self.class_info = [{"source": "", "id": 0, "name": "BG"}]
        self.source_class_ids = {}

    def add_class(self, source, class_id, class_name):
        assert "." not in source, "Source name cannot contain a dot"
        # Does the class exist already?
        for info in self.class_info:
            if info['source'] == source and info["id"] == class_id:
                # source.class_id combination already available, skip
                # (e.g. adding source "" with id 0 again is abandoned here)
                return
        # Add the class
        self.class_info.append({
            "source": source,
            "id": class_id,
            "name": class_name,
        })  # during training, add_class runs once per class in the dataset (e.g. 20 times for 20 classes)

    def add_image(self, source, image_id, path, **kwargs):
        image_info = {
            "id": image_id,    # the order in which the image was read in
            "source": source,  # which dataset the image belongs to
            "path": path,
        }
        image_info.update(kwargs)  # several extra dict entries can be passed in here
        self.image_info.append(image_info)  # add_image runs once per image in the dataset

    def image_reference(self, image_id):
        """Return a link to the image in its source Website or details about
        the image that help looking it up or debugging it.

        Override for your dataset, but pass to this function
        if you encounter images not in your dataset.
        """
        return ""

    def prepare(self, class_map=None):
        """Prepares the Dataset class for use.

        TODO: class map is not supported yet. When done, it should handle mapping
              classes from different datasets to the same class ID.
        """
        def clean_name(name):
            """Returns a shorter version of object names for cleaner display."""
            return ",".join(name.split(",")[:1])

        # Build (or rebuild) everything else from the info dicts.
        self.num_classes = len(self.class_info)  # number of classes added to the dataset, say 20
        self.class_ids = np.arange(self.num_classes)  # a sequence [0, 1, 2, ..., 19]
        self.class_names = [clean_name(c["name"]) for c in self.class_info]  # shortened class names for display
        self.num_images = len(self.image_info)  # number of images added to the dataset
        self._image_ids = np.arange(self.num_images)  # an id sequence for the images, in order
        self.class_from_source_map = {"{}.{}".format(info['source'], info['id']): id
                                      for info, id in zip(self.class_info, self.class_ids)}  # map from (source, source id) to internal class id

        # Map sources to class_ids they support
        self.sources = list(set([i['source'] for i in self.class_info]))  # deduplicate the sources
        self.source_class_ids = {}  # records, per source, the class ids it provides
        # Loop over datasets
        for source in self.sources:
            self.source_class_ids[source] = []
            # Find classes that belong to this dataset
            for i, info in enumerate(self.class_info):
                # Include BG class in all datasets
                # if i == 0 or source == info['source']:  # the original code reads like this; it arguably should be the line below
                if source == info['source']:  # reason: see the notes in model.py where self.source_class_ids is used
                    self.source_class_ids[source].append(i)

    def map_source_class_id(self, source_class_id):
        """Takes a source class ID and returns the int class ID assigned to it.

        For example:
        dataset.map_source_class_id("coco.12") -> 23
        """
        return self.class_from_source_map[source_class_id]

    def get_source_class_id(self, class_id, source):
        """Map an internal class ID to the corresponding class ID in the source dataset."""
        info = self.class_info[class_id]
        assert info['source'] == source
        return info['id']

    def append_data(self, class_info, image_info):
        self.external_to_class_id = {}
        for i, c in enumerate(self.class_info):
            for ds, id in c["map"]:
                self.external_to_class_id[ds + str(id)] = i

        # Map external image IDs to internal ones.
        self.external_to_image_id = {}
        for i, info in enumerate(self.image_info):
            self.external_to_image_id[info["ds"] + str(info["id"])] = i

    @property
    def image_ids(self):
        return self._image_ids

    def source_image_link(self, image_id):
        """Returns the path or URL to the image.

        Override this to return a URL to the image if it's available online for easy
        debugging.
        """
        return self.image_info[image_id]["path"]

    def load_image(self, image_id):
        """Load the specified image and return a [H,W,3] Numpy array."""
        # Load image
        image = cv2.imread(self.image_info[image_id]['path'])  # numpy array in BGR order
        image = image[:, :, ::-1]  # convert BGR to RGB to match the documented [H, W, 3] RGB format
        return image

    def load_mask(self, image_id):
        """Load instance masks for the given image.

        Different datasets use different ways to store masks. Override this
        method to load instance masks and return them in the form of an
        array of binary masks of shape [height, width, instances].

        Returns:
            masks: A bool array of shape [height, width, instance count] with
                a binary mask per instance.
            class_ids: a 1D array of class IDs of the instance masks.
        """
        # Override this function to load a mask from your dataset.
        # Otherwise, it returns an empty mask.
        # (this method must be overridden to load your own dataset's masks)
        mask = np.empty([0, 0, 0])
        class_ids = np.empty([0], np.int32)
        return mask, class_ids


def resize_image(image, min_dim=None, max_dim=None, padding=False):
    """Resizes an image keeping the aspect ratio.

    min_dim: if provided, resizes the image such that it's smaller
        dimension == min_dim
    max_dim: if provided, ensures that the image longest side doesn't
        exceed this value.
    padding: If true, pads image with zeros so it's size is max_dim x max_dim

    Returns:
    image: the resized image
    window: (y1, x1, y2, x2). If max_dim is provided, padding might
        be inserted in the returned image. If so, this window is the
        coordinates of the image part of the full image (excluding
        the padding). The x2, y2 pixels are not included.
    scale: The scale factor used to resize the image
    padding: Padding added to the image [(top, bottom), (left, right), (0, 0)]
    """
    # Default window (y1, x1, y2, x2) and default scale == 1.
    h, w = image.shape[:2]  # height and width of the image array
    window = (0, 0, h, w)
    scale = 1

    # Scale?
    if min_dim:  # (e.g. 800)
        # Scale up but not down
        # (the original code assigned this to a stray variable scale1,
        # which was never used below; it should be scale)
        scale = max(1, min_dim / min(h, w))  # if the smaller side already exceeds min_dim, no upscaling is needed
    # Does it exceed max dim?
    if max_dim:  # (e.g. 1024)
        image_max = max(h, w)
        if round(image_max * scale) > max_dim:  # if the scaled longest side exceeds max_dim, shrink to fit
            scale = max_dim / image_max
    # Resize image and mask
    if scale != 1:
        image = scipy.misc.imresize(
            image, (round(h * scale), round(w * scale)))
    # Need padding?
    if padding:
        # Get new height and width
        h, w = image.shape[:2]
        top_pad = (max_dim - h) // 2
        bottom_pad = max_dim - h - top_pad
        left_pad = (max_dim - w) // 2
        right_pad = max_dim - w - left_pad
        padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
        image = np.pad(image, padding, mode='constant', constant_values=0)
        window = (top_pad, left_pad, h + top_pad, w + left_pad)  # top-left and bottom-right of the original image after padding
    return image, window, scale, padding


def resize_mask(mask, scale, padding):
    """Resizes a mask using the given scale and padding.
    Typically, you get the scale and padding from resize_image() to
    ensure both, the image and the mask, are resized consistently.

    scale: mask scaling factor
    padding: Padding to add to the mask in the form
            [(top, bottom), (left, right), (0, 0)]
    """
    h, w = mask.shape[:2]
    # zoom holds the scaling factor per dimension
    # order=0: nearest-neighbour; order=1: bilinear; order=3: cubic interpolation
    mask = scipy.ndimage.zoom(mask, zoom=[scale, scale, 1], order=0)
    mask = np.pad(mask, padding, mode='constant', constant_values=0)
    return mask


def minimize_mask(bbox, mask, mini_shape):
    """Resize masks to a smaller version to cut memory load.
    Mini-masks can then resized back to image scale using expand_masks()

    See inspect_data.ipynb notebook for more details.
    """
    # mini_shape = (56, 56)
    # mask.shape[-1] is the number of masks the image has
    mini_mask = np.zeros(mini_shape + (mask.shape[-1],), dtype=bool)
    for i in range(mask.shape[-1]):
        m = mask[:, :, i]
        y1, x1, y2, x2 = bbox[i][:4]
        m = m[y1:y2, x1:x2]  # first crop the mask down to the box's height and width
        if m.size == 0:
            raise Exception("Invalid bounding box with area of zero")
        m = scipy.misc.imresize(m.astype(float), mini_shape, interp='bilinear')
        mini_mask[:, :, i] = np.where(m >= 128, 1, 0)  # binarize the image
    return mini_mask


def expand_mask(bbox, mini_mask, image_shape):
    """Resizes mini masks back to image size. Reverses the change
    of minimize_mask().

    See inspect_data.ipynb notebook for more details.
    """
    mask = np.zeros(image_shape[:2] + (mini_mask.shape[-1],), dtype=bool)
    for i in range(mask.shape[-1]):
        m = mini_mask[:, :, i]
        y1, x1, y2, x2 = bbox[i][:4]
        h = y2 - y1
        w = x2 - x1
        m = scipy.misc.imresize(m.astype(float), (h, w), interp='bilinear')
        mask[y1:y2, x1:x2, i] = np.where(m >= 128, 1, 0)
    return mask


# TODO: Build and use this function to reduce code duplication
def mold_mask(mask, config):
    pass


def unmold_mask(mask, bbox, image_shape):
    """Converts a mask generated by the neural network into a format similar
    to it's original shape.
    mask: [height, width] of type float. A small, typically 28x28 mask.
    bbox: [y1, x1, y2, x2]. The box to fit the mask in.

    Returns a binary mask with the same size as the original image.
    """
    threshold = 0.5
    y1, x1, y2, x2 = bbox
    # resize the fixed-size mask to the box's size
    mask = scipy.misc.imresize(
        mask, (y2 - y1, x2 - x1), interp='bilinear').astype(np.float32) / 255.0
    mask = np.where(mask >= threshold, 1, 0).astype(np.uint8)

    # Put the mask in the right location.
    # Create an all-zero image of the original image size, then paste the mask
    # into it at the box's position
    full_mask = np.zeros(image_shape[:2], dtype=np.uint8)
    full_mask[y1:y2, x1:x2] = mask
    return full_mask
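To make the window bookkeeping in resize_image concrete, the same arithmetic can be traced by hand. This is only a sketch for a hypothetical 600x800 input with min_dim=800, max_dim=1024, padding=True; it repeats the formulas above rather than calling the function:

h, w = 600, 800
scale = max(1, 800 / min(h, w))          # 800/600, scale the short side up toward 800
if round(max(h, w) * scale) > 1024:      # 800 * 1.333... rounds to 1067 > 1024
    scale = 1024 / max(h, w)             # shrink instead: scale = 1.28
new_h, new_w = round(h * scale), round(w * scale)  # (768, 1024)
top_pad = (1024 - new_h) // 2            # 128
window = (top_pad, 0, new_h + top_pad, 1024)
print(scale, (new_h, new_w), window)     # 1.28 (768, 1024) (128, 0, 896, 1024)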
############################################################
#  Anchors
############################################################

def generate_anchors(scales, ratios, shape, feature_stride, anchor_stride):
    """
    scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
    ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
    shape: [height, width] spatial shape of the feature map over which
            to generate anchors.
    feature_stride: Stride of the feature map relative to the image in pixels.
    anchor_stride: Stride of anchors on the feature map. For example, if the
        value is 2 then generate anchors for every other feature map pixel.
    """
    # Get all combinations of scales and ratios
    scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
    # array([[32],        array([[0.5],
    #        [32],               [1.],
    #        [32]])              [2.]])
    scales = scales.flatten()  # array([32, 32, 32])
    ratios = ratios.flatten()  # array([0.5, 1., 2.])

    # Enumerate heights and widths from scales and ratios
    heights = scales / np.sqrt(ratios)  # (3,)
    widths = scales * np.sqrt(ratios)   # (3,)

    # Enumerate shifts in feature space
    # shape is the feature map's height/width and anchor_stride the step at
    # which it is sampled, so one feature map yields
    # (shape[0] // anchor_stride) * (shape[1] // anchor_stride) grid points;
    # multiplying by feature_stride maps those points back to image coordinates
    shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
    shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
    shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)

    # Enumerate combinations of shifts, widths, and heights
    # tile widths once per shift, and repeat every element of
    # shifts_x once per width, then stack them
    box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
    box_heights, box_centers_y = np.meshgrid(heights, shifts_y)

    # Reshape to get a list of (y, x) and a list of (h, w)
    box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
    box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])

    # Convert to corner coordinates (y1, x1, y2, x2)
    boxes = np.concatenate([box_centers - 0.5 * box_sizes,
                            box_centers + 0.5 * box_sizes], axis=1)
    return boxes


def generate_pyramid_anchors(scales, ratios, feature_shapes, feature_strides,
                             anchor_stride):
    """Generate anchors at different levels of a feature pyramid. Each scale
    is associated with a level of the pyramid, but each ratio is used in
    all levels of the pyramid.

    Returns:
    anchors: [N, (y1, x1, y2, x2)]. All generated anchors in one array. Sorted
        with the same order of the given scales. So, anchors of scale[0] come
        first, then anchors of scale[1], and so on.
    """
    # Anchors
    # [anchor_count, (y1, x1, y2, x2)]
    anchors = []
    for i in range(len(scales)):
        anchors.append(generate_anchors(scales[i], ratios, feature_shapes[i],
                                        feature_strides[i], anchor_stride))
    return np.concatenate(anchors, axis=0)
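Assuming the two functions above are in scope, a short check with the usual Matterport config values (a 1024x1024 input, P2 to P6 strides of 4 to 64, three ratios, anchor_stride=1) recovers the familiar anchor count:

import numpy as np

scales = (32, 64, 128, 256, 512)
ratios = [0.5, 1, 2]
feature_shapes = [[256, 256], [128, 128], [64, 64], [32, 32], [16, 16]]
feature_strides = [4, 8, 16, 32, 64]

anchors = generate_pyramid_anchors(scales, ratios, feature_shapes,
                                   feature_strides, anchor_stride=1)
print(anchors.shape)  # (261888, 4): 3 anchors per cell, summed over all levels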

Full code (PyTorch version, with Chinese annotations). My skill is limited, so the GPU version of ROI Align is left unannotated; apologies.

Code mind map

GitHub source code (Keras version)