结合Keras代码透彻分析SSD网络

之前一直在做分类网路的工作，物体检测网络有了解，但是还没有从头训练过，这次通过几天时间的集中学习，终于理解了整个网络的大部分细节，这里作简要记录。

思维导图

简要总结

网络部分： 最终输出的是一个封装好的数据，封装的什么呢？三部分，1、每个default box的分类损失，2、每个default box的回归损失，3、生成的default box的坐标，长度为8，四个坐标加上四个varance值辅助训练。
数据部分：我么先读取训练数据，可以获得gt box的两个坐标以及所属类别，然后我们也知道，并不是多有的default box都参与训练，而是符合一定要求的才参与训练（IOU大于阈值，并且一个default box只对应一个gt box），经过筛选后，每个gt box会和一个或几个default box对应，此时我们称这些default box为prior box，这也是很多博客里面说的，default box是客观存在的，不管你输入是是什么，而prior box是实际采用的，因为随着图片的不同，gt box的位置不同，prior box也不同。最后再对原始的绝对坐标进行编码。
损失部分：损失是计算上述两者的输出，均是(-1,num_default_box,4+21+8)(在VOC2007下，base model为VGG16)，在损失这部分还有一个值得注意的地方就是hard mining，通过控制正负样本的比例来辅助训练。
代码

随后我会将代码(参考：Github)上传到我的github上，对比参考的代码，我只是在里面添加了中文注释，并根据最新版本的keras进行了API修改。

xml_processor.py

#coding:utf-8
from xml.etree import ElementTree
import numpy as np
import os
import pickle
class xml_processor(object):
    def __init__(self,data_path):
        self.path_prefix = data_path
        self.num_classes = 20 #20 + 1 bg
        self.data = dict()
        self._preprocess_XML()
	#这里读取xml的内容，并将坐标转化为[x_min,y_min,x_max,y_max]的格式
    def _preprocess_XML(self):
        filenames = os.listdir(self.path_prefix)
        for filename in filenames:
            tree = ElementTree.parse(os.path.join(self.path_prefix,filename))
            root = tree.getroot()
            bounding_boxes = []
            one_hot_classes = []
            size_tree = root.find('size')
            width = float(size_tree.find('width').text)
            height = float(size_tree.find('height').text)
            for object_tree in root.findall('object'):
                for bounding_box in object_tree.iter('bndbox'):
                    xmin = float(bounding_box.find('xmin').text) / width
                    ymin = float(bounding_box.find('ymin').text) / height
                    xmax = float(bounding_box.find('xmax').text) / width
                    ymax = float(bounding_box.find('ymax').text) / height
                bounding_box = [xmin, ymin, xmax, ymax]
                bounding_boxes.append(bounding_box)
                class_name = object_tree.find('name').text
                one_hot_class = self._to_one_hot(class_name)
                one_hot_classes.append(one_hot_class)
            image_name = root.find('filename').text
            bounding_boxes = np.asarray(bounding_boxes)
            one_hot_classes = np.asarray(one_hot_classes)
            image_data = np.hstack((bounding_boxes, one_hot_classes))
            self.data[image_name] = image_data

    def _to_one_hot(self, name):
        one_hot_vector = [0] * self.num_classes
        if name == 'aeroplane':
            one_hot_vector[0] = 1
        elif name == 'bicycle':
            one_hot_vector[1] = 1
        elif name == 'bird':
            one_hot_vector[2] = 1
        elif name == 'boat':
            one_hot_vector[3] = 1
        elif name == 'bottle':
            one_hot_vector[4] = 1
        elif name == 'bus':
            one_hot_vector[5] = 1
        elif name == 'car':
            one_hot_vector[6] = 1
        elif name == 'cat':
            one_hot_vector[7] = 1
        elif name == 'chair':
            one_hot_vector[8] = 1
        elif name == 'cow':
            one_hot_vector[9] = 1
        elif name == 'diningtable':
            one_hot_vector[10] = 1
        elif name == 'dog':
            one_hot_vector[11] = 1
        elif name == 'horse':
            one_hot_vector[12] = 1
        elif name == 'motorbike':
            one_hot_vector[13] = 1
        elif name == 'person':
            one_hot_vector[14] = 1
        elif name == 'pottedplant':
            one_hot_vector[15] = 1
        elif name == 'sheep':
            one_hot_vector[16] = 1
        elif name == 'sofa':
            one_hot_vector[17] = 1
        elif name == 'train':
            one_hot_vector[18] = 1
        elif name == 'tvmonitor':
            one_hot_vector[19] = 1
        else:
            print('unknown label: %s' % name)
        return one_hot_vector

if __name__ == '__main__':
    data = xml_processor('./VOC2007/Annotations').data
    print(data['000005.jpg'])
    pickle.dump(data,open('./pkl/VOC2007.pkl','wb'))

最后将读取到的数据（主要是文件名、类别和box坐标）设计成字典：{file_name: [num_box,4+20]}，其中4是四个坐标为物体的坐标，20为one_hot类型的类别标签。这是个通用的处理函数，一般voc格式的xml文件都可以使用。

ssd_train.py

def generate(self, train=True):
    while True:
        #根据模式选择数据集
        if train:
            shuffle(self.train_keys)
            keys = self.train_keys
        else:
            shuffle(self.val_keys)
            keys = self.val_keys

        inputs = []
        targets = []
        #开始遍历数据集
        for key in keys:
            #获得图片路径
            img_path = os.path.join(self.path_prefix,key)
            #读取图片
            img = imread(img_path).astype('float32')
            #获得当前图片的 gt boxes
            y = self.gt[key].copy()
            #训练模式下是否进行裁剪
            if train and self.do_crop:
                #此时的img,y已经更新
                img, y = self.random_sized_crop(img, y)
            #将图片resize到模型需要的尺寸
            img = imresize(img, self.image_size).astype('float32')

            #只在训练模式下进行
            if train:
                #self.color_jitter里面保存了需要做的颜色抖动的函数
                shuffle(self.color_jitter) #几个过程随机
                for jitter in self.color_jitter: #依次进行变换
                    img = jitter(img)
                #光噪声
                if self.lighting_std:
                    img = self.lighting(img)
                #随机水平翻转
                if self.hflip_prob > 0:
                    img, y = self.horizontal_flip(img, y)
                #随机垂直翻转
                if self.vflip_prob > 0:
                    img, y = self.vertical_flip(img, y)
            #
            y = self.bbox_util.assign_boxes(y) #输出的y后面加上了8列，表示的是prior box的8个属性
            inputs.append(img)
            targets.append(y)
            if len(targets) == self.batch_size:
                tmp_inp = np.array(inputs)
                tmp_targets = np.array(targets)
                inputs = []
                targets = []
                yield preprocess_input(tmp_inp), tmp_targets

这里面主要是y = self.bbox_util.assign_boxes(y)这句代码，作用是为gt box（ground truth box）配default box，为什么要分配呢？我们知道，default box的数量是根据feature maps的尺寸决定的，也就是不管你来的是哪张训练图片，图片中有几个gt box，default box就在那里，不增不减，但是我们并不会将所有的default box参与训练，我们只训练那些接近gt box的default box，这也是为了让网络能够顺利训练设计的，选取出来的default box又被成为prior box。

bbox_utils.py

class bbox_utils(object):

    def __init__(self,
                 num_classes,
                 priors=None,
                 overlap_threshold=0.5,
                 nms_thresh=0.45,
                 top_k=400):
		#......

    @property
    def nms_thresh(self):
        #......
    @nms_thresh.setter
    def nms_thresh(self, value):
        #......
    @property
    def top_k(self):
        #......
    @top_k.setter
    def top_k(self, value):
        #......
    def iou(self, box): #(7308,4)
		#......
    def encode_box(self, box, return_iou=True): #这里的box是gt
        iou = self.iou(box) #求所有box与default box的IOU大于阈值的default box，并作为prior box
        encoded_box = np.zeros((self.num_priors, 4 + return_iou))
        assign_mask = iou > self.overlap_threshold #(7308,)
        if not assign_mask.any(): #如果没有一个大于0.5的，那么就取最大的一个
            assign_mask[iou.argmax()] = True
        if return_iou:
            encoded_box[:, -1][assign_mask] = iou[assign_mask] #如果需要返回iou，则将prior box与gt的iou添加在最后一列
        assigned_priors = self.priors[assign_mask] #这里的assigned_priors其实就是已经过筛选符合要求的default box，之后就称之为prior box 形状为(-1,8)
        box_center = 0.5 * (box[:2] + box[2:]) #求gt box得到中心(cx,cy)
        box_wh = box[2:] - box[:2] #(cw,ch) gt box的宽高
        assigned_priors_center = 0.5 * (assigned_priors[:, :2] + assigned_priors[:, 2:4]) #求出所有prior box的中心
        assigned_priors_wh = (assigned_priors[:, 2:4] - assigned_priors[:, :2]) #求出所有prior box的宽高
        # we encode variance
        #求出了l_cx和l_cy
        encoded_box[:, :2][assign_mask] = box_center - assigned_priors_center
        encoded_box[:, :2][assign_mask] /= assigned_priors_wh
        #将误差计算在内，不太明白为什么这样做
        encoded_box[:, :2][assign_mask] /= assigned_priors[:, -4:-2]
        #求出l_w和l_h
        encoded_box[:, 2:4][assign_mask] = np.log(box_wh / assigned_priors_wh)
        encoded_box[:, 2:4][assign_mask] /= assigned_priors[:, -2:]
        return encoded_box.ravel()

    def assign_boxes(self, boxes):
       assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8)) #(7308,4+21+8)
        assignment[:, 4] = 1.0 #全部初始化为背景
        if len(boxes) == 0: #如果boxes为空，即图片中没有目标
            return assignment
        encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4]) #对box的坐标进行编码,其中encoded_boxes是ravel后的值，形状为(num_gt_box,7308*(4+1))
        encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5) #再变回来(num_gt_box,7308,5)
        #这里是为了满足一个prior box只对应一个gt box的要求，进行二次筛选
        best_iou = encoded_boxes[:, :, -1].max(axis=0) #iou最大的那个prior box (7308,)
        best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0) #最大iou的索引 (7308,)
        best_iou_mask = best_iou > 0 #best_iou_mask下的prior box才是最终的prior box

        best_iou_idx = best_iou_idx[best_iou_mask]
        assign_num = len(best_iou_idx)
        encoded_boxes = encoded_boxes[:, best_iou_mask, :] #(1,?,5)
        assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx,
                                           np.arange(assign_num),
                                           :4]
        assignment[:, 4][best_iou_mask] = 0 #这里将所有prior box的背景那个类别置为0，因为只要是prior box肯定不属于背景
        assignment[:, 5:-8][best_iou_mask] = boxes[best_iou_idx, 4:] #这里是one hot类型的概率
        assignment[:, -8][best_iou_mask] = 1 #最后的8列没有意义，只是为了使shape与网络输出的shape保持一致
        return assignment

    def decode_boxes(self, mbox_loc, mbox_priorbox, variances):
        prior_width = mbox_priorbox[:, 2] - mbox_priorbox[:, 0]
        prior_height = mbox_priorbox[:, 3] - mbox_priorbox[:, 1]
        prior_center_x = 0.5 * (mbox_priorbox[:, 2] + mbox_priorbox[:, 0])
        prior_center_y = 0.5 * (mbox_priorbox[:, 3] + mbox_priorbox[:, 1])
        decode_bbox_center_x = mbox_loc[:, 0] * prior_width * variances[:, 0]
        decode_bbox_center_x += prior_center_x
        decode_bbox_center_y = mbox_loc[:, 1] * prior_width * variances[:, 1]
        decode_bbox_center_y += prior_center_y
        decode_bbox_width = np.exp(mbox_loc[:, 2] * variances[:, 2])
        decode_bbox_width *= prior_width
        decode_bbox_height = np.exp(mbox_loc[:, 3] * variances[:, 3])
        decode_bbox_height *= prior_height
        decode_bbox_xmin = decode_bbox_center_x - 0.5 * decode_bbox_width
        decode_bbox_ymin = decode_bbox_center_y - 0.5 * decode_bbox_height
        decode_bbox_xmax = decode_bbox_center_x + 0.5 * decode_bbox_width
        decode_bbox_ymax = decode_bbox_center_y + 0.5 * decode_bbox_height
        decode_bbox = np.concatenate((decode_bbox_xmin[:, None],
                                      decode_bbox_ymin[:, None],
                                      decode_bbox_xmax[:, None],
                                      decode_bbox_ymax[:, None]), axis=-1)
        decode_bbox = np.minimum(np.maximum(decode_bbox, 0.0), 1.0)
        return decode_bbox

    def detection_out(self, predictions, background_label_id=0, keep_top_k=200,
                      confidence_threshold=0.01):
        mbox_loc = predictions[:, :, :4]
        variances = predictions[:, :, -4:]
        mbox_priorbox = predictions[:, :, -8:-4]
        mbox_conf = predictions[:, :, 4:-8]
        results = []
        for i in range(len(mbox_loc)):
            results.append([])
            decode_bbox = self.decode_boxes(mbox_loc[i],
                                            mbox_priorbox[i], variances[i])
            for c in range(self.num_classes):
                if c == background_label_id:
                    continue
                c_confs = mbox_conf[i, :, c]
                c_confs_m = c_confs > confidence_threshold
                if len(c_confs[c_confs_m]) > 0:
                    boxes_to_process = decode_bbox[c_confs_m]
                    confs_to_process = c_confs[c_confs_m]
                    feed_dict = {self.boxes: boxes_to_process,
                                 self.scores: confs_to_process}
                    idx = self.sess.run(self.nms, feed_dict=feed_dict)
                    good_boxes = boxes_to_process[idx]
                    confs = confs_to_process[idx][:, None]
                    labels = c * np.ones((len(idx), 1))
                    c_pred = np.concatenate((labels, confs, good_boxes),
                                            axis=1)
                    results[-1].extend(c_pred)
            if len(results[-1]) > 0:
                results[-1] = np.array(results[-1])
                argsort = np.argsort(results[-1][:, 1])[::-1]
                results[-1] = results[-1][argsort]
                results[-1] = results[-1][:keep_top_k]
        return results