# docs/config_example/ssd_vgg16_300.yml

# Architecture of detection, which is also the prefix of data feed module.
architecture: SSD
# Data feed modules (each one is defined further down in this file).
# Data feed in training.
train_feed: SSDTrainFeed
# Data feed in evaluation.
eval_feed: SSDEvalFeed
# Data feed in inference.
test_feed: SSDTestFeed
# Use GPU or CPU, true by default.
use_gpu: true
# Maximum number of iterations.
max_iters: 400000
# Snapshot period. If evaluation runs alongside training, the model is
# evaluated at each snapshot_iter. 10000 by default.
snapshot_iter: 10000
# Smooth the log output over the specified number of iterations, 20 by default.
log_smooth_window: 20
# The training log is displayed once every log_iter iterations.
log_iter: 20
# Evaluation method, COCO and VOC are available.
metric: COCO
# mAP calculation method for the VOC metric, 11point and integral are available.
map_type: 11point
# The path or URL of the pretrained weights (a VGG16 archive here).
pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar
# The directory to save models.
save_dir: output
# The path of the final model for evaluation and test.
weights: output/ssd_vgg16_300/model_final
# Number of classes, 81 for COCO and 21 for VOC.
num_classes: 81

# SSD architecture, see https://arxiv.org/abs/1512.02325
SSD:
  # Backbone instance, defined below.
  backbone: VGG
  # `MultiBoxHead` instance, defined below.
  multi_box_head: MultiBoxHead

  # fluid.layers.detection_output, Detection Output Layer for SSD.
  # This operation gets the detection results by performing the following two steps:
  #   1. Decode input bounding box predictions according to the prior boxes.
  #   2. Get the final detection results by applying multi-class non maximum suppression (NMS).
  # This operation doesn't clip the final output bounding boxes to the image window.
  output_decoder:
    # The index of background label, the background label will be ignored.
    # If set to -1, then all categories will be considered.
    background_label: 0
    # Number of total bboxes to be kept per image after NMS.
    keep_top_k: 200
    # The parameter for adaptive NMS.
    nms_eta: 1.0
    # The threshold to be used in NMS.
    nms_threshold: 0.45
    # Maximum number of detections to be kept according to the confidences
    # after filtering the detections based on score_threshold.
    nms_top_k: 400
    # Threshold to filter out bounding boxes with low confidence score.
    # If not provided, consider all boxes.
    score_threshold: 0.01

# VGG backbone, see https://arxiv.org/abs/1409.1556
VGG:
  # The VGG net depth (16 or 19).
  depth: 16
  # Whether or not extra blocks should be added.
  with_extra_blocks: true
  # Parameters of each extra block, one list per block:
  # [in_channel, out_channel, padding_size, stride_size, filter_size]
  extra_block_filters:
  - [256, 512, 1, 2, 3]
  - [128, 256, 1, 2, 3]
  - [128, 256, 0, 1, 3]
  - [128, 256, 0, 1, 3]
  # List of init scales for the L2 norm; the init scale is skipped where the param is -1.
  normalizations: [20., -1, -1, -1, -1, -1]

# fluid.layers.multi_box_head, generates prior boxes for the SSD algorithm.
# Generates `prior_box` according to the inputs list and other parameters.
# Each position of the input produces N prior boxes, where N is determined by
# the count of min_sizes, max_sizes and aspect_ratios. The size of the box
# is in the range(min_size, max_size) interval, which is generated in sequence
# according to the aspect_ratios.
MultiBoxHead:
  # The base_size is used to get min_size and max_size according to min_ratio and max_ratio.
  base_size: 300
  # The aspect ratios of generated prior boxes. The length of input and aspect_ratios must be equal.
  aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]]
  # The min ratio of generated prior boxes.
  min_ratio: 15
  # The max ratio of generated prior boxes.
  max_ratio: 90
  # If len(inputs) <= 2, min_sizes must be set up, and the length of min_sizes
  # should be equal to the length of inputs. Default: None.
  min_sizes: [30.0, 60.0, 111.0, 162.0, 213.0, 264.0]
  # If len(inputs) <= 2, max_sizes must be set up, and the length of max_sizes
  # should be equal to the length of inputs. Default: None.
  max_sizes: [60.0, 111.0, 162.0, 213.0, 264.0, 315.0]
  # If step_w and step_h are the same, step_w and step_h can be replaced by steps.
  steps: [8, 16, 32, 64, 100, 300]
  # Prior boxes center offset. Default: 0.5.
  offset: 0.5
  # Whether to flip aspect ratios. Default: False.
  flip: true
  # The kernel size of conv2d. Default: 1.
  kernel_size: 3
  # The padding of conv2d. Default: 0.
  pad: 1

# Learning rate configuration
LearningRate:
  # Base learning rate, 0.01 by default.
  base_lr: 0.001
  # Learning rate schedulers, PiecewiseDecay and LinearWarmup by default.
  schedulers:
  # fluid.layers.piecewise_decay
  # `values` has higher priority; if `values` is null, the learning rate is
  # multiplied by gamma at each stage.
  - !PiecewiseDecay
    gamma: 0.1
    milestones: [280000, 360000]
  # fluid.layers.linear_lr_warmup
  # The start learning rate equals base_lr * start_factor.
  - !LinearWarmup
    start_factor: 0.3333333333333333
    steps: 500

# Optimizer module
OptimizerBuilder:
  # fluid.optimizer. Training a neural network is in essence an optimization problem.
  # With forward computing and back propagation, the optimizer uses back-propagated
  # gradients to optimize the parameters of a neural network.
  optimizer:
    # The Momentum optimizer adds momentum on the basis of SGD,
    # reducing the noise problem in the process of stochastic gradient descent.
    momentum: 0.9
    type: Momentum
  # fluid.regularizer
  regularizer:
    # Implements the L2 weight decay regularization.
    # Small values of L2 can help prevent overfitting the training data.
    factor: 0.0005
    type: L2

# Data feed module for training
SSDTrainFeed:
  # Batch size per device
  batch_size: 16
  # List of batch transformations to use
  batch_transforms: []
  # The data buffer size
  bufsize: 10
  # Dataset module
  dataset:
    # Dataset directory
    dataset_dir: dataset/coco
    # Annotation file path
    annotation: annotations/instances_train2017.json
    # Directory where image files are stored
    image_dir: train2017
  # Drop last batch if size is uneven, false by default
  drop_last: true
  # List of data fields needed
  fields: [image, gt_box, gt_label]
  # List of image dims
  image_shape: [3, 300, 300]
  # Number of worker processes (or threads)
  num_workers: 8
  # List of sample transformations to use
  sample_transforms:
  # Transform the image data to numpy format.
  - !DecodeImage
    # Whether to convert BGR to RGB
    to_rgb: true  # default: true
    # Whether or not to mixup image and gt_bbox/gt_score
    with_mixup: false   # default: false
  # Transform the bounding box's coordinates to [0,1].
  - !NormalizeBox {}
  # Modify image brightness, contrast, saturation, hue, channel order, etc.
  - !RandomDistort
    # brightness_lower/brightness_upper (float): the brightness
    # between brightness_lower and brightness_upper
    brightness_lower: 0.875
    brightness_upper: 1.125
    # brightness_prob (float): the probability of changing brightness
    brightness_prob: 0.5
    # contrast_lower/contrast_upper (float): the contrast between
    # contrast_lower and contrast_upper
    contrast_lower: 0.5
    contrast_upper: 1.5
    # contrast_prob (float): the probability of changing contrast
    contrast_prob: 0.5
    # count (int): the kinds of distortion to do
    count: 4
    # hue_lower/hue_upper (float): the hue between hue_lower and hue_upper
    hue_lower: -18
    hue_upper: 18
    # hue_prob (float): the probability of changing hue
    hue_prob: 0.5
    # is_order (bool): whether to determine the order of distortion
    is_order: true
    # saturation_lower/saturation_upper (float): the saturation
    # between saturation_lower and saturation_upper
    saturation_lower: 0.5
    saturation_upper: 1.5
    # saturation_prob (float): the probability of changing saturation
    saturation_prob: 0.5
  # Expand the image and modify bounding box.
  #   Operators:
  #     1. Scale the image width and height.
  #     2. Construct new images with new height and width.
  #     3. Fill the new image with the mean.
  #     4. Put original image into new image.
  #     5. Rescale the bounding box.
  #     6. Determine if the new bbox is satisfied in the new image.
  - !ExpandImage
    # max_ratio (float): the ratio of expanding
    max_ratio: 4
    # mean (list): the pixel mean
    mean: [104, 117, 123]
    # prob (float): the probability of expanding image
    prob: 0.5
  # Crop the image and modify bounding box.
  #   Operators:
  #     1. Scale the image width and height.
  #     2. Crop the image according to a random sample.
  #     3. Rescale the bounding box.
  #     4. Determine if the new bbox is satisfied in the new image.
  - !CropImage
    # avoid_no_bbox (bool): whether to avoid the
    # situation where the box does not appear.
    avoid_no_bbox: false
    # batch_sampler (list): Multiple sets of different parameters for cropping.
    batch_sampler:
    - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0]
    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0]
    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0]
    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0]
    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0]
    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]
    # satisfy_all (bool): whether all boxes must satisfy.
    satisfy_all: false
  # Rescale image to the specified target size, capped at max_size if max_size != 0.
  # If target_size is a list, a scale is selected randomly as the specified target size.
  - !ResizeImage
    # Resize method, cv2.INTER_LINEAR(1) by default
    interp: 1
    # max_size (int): the max size of image
    max_size: 0
    # target_size (int|list): the target size of image's short side,
    # multi-scale training is adopted when type is list.
    target_size: 300
    # use_cv2 (bool): use the cv2 interpolation method or use PIL interpolation method
    use_cv2: false
  # Flip the image and bounding box.
  #   Operators:
  #     1. Flip the image numpy.
  #     2. Transform the bboxes' x coordinates. (Must judge whether the coordinates are normalized!)
  #     3. Transform the segmentations' x coordinates. (Must judge whether the coordinates are normalized!)
  - !RandomFlipImage
    # is_mask_flip (bool): whether to flip the segmentation
    is_mask_flip: false
    # is_normalized (bool): whether the bbox is scaled to [0,1]
    is_normalized: true
    # prob (float): the probability of flipping image
    prob: 0.5
  # Change the channel
  - !Permute
    # The format of image, [H, W, C]/[C, H, W], true by default
    channel_first: true
    # to_bgr (bool): confirm whether to convert RGB to BGR
    to_bgr: true
  # Normalize the image.
  #   Operators:
  #     1. (optional) Scale the image to [0,1]
  #     2. Each pixel minus mean and is divided by std
  - !NormalizeImage
    # The format of image, [H, W, C]/[C, H, W], true by default
    is_channel_first: true
    # Whether to divide by 255, true by default
    is_scale: false
    # mean (list): the pixel mean
    mean: [104, 117, 123]
    # std (list): the pixel standard deviation
    std: [1, 1, 1]
  # Number of samples, -1 represents all samples. -1 by default
  samples: -1
  # Whether samples should be shuffled, true by default
  shuffle: true
  # Whether to use multi-process, false by default
  use_process: true

# Data feed module for Eval
SSDEvalFeed:
  # Batch size per device
  batch_size: 32
  # List of batch transformations to use
  batch_transforms: []
  # The data buffer size
  bufsize: 10
  # Dataset module
  dataset:
    # Dataset directory
    dataset_dir: dataset/coco
    # Annotation file path
    annotation: annotations/instances_val2017.json
    # Directory where image files are stored
    image_dir: val2017
  # Drop last batch if size is uneven, false by default
  # NOTE(review): true here may leave the final partial batch out of evaluation - confirm intended.
  drop_last: true
  # List of data fields needed
  fields: [image, im_shape, im_id, gt_box, gt_label, is_difficult]
  # List of image dims
  image_shape: [3, 300, 300]
  # Number of worker processes (or threads)
  num_workers: 8
  # List of sample transformations to use
  sample_transforms:
  # Transform the image data to numpy format.
  - !DecodeImage
    # Whether to convert BGR to RGB
    to_rgb: true  # default: true
    # Whether or not to mixup image and gt_bbox/gt_score
    with_mixup: false   # default: false
  # Transform the bounding box's coordinates to [0,1].
  - !NormalizeBox {}
  # Rescale image to the specified target size, capped at max_size if max_size != 0.
  # If target_size is a list, a scale is selected randomly as the specified target size.
  - !ResizeImage
    # Resize method, cv2.INTER_LINEAR(1) by default
    interp: 1
    # max_size (int): the max size of image
    max_size: 0
    # target_size (int|list): the target size of image's short side,
    # multi-scale training is adopted when type is list.
    target_size: 300
    # use_cv2 (bool): use the cv2 interpolation method or use PIL interpolation method
    use_cv2: false
  # Change the channel
  - !Permute
    # The format of image, [H, W, C]/[C, H, W], true by default
    channel_first: true
    # to_bgr (bool): confirm whether to convert RGB to BGR
    to_bgr: true
  # Normalize the image.
  #   Operators:
  #     1. (optional) Scale the image to [0,1]
  #     2. Each pixel minus mean and is divided by std
  - !NormalizeImage
    # The format of image, [H, W, C]/[C, H, W], true by default
    is_channel_first: true
    # Whether to divide by 255, true by default
    is_scale: false
    # mean (list): the pixel mean
    mean: [104, 117, 123]
    # std (list): the pixel standard deviation
    std: [1, 1, 1]
  # Number of samples, -1 represents all samples. -1 by default
  samples: -1
  # Whether samples should be shuffled, true by default
  shuffle: false
  # Whether to use multi-process, false by default
  use_process: false

# Data feed module for test
SSDTestFeed:
  # Batch size per device
  batch_size: 1
  # List of batch transformations to use
  batch_transforms: []
  # The data buffer size
  bufsize: 10
  # Dataset module
  dataset:
    # Annotation file path
    annotation: dataset/coco/annotations/instances_val2017.json
  # Drop last batch if size is uneven, false by default
  drop_last: false
  # List of data fields needed
  fields: [image, im_id]
  # List of image dims
  image_shape: [3, 300, 300]
  # Number of worker processes (or threads)
  num_workers: 8
  # List of sample transformations to use
  sample_transforms:
  # Transform the image data to numpy format.
  - !DecodeImage
    # Whether to convert BGR to RGB
    to_rgb: true  # default: true
    # Whether or not to mixup image and gt_bbox/gt_score
    with_mixup: false   # default: false
  # Rescale image to the specified target size, capped at max_size if max_size != 0.
  # If target_size is a list, a scale is selected randomly as the specified target size.
  - !ResizeImage
    # Resize method, cv2.INTER_LINEAR(1) by default
    interp: 1
    # max_size (int): the max size of image
    max_size: 0
    # target_size (int|list): the target size of image's short side,
    # multi-scale training is adopted when type is list.
    target_size: 300
    # use_cv2 (bool): use the cv2 interpolation method or use PIL interpolation method
    use_cv2: false
  # Change the channel
  - !Permute
    # The format of image, [H, W, C]/[C, H, W], true by default
    channel_first: true
    # to_bgr (bool): confirm whether to convert RGB to BGR
    to_bgr: true
  # Normalize the image.
  #   Operators:
  #     1. (optional) Scale the image to [0,1]
  #     2. Each pixel minus mean and is divided by std
  - !NormalizeImage
    # The format of image, [H, W, C]/[C, H, W], true by default
    is_channel_first: true
    # Whether to divide by 255, true by default
    is_scale: false
    # mean (list): the pixel mean
    mean: [104, 117, 123]
    # std (list): the pixel standard deviation
    std: [1, 1, 1]
  # Number of samples, -1 represents all samples. -1 by default
  samples: -1
  # Whether samples should be shuffled, true by default
  shuffle: false
  # Whether to use multi-process, false by default
  use_process: false