diff --git a/cvdata/deploy.prototxt b/cvdata/deploy.prototxt new file mode 100644 index 0000000..a128515 --- /dev/null +++ b/cvdata/deploy.prototxt @@ -0,0 +1,1790 @@ +input: "data" +input_shape { + dim: 1 + dim: 3 + dim: 300 + dim: 300 +} + +layer { + name: "data_bn" + type: "BatchNorm" + bottom: "data" + top: "data_bn" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "data_scale" + type: "Scale" + bottom: "data_bn" + top: "data_bn" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "conv1_h" + type: "Convolution" + bottom: "data_bn" + top: "conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + pad: 3 + kernel_size: 7 + stride: 2 + weight_filler { + type: "msra" + variance_norm: FAN_OUT + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv1_bn_h" + type: "BatchNorm" + bottom: "conv1_h" + top: "conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "conv1_scale_h" + type: "Scale" + bottom: "conv1_h" + top: "conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "conv1_relu" + type: "ReLU" + bottom: "conv1_h" + top: "conv1_h" +} +layer { + name: "conv1_pool" + type: "Pooling" + bottom: "conv1_h" + top: "conv1_pool" + pooling_param { + kernel_size: 3 + stride: 2 + } +} +layer { + name: "layer_64_1_conv1_h" + type: "Convolution" + bottom: "conv1_pool" + top: "layer_64_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_64_1_bn2_h" + type: "BatchNorm" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_64_1_scale2_h" + type: "Scale" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_64_1_relu2" + type: "ReLU" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" +} +layer { + name: "layer_64_1_conv2_h" + type: "Convolution" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv2_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_64_1_sum" + type: "Eltwise" + bottom: "layer_64_1_conv2_h" + bottom: "conv1_pool" + top: "layer_64_1_sum" +} +layer { + name: "layer_128_1_bn1_h" + type: "BatchNorm" + bottom: "layer_64_1_sum" + top: "layer_128_1_bn1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_128_1_scale1_h" + type: "Scale" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_bn1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_128_1_relu1" + type: "ReLU" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_bn1_h" +} +layer { + name: "layer_128_1_conv1_h" + type: "Convolution" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_bn2" + type: "BatchNorm" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_128_1_scale2" + type: "Scale" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_128_1_relu2" + type: "ReLU" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" +} +layer { + name: "layer_128_1_conv2" + type: "Convolution" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_conv_expand_h" + type: "Convolution" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_conv_expand_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_sum" + type: "Eltwise" + bottom: "layer_128_1_conv2" + bottom: "layer_128_1_conv_expand_h" + top: "layer_128_1_sum" +} +layer { + name: "layer_256_1_bn1" + type: "BatchNorm" + bottom: "layer_128_1_sum" + top: "layer_256_1_bn1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_256_1_scale1" + type: "Scale" + bottom: "layer_256_1_bn1" + top: "layer_256_1_bn1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_256_1_relu1" + type: "ReLU" + bottom: "layer_256_1_bn1" + top: "layer_256_1_bn1" +} +layer { + name: "layer_256_1_conv1" + type: "Convolution" + bottom: "layer_256_1_bn1" + top: "layer_256_1_conv1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_bn2" + type: "BatchNorm" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_256_1_scale2" + type: "Scale" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_256_1_relu2" + type: "ReLU" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" +} +layer { + name: "layer_256_1_conv2" + type: "Convolution" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_conv_expand" + type: "Convolution" + bottom: "layer_256_1_bn1" + top: "layer_256_1_conv_expand" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_sum" + type: "Eltwise" + bottom: "layer_256_1_conv2" + bottom: "layer_256_1_conv_expand" + top: "layer_256_1_sum" +} +layer { + name: "layer_512_1_bn1" + type: "BatchNorm" + bottom: "layer_256_1_sum" + top: "layer_512_1_bn1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_512_1_scale1" + type: "Scale" + bottom: "layer_512_1_bn1" + top: "layer_512_1_bn1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_512_1_relu1" + type: "ReLU" + bottom: "layer_512_1_bn1" + top: "layer_512_1_bn1" +} +layer { + name: "layer_512_1_conv1_h" + type: "Convolution" + bottom: "layer_512_1_bn1" + top: "layer_512_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 # 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_bn2_h" + type: "BatchNorm" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_512_1_scale2_h" + type: "Scale" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_512_1_relu2" + type: "ReLU" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" +} +layer { + name: "layer_512_1_conv2_h" + type: "Convolution" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv2_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 2 # 1 + kernel_size: 3 + stride: 1 + dilation: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_conv_expand_h" + type: "Convolution" + bottom: "layer_512_1_bn1" + top: "layer_512_1_conv_expand_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 1 # 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_sum" + type: "Eltwise" + bottom: "layer_512_1_conv2_h" + bottom: "layer_512_1_conv_expand_h" + top: "layer_512_1_sum" +} +layer { + name: "last_bn_h" + type: "BatchNorm" + bottom: "layer_512_1_sum" + top: "layer_512_1_sum" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "last_scale_h" + type: "Scale" + bottom: "layer_512_1_sum" + top: "layer_512_1_sum" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "last_relu" + type: "ReLU" + bottom: "layer_512_1_sum" + top: "fc7" +} + +layer { + name: "conv6_1_h" + type: "Convolution" + bottom: "fc7" + top: "conv6_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1_h" + top: "conv6_1_h" +} +layer { + name: "conv6_2_h" + type: "Convolution" + bottom: "conv6_1_h" + top: "conv6_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2_h" + top: "conv6_2_h" +} +layer { + name: "conv7_1_h" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv7_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1_h" + top: "conv7_1_h" +} +layer { + name: "conv7_2_h" + type: "Convolution" + bottom: "conv7_1_h" + top: "conv7_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2_h" + top: "conv7_2_h" +} +layer { + name: "conv8_1_h" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv8_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1_h" + top: "conv8_1_h" +} +layer { + name: "conv8_2_h" + type: "Convolution" + bottom: "conv8_1_h" + top: "conv8_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2_h" + top: "conv8_2_h" +} +layer { + name: "conv9_1_h" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv9_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_1_relu" + type: "ReLU" + bottom: "conv9_1_h" + top: "conv9_1_h" +} +layer { + name: "conv9_2_h" + type: "Convolution" + bottom: "conv9_1_h" + top: "conv9_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_relu" + type: "ReLU" + bottom: "conv9_2_h" + top: "conv9_2_h" +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "layer_256_1_bn1" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 30.0 + max_size: 60.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 8 + offset: 0.5 + } +} +layer { + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 60.0 + max_size: 111.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 16 + offset: 0.5 + } +} +layer { + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv6_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2_h" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 111.0 + max_size: 162.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 32 + offset: 0.5 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2_h" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + min_size: 162.0 + max_size: 213.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 64 + offset: 0.5 + } +} +layer { + name: "conv8_2_mbox_loc" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv8_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv8_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: "conv8_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2_h" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 213.0 + max_size: 264.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 100 + offset: 0.5 + } +} +layer { + name: "conv9_2_mbox_loc" + type: "Convolution" + bottom: "conv9_2_h" + top: "conv9_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_loc_perm" + type: "Permute" + bottom: "conv9_2_mbox_loc" + top: "conv9_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv9_2_mbox_loc_perm" + top: "conv9_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_conf" + type: "Convolution" + bottom: "conv9_2_h" + top: "conv9_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_conf_perm" + type: "Permute" + bottom: "conv9_2_mbox_conf" + top: "conv9_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv9_2_mbox_conf_perm" + top: "conv9_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv9_2_h" + bottom: "data" + top: "conv9_2_mbox_priorbox" + prior_box_param { + min_size: 264.0 + max_size: 315.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 300 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: "conv9_2_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "conv9_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "conv9_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} + +layer { + name: "mbox_conf_reshape" + type: "Reshape" + bottom: "mbox_conf" + top: "mbox_conf_reshape" + reshape_param { + shape { + dim: 0 + dim: -1 + dim: 2 + } + } +} +layer { + name: "mbox_conf_softmax" + type: "Softmax" + bottom: "mbox_conf_reshape" + top: "mbox_conf_softmax" + softmax_param { + axis: 2 + } +} +layer { + name: "mbox_conf_flatten" + type: "Flatten" + bottom: "mbox_conf_softmax" + top: "mbox_conf_flatten" + flatten_param { + axis: 1 + } +} + +layer { + name: "detection_out" + type: "DetectionOutput" + bottom: "mbox_loc" + bottom: "mbox_conf_flatten" + bottom: "mbox_priorbox" + top: "detection_out" + include { + phase: TEST + } + detection_output_param { + num_classes: 2 + share_location: true + background_label_id: 0 + nms_param { + nms_threshold: 0.45 + top_k: 400 + } + code_type: CENTER_SIZE + keep_top_k: 200 + confidence_threshold: 0.01 + clip: 1 + } +} diff --git a/cvdata/res10_300x300_ssd_iter_140000_fp16.caffemodel b/cvdata/res10_300x300_ssd_iter_140000_fp16.caffemodel new file mode 100644 index 0000000..0e9cd4a Binary files /dev/null and b/cvdata/res10_300x300_ssd_iter_140000_fp16.caffemodel differ diff --git a/src/cv.cpp b/src/cv.cpp index a2e08df..2d3ecb6 100644 --- a/src/cv.cpp +++ b/src/cv.cpp @@ -7,13 +7,19 @@ #include cv::Ptr facemark; -cv::CascadeClassifier faceDetector; +cv::CascadeClassifier haarFaceDetector; +cv::dnn::Net dnnFaceDetector; cv::VideoCapture vid; cv::Mat frame, gray, small; +bool useHaar; -void initCV() { - //TODO: switch to DNN face detection - faceDetector = cv::CascadeClassifier (resolvePath("cvdata/haarcascade_frontalface_alt2.xml")); +void initCV(bool haar) { + useHaar = haar; + + haarFaceDetector = cv::CascadeClassifier (resolvePath("cvdata/haarcascade_frontalface_alt2.xml")); + dnnFaceDetector = cv::dnn::readNetFromCaffe( + resolvePath("cvdata/deploy.prototxt"), + resolvePath("cvdata/res10_300x300_ssd_iter_140000_fp16.caffemodel") ); facemark = cv::face::FacemarkLBF::create(); facemark->loadModel (resolvePath("cvdata/lbfmodel.yaml")); @@ -21,24 +27,56 @@ void initCV() { vid = cv::VideoCapture (0); } +void dnnFaceDetect(cv::Mat inFrame, std::vector* faces) { + cv::Mat inputBlob = cv::dnn::blobFromImage(inFrame, 1.0f, cv::Size(300, 300), cv::Scalar(104, 177, 123, 0), false, false); + + dnnFaceDetector.setInput(inputBlob, "data"); + cv::Mat output = dnnFaceDetector.forward("detection_out"); + cv::Mat detection(output.size[2], output.size[3], CV_32F, output.ptr()); + + for (int i = 0; i < detection.rows; i++) { + float confidence = detection.at(i, 2); + + if (confidence > 0.75f) { + int x1 = detection.at(i, 3) * inFrame.cols; + int y1 = detection.at(i, 4) * inFrame.rows; + int x2 = detection.at(i, 5) * inFrame.cols; + int y2 = detection.at(i, 6) * inFrame.rows; + + cv::Point2f pt1(x1, y1); + cv::Point2f pt2(x2, y2); + + faces->push_back(cv::Rect(pt1, pt2)); + } + } +} + //process image and send controls to graphics void cvFrame() { vid.read(frame); cv::cvtColor (frame, gray, cv::COLOR_BGR2GRAY); - //downsample image for face detection, works too slow on full res - cv::pyrDown (gray, small); - cv::pyrDown (small, small); std::vector faces; - faceDetector.detectMultiScale(small, faces); + + if (useHaar) { + //downsample image for face detection, works too slow on full res + cv::pyrDown (gray, small); + cv::pyrDown (small, small); + + haarFaceDetector.detectMultiScale(small, faces); + } else { + dnnFaceDetect(frame, &faces); + } //get biggest face int biggestFace = 0; int biggestArea = 0; for (int i = 0; i < faces.size(); i++) { //convert face region to full res, because we perform facemark on full res - faces[i] = cv::Rect (faces[i].x * 4, faces[i].y * 4, faces[i].width * 4, faces[i].height * 4); + if (useHaar) { + faces[i] = cv::Rect (faces[i].x * 4, faces[i].y * 4, faces[i].width * 4, faces[i].height * 4); + } int iArea = faces[i].area(); if (iArea > biggestArea) { diff --git a/src/cv.hpp b/src/cv.hpp index adf1146..bd0e1b7 100644 --- a/src/cv.hpp +++ b/src/cv.hpp @@ -1,7 +1,7 @@ #ifndef CV_HPP #define CV_HPP -void initCV(); +void initCV(bool haar); void cvFrame(); diff --git a/src/main.cpp b/src/main.cpp index 614f7f2..90e7de2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,8 +3,9 @@ #include #include +#include -int main () { +int main (int argc, char** argv) { std::cout << "Facecam2D is starting..." << std::endl; initPrefixes(); @@ -12,7 +13,8 @@ int main () { std::cout << "Default asset prefix: " << prefixDefault << std::endl; initGraphics(); - initCV(); + //TODO: real argument parsing + initCV(argc > 1 && strcmp(argv[1], "--haar-cascade") == 0); while (true) { cvFrame();