# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Base config files listed for inheritance: the COCO detection dataset
# settings and the default runtime settings.
_base_ = [
    '../_base_/datasets/coco_detection.py',
    '../_base_/default_runtime.py'
]
# Detector definition: a 'DINO' detector built from an 'InternImage'
# backbone, a 'ChannelMapper' neck and a 'DINOHead' bbox head, followed by
# the training and testing settings.
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=512,
        # Four stage depths; they sum to 56 (2 + 2 + 48 + 4).
        depths=[2, 2, 48, 4],
        groups=[16, 32, 64, 128],
        mlp_ratio=4.,
        drop_path_rate=0.5,
        norm_layer='LN',
        layer_scale=None,
        offset_scale=1.0,
        post_norm=True,
        dw_kernel_size=5,  # for InternImage-H/G
        res_post_norm=False,  # for InternImage-H/G
        level2_post_norm=True,  # for InternImage-H/G
        level2_post_norm_block_ids=[5, 11, 17, 23, 29, 35, 41, 47],  # for InternImage-H/G
        center_feature_scale=True,  # for InternImage-H/G
        with_cp=True,
        out_indices=(1, 2, 3),
        init_cfg=None  # dict(type='Pretrained', checkpoint=pretrained)
    ),
    neck=dict(
        type='ChannelMapper',
        # One entry per backbone output index above.
        # NOTE(review): presumably channels * 2**i for stages 1-3 -- confirm
        # against the InternImage backbone implementation.
        in_channels=[1024, 2048, 4096],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        # Four outputs from three inputs; the same count as the decoder's
        # num_levels=4 below.
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,
        in_channels=2048,  # TODO
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),  # 0.5, 0.4 for DN-DETR
            group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=1000)),
        transformer=dict(
            type='DinoTransformer',
            # Same value as num_query above.
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),  # 0.1 for DeformDETR
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    ffn_cfgs=dict(
                        type='EfficientFFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    # Two attention configs, matching the 'self_attn' and
                    # 'cross_attn' entries of operation_order below.
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),  # 0.1 for DeformDETR
                        dict(
                            type='MultiScaleDeformableAttention',
                            num_levels=4,
                            embed_dims=256,
                            dropout=0.0),  # 0.1 for DeformDETR
                    ],
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    ffn_cfgs=dict(
                        type='EfficientFFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            # 128 is half of the embed_dims=256 used throughout the head.
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),  # 2.0 in DeformDETR
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
        snip_cfg=dict(
            type='v3',
            weight=0.1)),
    test_cfg=dict(max_per_img=300))
# Per-channel mean/std unpacked into the 'Normalize' step of both pipelines.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE: the img_scale and the Pad's size_divisor differ
# from the default settings in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Resize',
         # Two scales combined with multiscale_mode='range'.
         img_scale=[(2000, 600), (2000, 1800)],
         multiscale_mode='range',
         keep_ratio=True),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# Test-time pipeline: a single scale with flip=False.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2000, 1000),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
# Dataloader settings; val and test share test_pipeline.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    train=dict(filter_empty_gt=True, pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
# optimizer
# AdamW built through 'CustomLayerDecayOptimizerConstructor'.
# num_layers=56 equals the sum of depths (2 + 2 + 48 + 4).
optimizer = dict(
    type='AdamW', lr=0.0001, weight_decay=0.0001,
    constructor='CustomLayerDecayOptimizerConstructor',
    paramwise_cfg=dict(num_layers=56, layer_decay_rate=0.94,
                       depths=[2, 2, 48, 4], offset_lr_scale=1e-3))
# Gradient clipping settings: L2 norm (norm_type=2) with max_norm=0.1.
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
# Linear warmup over the first 500 iterations starting at ratio 0.001.
# The 'step' list is empty, so no decay milestones are configured.
# NOTE(review): presumably the LR stays constant after warmup -- confirm
# against the step LR hook in use.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[])
# Iteration-based schedule: 20000 iterations in total.
runner = dict(type='IterBasedRunner', max_iters=20000)
# Checkpointing and evaluation use the same interval of 200; at most three
# checkpoints are kept.
# NOTE(review): with IterBasedRunner the intervals are presumably counted in
# iterations -- confirm.
checkpoint_config = dict(interval=200, max_keep_ckpts=3)
evaluation = dict(interval=200, save_best='auto')
# resume_from = None
# custom_hooks = [
#     dict(
#         type='ExpMomentumEMAHook',
#         resume_from=resume_from,
#         momentum=0.0003,
#         priority=49)
# ]
0 commit comments