Skip to content

Commit 00af501

Browse files
committed
Release detection models
1 parent 0dec021 commit 00af501

6 files changed

+502
-4
lines changed

detection/README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,12 @@ Prepare datasets according to the guidelines in [MMDetection v2.28.1](https://gi
112112
| :--------: | :--------------: | :--: | :-----: | :----: | :----------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
113113
| DINO | InternImage-T | 1x | 53.9 | 49M | [config](./configs/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.json) |
114114
| DINO | InternImage-L | 1x | 57.6 | 241M | [config](./configs/coco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.log.json) |
115-
| DINO | CB-InternImage-H | 1x | 64.5 | 2.18B | [config](./configs/coco/dino_4scale_cbinternimage_h_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) |
116-
| DINO (TTA) | CB-InternImage-H | 1x | 65.0 | 2.18B | - | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) |
115+
| DINO | InternImage-H | 1x | 63.4 | 1.1B | [config](./configs/coco/dino_4scale_internimage_h_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_h_objects365_coco.pth) |
116+
| DINO | CB-InternImage-H | 1x | 64.5 | 2.2B | [config](./configs/coco/dino_4scale_cbinternimage_h_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) |
117+
| DINO (TTA) | CB-InternImage-H | 1x | 65.0 | 2.2B | TODO | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) |
118+
| DINO | InternImage-G | 1x | 64.2 | 3.1B | [config](./configs/coco/dino_4scale_internimage_g_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_g_objects365_coco.pth) |
119+
| DINO | CB-InternImage-G | 1x | 65.1 | 6B | TODO | TODO |
120+
| DINO (TTA) | CB-InternImage-G | 1x | 65.3 | 6B | TODO | TODO |
117121

118122
</div>
119123

detection/configs/coco/dino_4scale_cbinternimage_h_objects365_coco_ss.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# --------------------------------------------------------
2+
# InternImage
3+
# Copyright (c) 2022 OpenGVLab
4+
# Licensed under The MIT License [see LICENSE for details]
5+
# --------------------------------------------------------
16
_base_ = [
27
'../_base_/datasets/coco_detection.py',
38
'../_base_/default_runtime.py'
@@ -122,7 +127,7 @@
122127
snip_cfg=dict(
123128
type='v3',
124129
weight=0.1)),
125-
test_cfg=dict(max_per_img=300)) # TODO: Originally 100
130+
test_cfg=dict(max_per_img=300))
126131
img_norm_cfg = dict(
127132
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
128133
# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
# --------------------------------------------------------
2+
# InternImage
3+
# Copyright (c) 2022 OpenGVLab
4+
# Licensed under The MIT License [see LICENSE for details]
5+
# --------------------------------------------------------
6+
# Base config files this config builds on: the COCO detection dataset settings
# and the default runtime settings (paths are relative to this file).
_base_ = [
7+
'../_base_/datasets/coco_detection.py',
8+
'../_base_/default_runtime.py'
9+
]
10+
# Detector definition: a DINO model made of an InternImage backbone, a
# ChannelMapper neck and a DINOHead, followed by train/test settings.
# NOTE(review): the name of this config file is not visible in this view; the
# backbone flags below are tagged "for InternImage-H/G" by the original author.
model = dict(
11+
type='DINO',
12+
backbone=dict(
13+
type='InternImage',
14+
core_op='DCNv3',
15+
channels=512,
16+
# depths sums to 56 and is repeated in the optimizer's paramwise_cfg
# (num_layers=56, depths=[2, 2, 48, 4]); keep the two in sync.
depths=[2, 2, 48, 4],
17+
groups=[16, 32, 64, 128],
18+
mlp_ratio=4.,
19+
drop_path_rate=0.5,
20+
norm_layer='LN',
21+
layer_scale=None,
22+
offset_scale=1.0,
23+
post_norm=True,
24+
dw_kernel_size=5, # for InternImage-H/G
25+
res_post_norm=False, # for InternImage-H/G
26+
level2_post_norm=True, # for InternImage-H/G
27+
level2_post_norm_block_ids=[5, 11, 17, 23, 29, 35, 41, 47], # for InternImage-H/G
28+
center_feature_scale=True, # for InternImage-H/G
29+
with_cp=True,
30+
# Three backbone stages are exported; the neck below lists three in_channels.
out_indices=(1, 2, 3),
31+
# No pretrained weights are loaded here. NOTE(review): the commented-out
# alternative references `pretrained`, which is not defined anywhere in this
# file; define it before re-enabling.
init_cfg=None # dict(type='Pretrained', checkpoint=pretrained)
32+
),
33+
neck=dict(
34+
type='ChannelMapper',
35+
# NOTE(review): presumably the channel widths of backbone stages 1-3
# (channels=512 doubled per stage) - confirm against the InternImage backbone.
in_channels=[1024, 2048, 4096],
36+
kernel_size=1,
37+
out_channels=256,
38+
act_cfg=None,
39+
norm_cfg=dict(type='GN', num_groups=32),
40+
# 4 output levels from 3 inputs; matches num_levels=4 in the decoder's
# MultiScaleDeformableAttention below.
num_outs=4),
41+
bbox_head=dict(
42+
type='DINOHead',
43+
# Same value as transformer.two_stage_num_proposals below.
num_query=900,
44+
num_classes=80,
45+
# NOTE(review): the original TODO is unexplained; confirm what DINOHead
# expects for in_channels (the neck outputs 256 channels).
in_channels=2048, # TODO
46+
sync_cls_avg_factor=True,
47+
as_two_stage=True,
48+
with_box_refine=True,
49+
dn_cfg=dict(
50+
type='CdnQueryGenerator',
51+
noise_scale=dict(label=0.5, box=1.0), # 0.5, 0.4 for DN-DETR
52+
group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=1000)),
53+
transformer=dict(
54+
type='DinoTransformer',
55+
two_stage_num_proposals=900,
56+
# Encoder: 6 layers of deformable self-attention + FFN, all at 256 dims.
encoder=dict(
57+
type='DetrTransformerEncoder',
58+
num_layers=6,
59+
transformerlayers=dict(
60+
type='BaseTransformerLayer',
61+
attn_cfgs=dict(
62+
type='MultiScaleDeformableAttention',
63+
embed_dims=256,
64+
dropout=0.0), # 0.1 for DeformDETR
65+
feedforward_channels=2048, # 1024 for DeformDETR
66+
ffn_cfgs=dict(
67+
type='EfficientFFN',
68+
embed_dims=256,
69+
feedforward_channels=2048,
70+
num_fcs=2,
71+
ffn_drop=0.,
72+
use_checkpoint=True,
73+
act_cfg=dict(type='ReLU', inplace=True),),
74+
ffn_dropout=0.0, # 0.1 for DeformDETR
75+
operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
76+
# Decoder: 6 layers of self-attention, deformable cross-attention and FFN.
decoder=dict(
77+
type='DinoTransformerDecoder',
78+
num_layers=6,
79+
return_intermediate=True,
80+
transformerlayers=dict(
81+
type='DetrTransformerDecoderLayer',
82+
# Two attention configs, in the order they appear in operation_order:
# the first for 'self_attn', the second for 'cross_attn'.
attn_cfgs=[
83+
dict(
84+
type='MultiheadAttention',
85+
embed_dims=256,
86+
num_heads=8,
87+
dropout=0.0), # 0.1 for DeformDETR
88+
dict(
89+
type='MultiScaleDeformableAttention',
90+
num_levels=4,
91+
embed_dims=256,
92+
dropout=0.0), # 0.1 for DeformDETR
93+
],
94+
feedforward_channels=2048, # 1024 for DeformDETR
95+
ffn_cfgs=dict(
96+
type='EfficientFFN',
97+
embed_dims=256,
98+
feedforward_channels=2048,
99+
num_fcs=2,
100+
ffn_drop=0.,
101+
use_checkpoint=True,
102+
act_cfg=dict(type='ReLU', inplace=True),),
103+
ffn_dropout=0.0, # 0.1 for DeformDETR
104+
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
105+
'ffn', 'norm')))),
106+
positional_encoding=dict(
107+
type='SinePositionalEncoding',
108+
# NOTE(review): presumably 2 * num_feats = 256 to match embed_dims - confirm.
num_feats=128,
109+
temperature=20,
110+
normalize=True),
111+
# Loss weights: classification 1.0, L1 box 5.0, GIoU 2.0.
loss_cls=dict(
112+
type='FocalLoss',
113+
use_sigmoid=True,
114+
gamma=2.0,
115+
alpha=0.25,
116+
loss_weight=1.0), # 2.0 in DeformDETR
117+
loss_bbox=dict(type='L1Loss', loss_weight=5.0),
118+
loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
119+
# training and testing settings
120+
train_cfg=dict(
121+
# Matching cost weights: classification 2.0 (the loss_cls weight above is
# 1.0), L1 box 5.0, GIoU 2.0.
assigner=dict(
122+
type='HungarianAssigner',
123+
cls_cost=dict(type='FocalLossCost', weight=2.0),
124+
reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
125+
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
126+
# NOTE(review): snip_cfg is consumed by project code not shown here; its
# meaning cannot be determined from this file.
snip_cfg=dict(
127+
type='v3',
128+
weight=0.1)),
129+
# NOTE(review): presumably the maximum number of detections kept per image.
test_cfg=dict(max_per_img=300))
130+
# Per-channel mean/std and the to_rgb flag; unpacked into the 'Normalize' step
# of both train_pipeline and test_pipeline below.
img_norm_cfg = dict(
131+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
132+
# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
133+
# from the default settings in mmdet.
134+
# Training pipeline: load image and boxes, random flip, multi-scale resize,
# normalize, pad, then bundle and collect the image with its box annotations.
train_pipeline = [
135+
dict(type='LoadImageFromFile'),
136+
dict(type='LoadAnnotations', with_bbox=True),
137+
dict(type='RandomFlip', flip_ratio=0.5),
138+
# NOTE(review): with multiscale_mode='range' the two img_scale entries are
# presumably the lower and upper bounds of a randomly sampled scale - confirm
# against the mmdet Resize documentation.
dict(type='Resize',
139+
img_scale=[(2000, 600), (2000, 1800)],
140+
multiscale_mode='range',
141+
keep_ratio=True),
142+
dict(type='Normalize', **img_norm_cfg),
143+
dict(type='Pad', size_divisor=32),
144+
dict(type='DefaultFormatBundle'),
145+
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
146+
]
147+
# Test pipeline: a single img_scale of (2000, 1000) with flip disabled, i.e.
# no multi-scale or flip augmentation is configured here. Also used for
# validation (see data.val).
test_pipeline = [
148+
dict(type='LoadImageFromFile'),
149+
dict(
150+
type='MultiScaleFlipAug',
151+
img_scale=(2000, 1000),
152+
flip=False,
153+
transforms=[
154+
dict(type='Resize', keep_ratio=True),
155+
# NOTE(review): RandomFlip is listed but flip=False above, so it is
# presumably a no-op at test time - confirm.
dict(type='RandomFlip'),
156+
dict(type='Normalize', **img_norm_cfg),
157+
dict(type='Pad', size_divisor=32),
158+
dict(type='ImageToTensor', keys=['img']),
159+
dict(type='Collect', keys=['img'])
160+
])
161+
]
162+
# Data loading settings. The train split uses train_pipeline with
# filter_empty_gt=True; val and test both use test_pipeline. Dataset types and
# paths are not set here (presumably taken from the base dataset config).
data = dict(
163+
samples_per_gpu=1,
164+
workers_per_gpu=2,
165+
train=dict(filter_empty_gt=True, pipeline=train_pipeline),
166+
val=dict(pipeline=test_pipeline),
167+
test=dict(pipeline=test_pipeline))
168+
# optimizer
169+
# AdamW with a custom layer-wise decay constructor. num_layers=56 equals the
# sum of depths (2 + 2 + 48 + 4), and depths repeats the backbone's depths in
# the model config; keep them in sync.
optimizer = dict(
170+
type='AdamW', lr=0.0001, weight_decay=0.0001,
171+
constructor='CustomLayerDecayOptimizerConstructor',
172+
paramwise_cfg=dict(num_layers=56, layer_decay_rate=0.94,
173+
depths=[2, 2, 48, 4], offset_lr_scale=1e-3))
174+
# Gradient clipping by L2 norm (norm_type=2) with max_norm=0.1.
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
175+
# learning policy
176+
# Linear warmup over 500 iterations starting at ratio 0.001.
# NOTE(review): step=[] lists no decay milestones, so the learning rate
# presumably stays constant after warmup - confirm against the mmcv step policy.
lr_config = dict(
177+
policy='step',
178+
warmup='linear',
179+
warmup_iters=500,
180+
warmup_ratio=0.001,
181+
step=[])
182+
# Iteration-based schedule: 20000 iterations in total.
runner = dict(type='IterBasedRunner', max_iters=20000)
183+
# Checkpoint and evaluate at interval=200 (presumably iterations, given the
# iteration-based runner); at most 3 checkpoints are kept.
checkpoint_config = dict(interval=200, max_keep_ckpts=3)
184+
evaluation = dict(interval=200, save_best='auto')
185+
# resume_from = None
186+
# custom_hooks = [
187+
# dict(
188+
# type='ExpMomentumEMAHook',
189+
# resume_from=resume_from,
190+
# momentum=0.0003,
191+
# priority=49)
192+
# ]

0 commit comments

Comments
 (0)