_base_ = ['models']
# model settings
model = dict(
type='R2Tuning',
arch='ViT-B/32',
init=False,
dims=256,
strides=(1, 2, 4, 8),
buffer_size=1024,
max_num_moment=50,
adapter_cfg=dict(
type='R2Block',
k=4,
dropout=0.5,
use_tef=True,
pos_cfg=dict(type='PositionalEncoding', normalize=True, max_len=1024),
tem_cfg=dict(
type='TransformerDecoderLayer',
heads=8,
ratio=4,
att_dropout=0.0,
ffn_dropout=0.0,
att_out_dropout=0.0,
ffn_out_dropout=0.0,
droppath=0.1,
pre_norm=False,
bias=True,
norm_cfg=dict(type='LN'),
act_cfg=dict(type='ReLU', inplace=True),
order=('cross_att', 'self_att', 'ffn'),
att_init_cfg=dict(type='xavier', distribution='uniform'),
ffn_init_cfg=dict(type='kaiming'))),
pyramid_cfg=dict(type='ConvPyramid'),
pooling_cfg=dict(type='AdaPooling'),
class_head_cfg=dict(type='ConvHead', kernal_size=3),
coord_head_cfg=dict(type='ConvHead', kernal_size=3),
loss_cfg=dict(
type='BundleLoss',
sample_radius=1.5,
loss_cls=dict(type='FocalLoss', loss_weight=1.0),
loss_reg=dict(type='L1Loss', loss_weight=0.2),
loss_sal=dict(type='SampledNCELoss', loss_weight=0.1),
loss_video_cal=dict(type='InfoNCELoss', loss_weight=0.1),
loss_layer_cal=dict(type='InfoNCELoss', loss_weight=0.1)))