# NOTE(review): presumably names the base config(s) this file extends --
# confirm against the config loader that consumes `_base_`.
_base_ = ['models']

# Model settings. Each nested ``dict(type=..., ...)`` is a plain dictionary;
# the ``type`` strings are presumably resolved to classes by a registry
# elsewhere in the project -- verify before renaming any of them.
model = dict(
    type='R2Tuning',
    arch='ViT-B/32',
    init=False,
    dims=256,
    strides=(1, 2, 4, 8),
    buffer_size=1024,
    max_num_moment=50,
    # Adapter block settings, including its positional-encoding (`pos_cfg`)
    # and transformer-layer (`tem_cfg`) sub-configs.
    adapter_cfg=dict(
        type='R2Block',
        k=4,
        dropout=0.5,
        use_tef=True,
        pos_cfg=dict(type='PositionalEncoding', normalize=True, max_len=1024),
        tem_cfg=dict(
            type='TransformerDecoderLayer',
            heads=8,
            ratio=4,
            att_dropout=0.0,
            ffn_dropout=0.0,
            att_out_dropout=0.0,
            ffn_out_dropout=0.0,
            droppath=0.1,
            pre_norm=False,
            bias=True,
            norm_cfg=dict(type='LN'),
            act_cfg=dict(type='ReLU', inplace=True),
            order=('cross_att', 'self_att', 'ffn'),
            att_init_cfg=dict(type='xavier', distribution='uniform'),
            ffn_init_cfg=dict(type='kaiming'),
        ),
    ),
    pyramid_cfg=dict(type='ConvPyramid'),
    pooling_cfg=dict(type='AdaPooling'),
    # NOTE(review): the spelling `kernal_size` is kept deliberately -- it is a
    # keyword read by whatever builds `ConvHead`; check that signature before
    # "correcting" it to `kernel_size`.
    class_head_cfg=dict(type='ConvHead', kernal_size=3),
    coord_head_cfg=dict(type='ConvHead', kernal_size=3),
    # Loss terms, each carrying its own `loss_weight`.
    loss_cfg=dict(
        type='BundleLoss',
        sample_radius=1.5,
        loss_cls=dict(type='FocalLoss', loss_weight=1.0),
        loss_reg=dict(type='L1Loss', loss_weight=0.2),
        loss_sal=dict(type='SampledNCELoss', loss_weight=0.1),
        loss_video_cal=dict(type='InfoNCELoss', loss_weight=0.1),
        loss_layer_cal=dict(type='InfoNCELoss', loss_weight=0.1),
    ),
)