diff --git a/model_examples/BEVFormer/bev_former_config.patch b/model_examples/BEVFormer/bev_former_config.patch index 8c28df83dca4f9899c7c2ab7ac4ceeb8a05821a0..f9be43bd304d116870e86c5d0e0f32ace67601aa 100644 --- a/model_examples/BEVFormer/bev_former_config.patch +++ b/model_examples/BEVFormer/bev_former_config.patch @@ -1,17 +1,26 @@ diff --git a/projects/configs/bevformer/bevformer_base.py b/projects/configs/bevformer/bevformer_base.py -index fda635c..315ef73 100644 +index fda635c..e6197c6 100644 --- a/projects/configs/bevformer/bevformer_base.py +++ b/projects/configs/bevformer/bevformer_base.py -@@ -29,12 +29,13 @@ input_modality = dict( - use_external=True) - - _dim_ = 256 +@@ -26,18 +26,19 @@ input_modality = dict( + use_camera=True, + use_radar=False, + use_map=False, +- use_external=True) +- +-_dim_ = 256 -_pos_dim_ = _dim_//2 -_ffn_dim_ = _dim_*2 -_num_levels_ = 4 -bev_h_ = 200 -bev_w_ = 200 -queue_length = 4 # each sequence contains `queue_length` frames. +- +-model = dict( +- type='BEVFormer', ++ use_external=True) ++ ++_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 4 @@ -19,19 +28,31 @@ index fda635c..315ef73 100644 +bev_h_ = 200 +bev_w_ = 200 +queue_length = 4 # each sequence contains `queue_length` frames. - - model = dict( - type='BEVFormer', -@@ -58,12 +59,13 @@ model = dict( - start_level=0, - add_extra_convs='on_output', - num_outs=4, ++ ++model = dict( ++ type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + img_backbone=dict( +@@ -55,18 +56,19 @@ model = dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, +- start_level=0, +- add_extra_convs='on_output', +- num_outs=4, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='BEVFormerHead', - bev_h=bev_h_, - bev_w=bev_w_, - num_query=900, +- num_classes=10, +- in_channels=_dim_, +- sync_cls_avg_factor=True, ++ start_level=0, ++ add_extra_convs='on_output', ++ num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', @@ -39,19 +60,31 @@ index fda635c..315ef73 100644 + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, - num_classes=10, - in_channels=_dim_, - sync_cls_avg_factor=True, -@@ -80,12 +82,14 @@ model = dict( - num_layers=6, - pc_range=point_cloud_range, - num_points_in_pillar=4, ++ num_classes=10, ++ in_channels=_dim_, ++ sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( +@@ -77,18 +79,20 @@ model = dict( + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', +- num_layers=6, +- pc_range=point_cloud_range, +- num_points_in_pillar=4, - return_intermediate=False, - transformerlayers=dict( - type='BEVFormerLayer', - attn_cfgs=[ - dict( - type='TemporalSelfAttention', +- embed_dims=_dim_, +- num_levels=1), +- dict( ++ num_layers=6, ++ pc_range=point_cloud_range, ++ num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', @@ -60,13 +93,19 @@ index fda635c..315ef73 100644 + attn_cfgs=[ + dict( + type='TemporalSelfAttention', - embed_dims=_dim_, - num_levels=1), - dict( -@@ -223,13 +227,14 @@ data = dict( - classes=class_names, modality=input_modality), - shuffler_sampler=dict(type='DistributedGroupSampler'), - nonshuffler_sampler=dict(type='DistributedSampler') ++ embed_dims=_dim_, ++ num_levels=1), ++ dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( +@@ -220,35 +224,36 @@ data = dict( + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), +- classes=class_names, modality=input_modality), +- shuffler_sampler=dict(type='DistributedGroupSampler'), +- nonshuffler_sampler=dict(type='DistributedSampler') -) - -optimizer = dict( @@ -74,6 +113,12 @@ index fda635c..315ef73 100644 - lr=2e-4, - paramwise_cfg=dict( - custom_keys={ +- 'img_backbone': dict(lr_mult=0.1), +- }), +- weight_decay=0.01) ++ classes=class_names, modality=input_modality), ++ shuffler_sampler=dict(type='DistributedGroupSampler'), ++ nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( @@ -82,13 +127,15 @@ index fda635c..315ef73 100644 + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) -@@ -239,13 +244,13 @@ optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) - lr_config = dict( - policy='CosineAnnealing', - warmup='linear', ++ 'img_backbone': dict(lr_mult=0.1), ++ }), ++ weight_decay=0.01) + + optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) + # learning policy +-lr_config = dict( +- policy='CosineAnnealing', +- warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) @@ -96,6 +143,12 @@ index fda635c..315ef73 100644 -evaluation = dict(interval=1, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +-load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth' +-log_config = dict( +- interval=50, ++lr_config = dict( ++ policy='CosineAnnealing', ++ warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) @@ -103,9 +156,12 @@ index fda635c..315ef73 100644 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) - load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth' - log_config = dict( - interval=50, ++load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth' ++log_config = dict( ++ interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') diff --git a/projects/configs/bevformer_fp16/bevformer_base_fp16.py b/projects/configs/bevformer_fp16/bevformer_base_fp16.py new file mode 100644 index 0000000..7f39f01 @@ -458,20 +514,41 @@ index 93c7cd7..4ca3e63 100644 if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround return self.transformer.get_bev_features( +diff --git a/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py b/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py +index 5325e3c..144f3f2 100644 +--- a/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py ++++ b/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py +@@ -4,7 +4,6 @@ + # Modified by Zhiqi Li + # --------------------------------------------- + +-from tkinter.messagebox import NO + import torch + from mmcv.runner import force_fp32, auto_fp16 + from mmdet.models import DETECTORS diff --git a/projects/mmdet3d_plugin/bevformer/modules/decoder.py b/projects/mmdet3d_plugin/bevformer/modules/decoder.py -index 33024f8..4598eed 100644 +index 33024f8..e6debc2 100644 --- a/projects/mmdet3d_plugin/bevformer/modules/decoder.py +++ b/projects/mmdet3d_plugin/bevformer/modules/decoder.py -@@ -26,12 +26,13 @@ from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, - from mmcv.utils import ext_loader - from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ - MultiScaleDeformableAttnFunction_fp16 +@@ -23,18 +23,19 @@ from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +-from mmcv.utils import ext_loader +-from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ +- MultiScaleDeformableAttnFunction_fp16 - -ext_module = ext_loader.load_ext( - '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) - - -def inverse_sigmoid(x, eps=1e-5): +- """Inverse function of sigmoid. +- Args: +- x (Tensor): The tensor to do the ++from mmcv.utils import ext_loader ++from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ ++ MultiScaleDeformableAttnFunction_fp16 + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) @@ -479,13 +556,19 @@ index 33024f8..4598eed 100644 + + +def inverse_sigmoid(x, eps=1e-5): - """Inverse function of sigmoid. - Args: - x (Tensor): The tensor to do the -@@ -320,21 +321,14 @@ class CustomMSDeformableAttention(BaseModule): - * 0.5 - else: - raise ValueError( ++ """Inverse function of sigmoid. ++ Args: ++ x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. +@@ -317,27 +318,20 @@ class CustomMSDeformableAttention(BaseModule): + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ +- * 0.5 +- else: +- raise ValueError( - f'Last dim of reference_points must be' - f' 2 or 4, but get {reference_points.shape[-1]} instead.') - if torch.cuda.is_available() and value.is_cuda: @@ -501,6 +584,12 @@ index 33024f8..4598eed 100644 - else: - output = multi_scale_deformable_attn_pytorch( - value, spatial_shapes, sampling_locations, attention_weights) +- +- output = self.output_proj(output) +- ++ * 0.5 ++ else: ++ raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: @@ -509,17 +598,23 @@ index 33024f8..4598eed 100644 + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) - - output = self.output_proj(output) - ++ ++ output = self.output_proj(output) ++ + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) diff --git a/projects/mmdet3d_plugin/bevformer/modules/encoder.py b/projects/mmdet3d_plugin/bevformer/modules/encoder.py -index 6758847..4ca2608 100644 +index 6758847..b3e6f8d 100644 --- a/projects/mmdet3d_plugin/bevformer/modules/encoder.py +++ b/projects/mmdet3d_plugin/bevformer/modules/encoder.py -@@ -116,14 +116,14 @@ class BEVFormerEncoder(TransformerLayerSequence): - reference_points = reference_points.view( - D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) +@@ -113,20 +113,20 @@ class BEVFormerEncoder(TransformerLayerSequence): + D, B, num_query = reference_points.size()[:3] + num_cam = lidar2img.size(1) +- reference_points = reference_points.view( +- D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) +- - lidar2img = lidar2img.view( - 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) - @@ -528,6 +623,12 @@ index 6758847..4ca2608 100644 - eps = 1e-5 - - bev_mask = (reference_points_cam[..., 2:3] > eps) +- reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( +- reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) +- ++ reference_points = reference_points.view( ++ D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) ++ + lidar2img = lidar2img.view( + 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) + @@ -536,19 +637,31 @@ index 6758847..4ca2608 100644 + eps = 1e-5 + + bev_mask = (reference_points_cam[..., 2:3] > eps) - reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( - reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) ++ reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( ++ reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) ++ + reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] + reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] -@@ -259,12 +259,14 @@ class BEVFormerLayer(MyCustomBaseTransformerLayer): - Default: `LN`. - ffn_num_fcs (int): The number of fully-connected layers in FFNs. - Default:2. +@@ -256,36 +256,40 @@ class BEVFormerLayer(MyCustomBaseTransformerLayer): + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. +- Default: `LN`. +- ffn_num_fcs (int): The number of fully-connected layers in FFNs. +- Default:2. - """ - - def __init__(self, - attn_cfgs, - feedforward_channels, - ffn_dropout=0.0, +- operation_order=None, +- act_cfg=dict(type='ReLU', inplace=True), +- norm_cfg=dict(type='LN'), ++ Default: `LN`. ++ ffn_num_fcs (int): The number of fully-connected layers in FFNs. ++ Default:2. + """ + + def __init__(self, @@ -557,19 +670,30 @@ index 6758847..4ca2608 100644 + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, - operation_order=None, - act_cfg=dict(type='ReLU', inplace=True), - norm_cfg=dict(type='LN'), -@@ -277,12 +279,14 @@ class BEVFormerLayer(MyCustomBaseTransformerLayer): - operation_order=operation_order, - act_cfg=act_cfg, - norm_cfg=norm_cfg, ++ operation_order=None, ++ act_cfg=dict(type='ReLU', inplace=True), ++ norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(BEVFormerLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, +- operation_order=operation_order, +- act_cfg=act_cfg, +- norm_cfg=norm_cfg, - ffn_num_fcs=ffn_num_fcs, - **kwargs) - self.fp16_enabled = False - assert len(operation_order) == 6 - assert set(operation_order) == set( - ['self_attn', 'norm', 'cross_attn', 'ffn']) +- +- def forward(self, +- query, ++ operation_order=operation_order, ++ act_cfg=act_cfg, ++ norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + self.fp16_enabled = False @@ -578,13 +702,19 @@ index 6758847..4ca2608 100644 + assert len(operation_order) == 6 + assert set(operation_order) == set( + ['self_attn', 'norm', 'cross_attn', 'ffn']) - - def forward(self, - query, -@@ -364,15 +368,14 @@ class BEVFormerLayer(MyCustomBaseTransformerLayer): - identity if self.pre_norm else None, - query_pos=bev_pos, - key_pos=bev_pos, ++ ++ def forward(self, ++ query, + key=None, + value=None, + bev_pos=None, +@@ -361,21 +365,20 @@ class BEVFormerLayer(MyCustomBaseTransformerLayer): + query, + prev_bev, + prev_bev, +- identity if self.pre_norm else None, +- query_pos=bev_pos, +- key_pos=bev_pos, - attn_mask=attn_masks[attn_index], - key_padding_mask=query_key_padding_mask, - reference_points=ref_2d, @@ -594,6 +724,12 @@ index 6758847..4ca2608 100644 - **kwargs) - attn_index += 1 - identity = query +- +- elif layer == 'norm': +- query = self.norms[norm_index](query) ++ identity if self.pre_norm else None, ++ query_pos=bev_pos, ++ key_pos=bev_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + reference_points=ref_2d, @@ -602,23 +738,35 @@ index 6758847..4ca2608 100644 + **kwargs) + attn_index += 1 + identity = query ++ ++ elif layer == 'norm': ++ query = self.norms[norm_index](query) + norm_index += 1 - elif layer == 'norm': - query = self.norms[norm_index](query) + # spaital cross attention diff --git a/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py b/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py -index 100d94f..e1c293f 100644 +index 100d94f..f2db003 100644 --- a/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py +++ b/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py -@@ -23,12 +23,19 @@ from mmcv.runner.base_module import BaseModule, ModuleList, Sequential - from mmcv.utils import ext_loader - from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ - MultiScaleDeformableAttnFunction_fp16 +@@ -20,18 +20,25 @@ from mmcv.runner import force_fp32, auto_fp16 + + from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +-from mmcv.utils import ext_loader +-from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ +- MultiScaleDeformableAttnFunction_fp16 -from projects.mmdet3d_plugin.models.utils.bricks import run_time -ext_module = ext_loader.load_ext( - '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) - - -@ATTENTION.register_module() +-class SpatialCrossAttention(BaseModule): +- """An attention module used in BEVFormer. +- Args: ++from mmcv.utils import ext_loader ++from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ ++ MultiScaleDeformableAttnFunction_fp16 +from projects.mmdet3d_plugin.models.utils.bricks import run_time +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) @@ -632,13 +780,19 @@ index 100d94f..e1c293f 100644 + + +@ATTENTION.register_module() - class SpatialCrossAttention(BaseModule): - """An attention module used in BEVFormer. - Args: -@@ -132,47 +139,63 @@ class SpatialCrossAttention(BaseModule): - query = query + query_pos - - bs, num_query, _ = query.size() ++class SpatialCrossAttention(BaseModule): ++ """An attention module used in BEVFormer. ++ Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_cams (int): The number of cameras +@@ -129,53 +136,69 @@ class SpatialCrossAttention(BaseModule): + inp_residual = query + slots = torch.zeros_like(query) + if query_pos is not None: +- query = query + query_pos +- +- bs, num_query, _ = query.size() - - D = reference_points_cam.size(3) - indexes = [] @@ -649,6 +803,43 @@ index 100d94f..e1c293f 100644 - - # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. - queries_rebatch = query.new_zeros( +- [bs, self.num_cams, max_len, self.embed_dims]) +- reference_points_rebatch = reference_points_cam.new_zeros( +- [bs, self.num_cams, max_len, D, 2]) +- +- for j in range(bs): +- for i, reference_points_per_img in enumerate(reference_points_cam): +- index_query_per_img = indexes[i] +- queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] +- reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] +- +- num_cams, l, bs, embed_dims = key.shape +- +- key = key.permute(2, 0, 1, 3).reshape( +- bs * self.num_cams, l, self.embed_dims) +- value = value.permute(2, 0, 1, 3).reshape( +- bs * self.num_cams, l, self.embed_dims) +- +- queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, +- reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, +- level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) +- for j in range(bs): +- for i, index_query_per_img in enumerate(indexes): +- slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] +- +- count = bev_mask.sum(-1) > 0 +- count = count.permute(1, 2, 0).sum(-1) +- count = torch.clamp(count, min=1.0) +- slots = slots / count[..., None] +- slots = self.output_proj(slots) +- +- return self.dropout(slots) + inp_residual +- +- +-@ATTENTION.register_module() ++ query = query + query_pos ++ ++ bs, num_query, _ = query.size() + + D = reference_points_cam.size(3) + indexes = [] @@ -677,16 +868,7 @@ index 100d94f..e1c293f 100644 + + # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. + queries_rebatch = query.new_zeros( - [bs, self.num_cams, max_len, self.embed_dims]) -- reference_points_rebatch = reference_points_cam.new_zeros( -- [bs, self.num_cams, max_len, D, 2]) -- -- for j in range(bs): -- for i, reference_points_per_img in enumerate(reference_points_cam): -- index_query_per_img = indexes[i] -- queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] -- reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] -- ++ [bs, self.num_cams, max_len, self.embed_dims]) + reference_points_rebatch = reference_points_cam.new_zeros( + [bs, self.num_cams, max_len, D, 2]) + @@ -696,27 +878,7 @@ index 100d94f..e1c293f 100644 + queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] + reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] + - num_cams, l, bs, embed_dims = key.shape -- -- key = key.permute(2, 0, 1, 3).reshape( -- bs * self.num_cams, l, self.embed_dims) -- value = value.permute(2, 0, 1, 3).reshape( -- bs * self.num_cams, l, self.embed_dims) -- -- queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, -- reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, -- level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) -- for j in range(bs): -- for i, index_query_per_img in enumerate(indexes): -- slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] -- -- count = bev_mask.sum(-1) > 0 -- count = count.permute(1, 2, 0).sum(-1) -- count = torch.clamp(count, min=1.0) -- slots = slots / count[..., None] -- slots = self.output_proj(slots) -- -- return self.dropout(slots) + inp_residual ++ num_cams, l, bs, embed_dims = key.shape + + key = key.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) @@ -735,13 +897,19 @@ index 100d94f..e1c293f 100644 + slots = self.output_proj(slots) + + return self.dropout(slots) + inp_residual - - - @ATTENTION.register_module() -@@ -380,19 +403,14 @@ class MSDeformableAttention3D(BaseModule): - - # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 - # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points ++ ++ ++@ATTENTION.register_module() + class MSDeformableAttention3D(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. +@@ -377,23 +400,18 @@ class MSDeformableAttention3D(BaseModule): + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') +- +- # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 +- # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points - # - - if torch.cuda.is_available() and value.is_cuda: @@ -755,6 +923,12 @@ index 100d94f..e1c293f 100644 - else: - output = multi_scale_deformable_attn_pytorch( - value, spatial_shapes, sampling_locations, attention_weights) +- if not self.batch_first: +- output = output.permute(1, 0, 2) +- ++ ++ # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 ++ # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points + # + + if torch.cuda.is_available() and value.is_cuda: @@ -763,23 +937,33 @@ index 100d94f..e1c293f 100644 + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) - if not self.batch_first: - output = output.permute(1, 0, 2) - ++ if not self.batch_first: ++ output = output.permute(1, 0, 2) ++ + return output diff --git a/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py b/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py -index 78fb9f5..7d2d1e6 100644 +index 78fb9f5..88d863c 100644 --- a/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py +++ b/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py -@@ -18,12 +18,14 @@ from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, - to_2tuple) - - from mmcv.utils import ext_loader +@@ -15,18 +15,20 @@ from mmcv.cnn.bricks.registry import ATTENTION + import math + from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, +- to_2tuple) +- +-from mmcv.utils import ext_loader -ext_module = ext_loader.load_ext( - '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) - - -@ATTENTION.register_module() -class TemporalSelfAttention(BaseModule): +- """An attention module used in BEVFormer based on Deformable-Detr. +- +- `Deformable DETR: Deformable Transformers for End-to-End Object Detection. ++ to_2tuple) ++ ++from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + @@ -788,13 +972,19 @@ index 78fb9f5..7d2d1e6 100644 + +@ATTENTION.register_module() +class TemporalSelfAttention(BaseModule): - """An attention module used in BEVFormer based on Deformable-Detr. ++ """An attention module used in BEVFormer based on Deformable-Detr. ++ ++ `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. - `Deformable DETR: Deformable Transformers for End-to-End Object Detection. -@@ -235,21 +237,14 @@ class TemporalSelfAttention(BaseModule): - * 0.5 - else: - raise ValueError( + Args: +@@ -232,27 +234,20 @@ class TemporalSelfAttention(BaseModule): + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ +- * 0.5 +- else: +- raise ValueError( - f'Last dim of reference_points must be' - f' 2 or 4, but get {reference_points.shape[-1]} instead.') - if torch.cuda.is_available() and value.is_cuda: @@ -810,6 +1000,12 @@ index 78fb9f5..7d2d1e6 100644 - else: - - output = multi_scale_deformable_attn_pytorch( +- value, spatial_shapes, sampling_locations, attention_weights) +- +- # output shape (bs*num_bev_queue, num_query, embed_dims) ++ * 0.5 ++ else: ++ raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: @@ -818,17 +1014,23 @@ index 78fb9f5..7d2d1e6 100644 + else: + + output = multi_scale_deformable_attn_pytorch( - value, spatial_shapes, sampling_locations, attention_weights) ++ value, spatial_shapes, sampling_locations, attention_weights) ++ ++ # output shape (bs*num_bev_queue, num_query, embed_dims) + # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue) + output = output.permute(1, 2, 0) - # output shape (bs*num_bev_queue, num_query, embed_dims) diff --git a/projects/mmdet3d_plugin/bevformer/modules/transformer.py b/projects/mmdet3d_plugin/bevformer/modules/transformer.py -index b740fcc..afe081b 100644 +index b740fcc..d62388b 100644 --- a/projects/mmdet3d_plugin/bevformer/modules/transformer.py +++ b/projects/mmdet3d_plugin/bevformer/modules/transformer.py -@@ -12,13 +12,13 @@ from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence - from mmcv.runner.base_module import BaseModule - - from mmdet.models.utils.builder import TRANSFORMER +@@ -9,19 +9,19 @@ import torch + import torch.nn as nn + from mmcv.cnn import xavier_init + from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence +-from mmcv.runner.base_module import BaseModule +- +-from mmdet.models.utils.builder import TRANSFORMER -from torch.nn.init import normal_ -from projects.mmdet3d_plugin.models.utils.visual import save_tensor -from mmcv.runner.base_module import BaseModule @@ -836,6 +1038,12 @@ index b740fcc..afe081b 100644 -from .temporal_self_attention import TemporalSelfAttention -from .spatial_cross_attention import MSDeformableAttention3D -from .decoder import CustomMSDeformableAttention +-from projects.mmdet3d_plugin.models.utils.bricks import run_time +-from mmcv.runner import force_fp32, auto_fp16 +- ++from mmcv.runner.base_module import BaseModule ++ ++from mmdet.models.utils.builder import TRANSFORMER +from torch.nn.init import normal_ +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from mmcv.runner.base_module import BaseModule @@ -843,19 +1051,31 @@ index b740fcc..afe081b 100644 +from .temporal_self_attention import TemporalSelfAttention +from .spatial_cross_attention import MSDeformableAttention3D +from .decoder import CustomMSDeformableAttention - from projects.mmdet3d_plugin.models.utils.bricks import run_time - from mmcv.runner import force_fp32, auto_fp16 ++from projects.mmdet3d_plugin.models.utils.bricks import run_time ++from mmcv.runner import force_fp32, auto_fp16 ++ -@@ -147,12 +147,13 @@ class PerceptionTransformer(BaseModule): - for i in range(bs): - # num_prev_bev = prev_bev.size(1) - rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + @TRANSFORMER.register_module() + class PerceptionTransformer(BaseModule): +@@ -144,18 +144,19 @@ class PerceptionTransformer(BaseModule): + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: +- for i in range(bs): +- # num_prev_bev = prev_bev.size(1) +- rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] - tmp_prev_bev = prev_bev[:, i].reshape( - bev_h, bev_w, -1).permute(2, 0, 1) - tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, - center=self.rotate_center) - tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( - bev_h * bev_w, 1, -1) +- prev_bev[:, i] = tmp_prev_bev[:, 0] +- +- # add can bus signals ++ for i in range(bs): ++ # num_prev_bev = prev_bev.size(1) ++ rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, @@ -863,9 +1083,12 @@ index b740fcc..afe081b 100644 + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) - prev_bev[:, i] = tmp_prev_bev[:, 0] - - # add can bus signals ++ prev_bev[:, i] = tmp_prev_bev[:, 0] ++ ++ # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] diff --git a/projects/mmdet3d_plugin/datasets/builder.py b/projects/mmdet3d_plugin/datasets/builder.py index f9bf5be..9586277 100644 --- a/projects/mmdet3d_plugin/datasets/builder.py @@ -954,18 +1177,30 @@ index 4ac9a15..faa970b 100755 +torchrun --nproc_per_node=$GPUS --master_port=$PORT \ $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic diff --git a/tools/fp16/train.py b/tools/fp16/train.py -index eddc349..631be33 100644 +index eddc349..acf9e2a 100644 --- a/tools/fp16/train.py +++ b/tools/fp16/train.py -@@ -23,6 +23,8 @@ from mmdet.apis import set_random_seed - from mmseg import __version__ as mmseg_version - - from mmcv.utils import TORCH_VERSION, digit_version +@@ -20,12 +20,14 @@ from mmdet3d.datasets import build_dataset + from mmdet3d.models import build_model + from mmdet3d.utils import collect_env, get_root_logger + from mmdet.apis import set_random_seed +-from mmseg import __version__ as mmseg_version +- +-from mmcv.utils import TORCH_VERSION, digit_version +- +-def parse_args(): +- parser = argparse.ArgumentParser(description='Train a detector') ++from mmseg import __version__ as mmseg_version ++ ++from mmcv.utils import TORCH_VERSION, digit_version +import torch_npu +from torch_npu.contrib import transfer_to_npu - - def parse_args(): - parser = argparse.ArgumentParser(description='Train a detector') ++ ++def parse_args(): ++ parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( diff --git a/tools/test.py b/tools/test.py index acce20b..12ed94e 100755 --- a/tools/test.py