diff --git a/model_examples/GameFormer-Planner/GameFormer-Planner_NPU.patch b/model_examples/GameFormer-Planner/GameFormer-Planner_NPU.patch
index d8ba39882a29bbb5b6e632c7ac5ef4d0a1cf1e1d..d3137fe5668e91dcfd28f7a07e2cb6c10819ddf8 100644
--- a/model_examples/GameFormer-Planner/GameFormer-Planner_NPU.patch
+++ b/model_examples/GameFormer-Planner/GameFormer-Planner_NPU.patch
@@ -726,15 +726,6 @@ diff --git a/Planner/bezier_path.py b/Planner/bezier_path.py
 index 4863a01..762040e 100644
 --- a/Planner/bezier_path.py
 +++ b/Planner/bezier_path.py
-@@ -72,8 +72,6 @@ def bezier_derivatives_control_points(control_points, n_derivatives):
-     Compute control points of the successive derivatives of a given bezier curve.
- 
-     A derivative of a bezier curve is a bezier curve.
--    See https://pomax.github.io/bezierinfo/#derivatives
--    for detailed explanations
- 
-     :param control_points: (numpy array)
-     :param n_derivatives: (int)
 @@ -82,8 +80,9 @@ def bezier_derivatives_control_points(control_points, n_derivatives):
      """
      w = {0: control_points}
diff --git a/model_examples/QCNet/README.md b/model_examples/QCNet/README.md
index 13dc4cffe83907c6e867762ca220b3abc114efe0..b59f05da5e3d03a313b97c56b3d24423e5d466ac 100644
--- a/model_examples/QCNet/README.md
+++ b/model_examples/QCNet/README.md
@@ -105,7 +105,7 @@ code_path=model_examples/QCNet
    cd ..
    ```
 
-4. Install torch_geometric, torch_cluster, torch_scatter
+4. Install torch_geometric, torch_scatter
 
    ```
    git clone https://github.com/pyg-team/pytorch_geometric.git -b version_2_3_1
@@ -115,14 +115,6 @@ code_path=model_examples/QCNet
    pip install -e ./ --no-deps
    cd ..
 
-   git clone https://github.com/rusty1s/pytorch_cluster.git -b 1.6.1
-   cd pytorch_cluster
-   git checkout 84bbb7140e03df01b3bb388ba4df299328ea2dff
-   git apply ../patch/torch_cluster.patch
-   // Building this repository is slow, roughly 30 minutes
-   pip install -e ./ --no-deps
-   cd ..
- 
    git clone https://github.com/rusty1s/pytorch_scatter.git -b 2.1.0
    cd pytorch_scatter
    pip install -e ./ --no-deps
diff --git a/model_examples/QCNet/patch/qcnet.patch b/model_examples/QCNet/patch/qcnet.patch
index 3368dc381bfa7c7af53e7f75dfdc92f61c442de3..117927c2124abd986a4fda06cd9b09b610cf3256 100644
--- a/model_examples/QCNet/patch/qcnet.patch
+++ b/model_examples/QCNet/patch/qcnet.patch
@@ -1,8 +1,8 @@
 diff --git a/datamodules/argoverse_v2_datamodule.py b/datamodules/argoverse_v2_datamodule.py
-index 1b55133..2997824 100644
+index 1b55133..1494774 100644
 --- a/datamodules/argoverse_v2_datamodule.py
 +++ b/datamodules/argoverse_v2_datamodule.py
-@@ -13,7 +13,7 @@
+@@ -13,10 +13,11 @@
  # limitations under the License.
 
from typing import Callable, Optional @@ -11,6 +11,356 @@ index 1b55133..2997824 100644 from torch_geometric.loader import DataLoader from datasets import ArgoverseV2Dataset ++from .dataloader import DynamicBatchDataLoader + from transforms import TargetBuilder + + +@@ -40,6 +41,7 @@ class ArgoverseV2DataModule(pl.LightningDataModule): + train_transform: Optional[Callable] = TargetBuilder(50, 60), + val_transform: Optional[Callable] = TargetBuilder(50, 60), + test_transform: Optional[Callable] = None, ++ dynamic_sort: bool = False, + **kwargs) -> None: + super(ArgoverseV2DataModule, self).__init__() + self.root = root +@@ -59,6 +61,7 @@ class ArgoverseV2DataModule(pl.LightningDataModule): + self.train_transform = train_transform + self.val_transform = val_transform + self.test_transform = test_transform ++ self.dynamic_sort = dynamic_sort + + def prepare_data(self) -> None: + ArgoverseV2Dataset(self.root, 'train', self.train_raw_dir, self.train_processed_dir, self.train_transform) +@@ -74,9 +77,14 @@ class ArgoverseV2DataModule(pl.LightningDataModule): + self.test_transform) + + def train_dataloader(self): +- return DataLoader(self.train_dataset, batch_size=self.train_batch_size, shuffle=self.shuffle, +- num_workers=self.num_workers, pin_memory=self.pin_memory, +- persistent_workers=self.persistent_workers) ++ if self.dynamic_sort: ++ return DynamicBatchDataLoader(self.train_dataset, batch_size=self.train_batch_size, actual_batch_size=self.train_batch_size, shuffle=self.shuffle, ++ num_workers=self.num_workers, pin_memory=self.pin_memory, ++ persistent_workers=False) ++ else: ++ return DataLoader(self.train_dataset, batch_size=self.train_batch_size, shuffle=self.shuffle, ++ num_workers=self.num_workers, pin_memory=self.pin_memory, ++ persistent_workers=False) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.val_batch_size, shuffle=False, +diff --git a/datamodules/dataloader.py b/datamodules/dataloader.py +new file mode 100644 +index 0000000..8b3f220 +--- /dev/null ++++ b/datamodules/dataloader.py +@@ -0,0 +1,201 @@ ++import math ++from collections.abc import Mapping ++from typing import List, Optional, Sequence, Union ++ ++import torch.utils.data ++from torch.utils.data.dataloader import default_collate ++from torch.utils.data import Sampler ++from torch.utils.data.distributed import DistributedSampler ++from torch.utils.data import BatchSampler ++import torch.distributed as dist ++ ++from torch_geometric.data import Batch, Dataset ++from torch_geometric.data.data import BaseData ++from torch_geometric.data.datapipes import DatasetAdapter ++ ++import numpy as np ++ ++class Collater: ++ def __init__(self, follow_batch, exclude_keys): ++ self.follow_batch = follow_batch ++ self.exclude_keys = exclude_keys ++ ++ def __call__(self, batch): ++ elem = batch[0] ++ if isinstance(elem, BaseData): ++ return Batch.from_data_list(batch, self.follow_batch, ++ self.exclude_keys) ++ elif isinstance(elem, torch.Tensor): ++ return default_collate(batch) ++ elif isinstance(elem, float): ++ return torch.tensor(batch, dtype=torch.float) ++ elif isinstance(elem, int): ++ return torch.tensor(batch) ++ elif isinstance(elem, str): ++ return batch ++ elif isinstance(elem, Mapping): ++ return {key: self([data[key] for data in batch]) for key in elem} ++ elif isinstance(elem, tuple) and hasattr(elem, '_fields'): ++ return type(elem)(*(self(s) for s in zip(*batch))) ++ elif isinstance(elem, Sequence) and not isinstance(elem, str): ++ return [self(s) for s in zip(*batch)] ++ ++ raise 
TypeError(f'DataLoader found invalid type: {type(elem)}')
++
++    def collate(self, batch):  # pragma: no cover
++        # TODO Deprecated, remove soon.
++        return self(batch)
++
++class DynamicBatchSampler(BatchSampler):
++    def __init__(self, dataset, sampler, batch_size, drop_last=False):
++        super().__init__(sampler, batch_size, drop_last)
++        self.lengths = dataset.agents_num
++        self.batch_size = batch_size
++        self.drop_last = drop_last
++
++    def __iter__(self):
++        if self.drop_last:
++            sampler_iter = iter(self.sampler)
++            while True:
++                try:
++                    batch = [next(sampler_iter) for _ in range(self.batch_size)]
++                    yield batch
++                except StopIteration:
++                    break
++        else:
++            batch = [0] * self.batch_size
++            idx_in_batch = 0
++            for idx in self.sampler:
++                batch[idx_in_batch] = idx
++                idx_in_batch += 1
++                if idx_in_batch == self.batch_size:
++                    yield batch
++                    idx_in_batch = 0
++                    batch = [0] * self.batch_size
++            if idx_in_batch > 0:
++                yield batch[:idx_in_batch]
++
++class DynamicDistributedSampler(DistributedSampler):
++    def __init__(self, dataset: Dataset, batch_size, num_replicas: Optional[int] = None,
++                 rank: Optional[int] = None, shuffle: bool = True,
++                 seed: int = 0, drop_last: bool = False) -> None:
++        if num_replicas is None:
++            if not dist.is_available():
++                raise RuntimeError("Requires distributed package to be available")
++            num_replicas = dist.get_world_size()
++        if rank is None:
++            if not dist.is_available():
++                raise RuntimeError("Requires distributed package to be available")
++            rank = dist.get_rank()
++        if rank >= num_replicas or rank < 0:
++            raise ValueError(
++                f"Invalid rank {rank}, rank should be in the interval [0, {num_replicas - 1}]")
++        self.dataset = dataset
++        self.num_replicas = num_replicas
++        self.rank = rank
++        self.batch_size = batch_size
++        self.epoch = 0
++        self.drop_last = drop_last
++        self.lengths = dataset.agents_num
++        # If the dataset length is evenly divisible by # of replicas, then there
++        # is no need to drop any data, since the dataset will be split equally.
++        if self.drop_last and len(self.dataset) % self.num_replicas != 0:  # type: ignore[arg-type]
++            # Split to nearest available length that is evenly divisible.
++            # This is to ensure each rank receives the same amount of data when
++            # using this Sampler.
++            self.num_samples = math.ceil(
++                (len(self.dataset) - self.num_replicas) / self.num_replicas  # type: ignore[arg-type]
++            )
++        else:
++            self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)  # type: ignore[arg-type]
++        self.total_size = self.num_samples * self.num_replicas
++        self.shuffle = shuffle
++        self.seed = seed
++
++    def __iter__(self):
++        if self.shuffle:
++            # deterministically shuffle based on epoch and seed
++            g = torch.Generator()
++            g.manual_seed(self.seed + self.epoch)
++            indices = torch.randperm(len(self.dataset), generator=g).tolist()  # type: ignore[arg-type]
++        else:
++            indices = list(range(len(self.dataset)))  # type: ignore[arg-type]
++
++        if not self.drop_last:
++            # add extra samples to make it evenly divisible
++            padding_size = self.total_size - len(indices)
++            if padding_size <= len(indices):
++                indices += indices[:padding_size]
++                self.lengths = self.dataset.agents_num + self.dataset.agents_num[:padding_size]  # copy: don't extend agents_num in place every epoch
++            else:
++                indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
++                self.lengths = self.dataset.agents_num + (self.dataset.agents_num * math.ceil(padding_size / len(self.dataset.agents_num)))[:padding_size]  # copy, as above
++        else:
++            # remove tail of data to make it evenly divisible.
++ indices = indices[:self.total_size] ++ self.lengths = self.lengths[:self.total_size] ++ assert len(indices) == self.total_size ++ ++ index = np.argsort(np.array(self.lengths)[indices]) ++ combine_list = np.array(indices)[index].reshape(self.num_samples, self.num_replicas) ++ combine_list[1::2] = combine_list[1::2, ::-1] ++ combine_list = combine_list.transpose(1, 0) ++ ++ num_batch = self.num_samples // self.batch_size ++ for i in range(self.num_replicas): ++ arr = combine_list[i,:num_batch*self.batch_size].reshape(self.batch_size, num_batch) ++ arr[1::2] = arr[1::2, ::-1] ++ combine_list[i,:num_batch*self.batch_size] = arr.transpose(1, 0).reshape(-1) ++ indices = combine_list.transpose(1, 0).reshape(-1) ++ # subsample ++ indices = indices[self.rank:self.total_size:self.num_replicas] ++ assert len(indices) == self.num_samples ++ ++ return iter(indices) ++ ++class DynamicBatchDataLoader(torch.utils.data.DataLoader): ++ r"""A data loader which merges data objects from a ++ :class:`torch_geometric.data.Dataset` to a mini-batch. ++ Data objects can be either of type :class:`~torch_geometric.data.Data` or ++ :class:`~torch_geometric.data.HeteroData`. ++ ++ Args: ++ dataset (Dataset): The dataset from which to load the data. ++ batch_size (int, optional): How many samples per batch to load. ++ (default: :obj:`1`) ++ shuffle (bool, optional): If set to :obj:`True`, the data will be ++ reshuffled at every epoch. (default: :obj:`False`) ++ follow_batch (List[str], optional): Creates assignment batch ++ vectors for each key in the list. (default: :obj:`None`) ++ exclude_keys (List[str], optional): Will exclude each key in the ++ list. (default: :obj:`None`) ++ **kwargs (optional): Additional arguments of ++ :class:`torch.utils.data.DataLoader`. ++ """ ++ def __init__( ++ self, ++ dataset: Union[Dataset, Sequence[BaseData], DatasetAdapter], ++ batch_size: int, ++ actual_batch_size:int, ++ shuffle: bool = False, ++ follow_batch: Optional[List[str]] = None, ++ exclude_keys: Optional[List[str]] = None, ++ **kwargs, ++ ): ++ # Remove for PyTorch Lightning: ++ kwargs.pop('collate_fn', None) ++ kwargs.pop('batch_sampler', None) ++ ++ # Save for PyTorch Lightning < 1.6: ++ self.follow_batch = follow_batch ++ self.exclude_keys = exclude_keys ++ sampler = DynamicDistributedSampler(dataset, actual_batch_size, shuffle=shuffle) ++ ++ super().__init__( ++ dataset, ++ collate_fn=Collater(follow_batch, exclude_keys), ++ # batch_size = batch_size, ++ # sampler = sampler, ++ batch_sampler=DynamicBatchSampler(dataset, sampler, actual_batch_size), ++ **kwargs, ++ ) +diff --git a/datasets/argoverse_v2_dataset.py b/datasets/argoverse_v2_dataset.py +index f78b02f..776f8e1 100644 +--- a/datasets/argoverse_v2_dataset.py ++++ b/datasets/argoverse_v2_dataset.py +@@ -16,6 +16,7 @@ import os + import pickle + import shutil + import sys ++import json + from pathlib import Path + from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union + from urllib import request +@@ -105,6 +106,9 @@ class ArgoverseV2Dataset(Dataset): + else: + self._raw_file_names = [] + ++ self._raw_file_names = np.array(self._raw_file_names) ++ self._raw_file_names = sorted(self._raw_file_names) ++ + if processed_dir is None: + processed_dir = os.path.join(root, split, 'processed') + self._processed_dir = processed_dir +@@ -123,8 +127,8 @@ class ArgoverseV2Dataset(Dataset): + name.endswith(('pkl', 'pickle'))] + else: + self._processed_file_names = [] ++ self.dim = dim + +- self.dim = dim + self.num_historical_steps = 
num_historical_steps + self.num_future_steps = num_future_steps + self.num_steps = num_historical_steps + num_future_steps +@@ -136,6 +140,11 @@ class ArgoverseV2Dataset(Dataset): + 'val': 24988, + 'test': 24984, + }[split] ++ ++ self.agents_num = [0 for _ in range(self._num_samples)] ++ self._processed_file_names = np.array(self._processed_file_names) ++ self._processed_file_names = sorted(self._processed_file_names) ++ + self._agent_types = ['vehicle', 'pedestrian', 'motorcyclist', 'cyclist', 'bus', 'static', 'background', + 'construction', 'riderless_bicycle', 'unknown'] + self._agent_categories = ['TRACK_FRAGMENT', 'UNSCORED_TRACK', 'SCORED_TRACK', 'FOCAL_TRACK'] +@@ -148,6 +157,7 @@ class ArgoverseV2Dataset(Dataset): + self._point_sides = ['LEFT', 'RIGHT', 'CENTER'] + self._polygon_to_polygon_types = ['NONE', 'PRED', 'SUCC', 'LEFT', 'RIGHT'] + super(ArgoverseV2Dataset, self).__init__(root=root, transform=transform, pre_transform=None, pre_filter=None) ++ self.get_agents_num() + + @property + def raw_dir(self) -> str: +@@ -181,8 +191,27 @@ class ArgoverseV2Dataset(Dataset): + shutil.move(os.path.join(self.raw_dir, self.split, raw_file_name), self.raw_dir) + os.rmdir(os.path.join(self.raw_dir, self.split)) + ++ def get_agents_num(self) -> None: ++ if self.split != 'train': ++ return ++ agents_num_file_name = f"{self.split}_agents_num.json" ++ agents_num_file_path = os.path.join(self.processed_dir, agents_num_file_name) ++ if os.path.exists(agents_num_file_path): ++ with open(agents_num_file_path, "r") as handle: ++ self.agents_num = json.load(handle) ++ return ++ ++ for idx in tqdm(range(self._num_samples)): ++ data = self.get(idx) ++ self.agents_num[idx] = data["agent"]['num_nodes'] ++ ++ with open(agents_num_file_path, 'w') as handle: ++ json.dump(self.agents_num, handle) ++ + def process(self) -> None: +- for raw_file_name in tqdm(self.raw_file_names): ++ agents_num_file_name = f"{self.split}_agents_num.json" ++ agents_num_file_path = os.path.join(self.processed_dir, agents_num_file_name) ++ for idx, raw_file_name in tqdm(enumerate(self.raw_file_names)): + df = pd.read_parquet(os.path.join(self.raw_dir, raw_file_name, f'scenario_{raw_file_name}.parquet')) + map_dir = Path(self.raw_dir) / raw_file_name + map_path = map_dir / sorted(map_dir.glob('log_map_archive_*.json'))[0] +@@ -194,10 +223,14 @@ class ArgoverseV2Dataset(Dataset): + data['scenario_id'] = self.get_scenario_id(df) + data['city'] = self.get_city(df) + data['agent'] = self.get_agent_features(df) ++ self.agents_num[idx] = data["agent"]['num_nodes'] + data.update(self.get_map_features(map_api, centerlines)) + with open(os.path.join(self.processed_dir, f'{raw_file_name}.pkl'), 'wb') as handle: + pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL) + ++ with open(agents_num_file_path, 'w') as handle: ++ json.dump(self.agents_num, handle) ++ + @staticmethod + def get_scenario_id(df: pd.DataFrame) -> str: + return df['scenario_id'].values[0] +diff --git a/layers/__init__.py b/layers/__init__.py +index 6633b5a..2c8782b 100644 +--- a/layers/__init__.py ++++ b/layers/__init__.py +@@ -14,3 +14,4 @@ + from layers.attention_layer import AttentionLayer + from layers.fourier_embedding import FourierEmbedding + from layers.mlp_layer import MLPLayer ++from layers.radius import radius, radius_graph diff --git a/layers/attention_layer.py b/layers/attention_layer.py index 3b62e71..3410c78 100644 --- a/layers/attention_layer.py @@ -60,10 +410,196 @@ index 3b62e71..3410c78 100644 return self.to_out(agg) def _ff_block(self, x: 
torch.Tensor) -> torch.Tensor: +diff --git a/layers/fourier_embedding.py b/layers/fourier_embedding.py +index 092643b..be2a28c 100644 +--- a/layers/fourier_embedding.py ++++ b/layers/fourier_embedding.py +@@ -29,6 +29,7 @@ class FourierEmbedding(nn.Module): + super(FourierEmbedding, self).__init__() + self.input_dim = input_dim + self.hidden_dim = hidden_dim ++ self.pi2 = 2 * math.pi + + self.freqs = nn.Embedding(input_dim, num_freq_bands) if input_dim != 0 else None + self.mlps = nn.ModuleList( +@@ -55,7 +56,7 @@ class FourierEmbedding(nn.Module): + else: + raise ValueError('Both continuous_inputs and categorical_embs are None') + else: +- x = continuous_inputs.unsqueeze(-1) * self.freqs.weight * 2 * math.pi ++ x = self.pi2 * continuous_inputs.unsqueeze(-1) * self.freqs.weight + # Warning: if your data are noisy, don't use learnable sinusoidal embedding + x = torch.cat([x.cos(), x.sin(), continuous_inputs.unsqueeze(-1)], dim=-1) + continuous_embs: List[Optional[torch.Tensor]] = [None] * self.input_dim +diff --git a/layers/radius.py b/layers/radius.py +new file mode 100644 +index 0000000..677ba95 +--- /dev/null ++++ b/layers/radius.py +@@ -0,0 +1,134 @@ ++from typing import Optional ++ ++import torch ++import torch_npu ++import mx_driving ++ ++ ++def radius(x: torch.Tensor, y: torch.Tensor, r: float, ++ batch_x: Optional[torch.Tensor] = None, ++ batch_y: Optional[torch.Tensor] = None, max_num_neighbors: int = 32, ++ num_workers: int = 1) -> torch.Tensor: ++ r"""Finds for each element in :obj:`y` all points in :obj:`x` within ++ distance :obj:`r`. ++ ++ Args: ++ x (Tensor): Node feature matrix ++ :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`. ++ y (Tensor): Node feature matrix ++ :math:`\mathbf{Y} \in \mathbb{R}^{M \times F}`. ++ r (float): The radius. ++ batch_x (LongTensor, optional): Batch vector ++ :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^N`, which assigns each ++ node to a specific example. :obj:`batch_x` needs to be sorted. ++ (default: :obj:`None`) ++ batch_y (LongTensor, optional): Batch vector ++ :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^M`, which assigns each ++ node to a specific example. :obj:`batch_y` needs to be sorted. ++ (default: :obj:`None`) ++ max_num_neighbors (int, optional): The maximum number of neighbors to ++ return for each element in :obj:`y`. ++ If the number of actual neighbors is greater than ++ :obj:`max_num_neighbors`, returned neighbors are picked randomly. ++ (default: :obj:`32`) ++ num_workers (int): Number of workers to use for computation. Has no ++ effect in case :obj:`batch_x` or :obj:`batch_y` is not ++ :obj:`None`, or the input lies on the GPU. (default: :obj:`1`) ++ ++ .. 
code-block:: python ++ ++ import torch ++ from torch_cluster import radius ++ ++ x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]]) ++ batch_x = torch.tensor([0, 0, 0, 0]) ++ y = torch.Tensor([[-1, 0], [1, 0]]) ++ batch_y = torch.tensor([0, 0]) ++ assign_index = radius(x, y, 1.5, batch_x, batch_y) ++ """ ++ if x.numel() == 0 or y.numel() == 0: ++ return torch.empty(2, 0, dtype=torch.long, device=x.device) ++ ++ x = x.view(-1, 1) if x.dim() == 1 else x ++ y = y.view(-1, 1) if y.dim() == 1 else y ++ x, y = x.contiguous(), y.contiguous() ++ ++ batch_size = 1 ++ if batch_x is not None: ++ assert x.size(0) == batch_x.numel() ++ batch_size = int(batch_x.max()) + 1 ++ if batch_y is not None: ++ assert y.size(0) == batch_y.numel() ++ batch_size = max(batch_size, int(batch_y.max()) + 1) ++ ++ ptr_x: Optional[torch.Tensor] = None ++ ptr_y: Optional[torch.Tensor] = None ++ device = x.device ++ if batch_size > 1: ++ assert batch_x is not None ++ assert batch_y is not None ++ arange = torch.arange(batch_size + 1, device=x.device) ++ ptr_x = torch.bucketize(arange, batch_x).cpu() ++ ptr_y = torch.bucketize(arange, batch_y).cpu() ++ else: ++ ptr_x = torch.tensor([0, x.shape[0]]).to(device) ++ ptr_y = torch.tensor([0, y.shape[0]]).to(device) ++ ++ return mx_driving.radius(x.npu(), y.npu(), ptr_x.int().to(device), ptr_y.int().to(device), r, ++ max_num_neighbors) ++ ++ ++def radius_graph(x: torch.Tensor, r: float, ++ batch: Optional[torch.Tensor] = None, loop: bool = False, ++ max_num_neighbors: int = 32, flow: str = 'source_to_target', ++ num_workers: int = 1) -> torch.Tensor: ++ r"""Computes graph edges to all points within a given distance. ++ ++ Args: ++ x (Tensor): Node feature matrix ++ :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`. ++ r (float): The radius. ++ batch (LongTensor, optional): Batch vector ++ :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^N`, which assigns each ++ node to a specific example. :obj:`batch` needs to be sorted. ++ (default: :obj:`None`) ++ loop (bool, optional): If :obj:`True`, the graph will contain ++ self-loops. (default: :obj:`False`) ++ max_num_neighbors (int, optional): The maximum number of neighbors to ++ return for each element. ++ If the number of actual neighbors is greater than ++ :obj:`max_num_neighbors`, returned neighbors are picked randomly. ++ (default: :obj:`32`) ++ flow (string, optional): The flow direction when used in combination ++ with message passing (:obj:`"source_to_target"` or ++ :obj:`"target_to_source"`). (default: :obj:`"source_to_target"`) ++ num_workers (int): Number of workers to use for computation. Has no ++ effect in case :obj:`batch` is not :obj:`None`, or the input lies ++ on the GPU. (default: :obj:`1`) ++ ++ :rtype: :class:`LongTensor` ++ ++ .. 
code-block:: python ++ ++ import torch ++ from torch_cluster import radius_graph ++ ++ x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]]) ++ batch = torch.tensor([0, 0, 0, 0]) ++ edge_index = radius_graph(x, r=1.5, batch=batch, loop=False) ++ """ ++ ++ assert flow in ['source_to_target', 'target_to_source'] ++ edge_index = radius(x, x, r, batch, batch, ++ max_num_neighbors if loop else max_num_neighbors + 1, ++ num_workers) ++ if flow == 'source_to_target': ++ row, col = edge_index[1], edge_index[0] ++ else: ++ row, col = edge_index[0], edge_index[1] ++ ++ if not loop: ++ mask = row != col ++ row, col = row[mask], col[mask] ++ ++ return torch.stack([row, col], dim=0) diff --git a/modules/qcnet_agent_encoder.py b/modules/qcnet_agent_encoder.py -index 99d19a6..00bc9ad 100644 +index 99d19a6..615a6e2 100644 --- a/modules/qcnet_agent_encoder.py +++ b/modules/qcnet_agent_encoder.py +@@ -15,8 +15,6 @@ from typing import Dict, Mapping, Optional + + import torch + import torch.nn as nn +-from torch_cluster import radius +-from torch_cluster import radius_graph + from torch_geometric.data import Batch + from torch_geometric.data import HeteroData + from torch_geometric.utils import dense_to_sparse +@@ -24,6 +22,7 @@ from torch_geometric.utils import subgraph + + from layers.attention_layer import AttentionLayer + from layers.fourier_embedding import FourierEmbedding ++from layers import radius, radius_graph + from utils import angle_between_2d_vectors + from utils import weight_init + from utils import wrap_angle +@@ -101,6 +100,7 @@ class QCNetAgentEncoder(nn.Module): + head_vector_a = torch.stack([head_a.cos(), head_a.sin()], dim=-1) + pos_pl = data['map_polygon']['position'][:, :self.input_dim].contiguous() + orient_pl = data['map_polygon']['orientation'].contiguous() ++ + if self.dataset == 'argoverse_v2': + vel = data['agent']['velocity'][:, :self.num_historical_steps, :self.input_dim].contiguous() + length = width = height = None @@ -145,10 +145,17 @@ class QCNetAgentEncoder(nn.Module): pos_pl = pos_pl.repeat(self.num_historical_steps, 1) orient_pl = orient_pl.repeat(self.num_historical_steps) @@ -87,10 +623,27 @@ index 99d19a6..00bc9ad 100644 batch_s = torch.arange(self.num_historical_steps, device=pos_a.device).repeat_interleave(data['agent']['num_nodes']) diff --git a/modules/qcnet_decoder.py b/modules/qcnet_decoder.py -index 32066a5..1d2e8cc 100644 +index 32066a5..59e69e6 100644 --- a/modules/qcnet_decoder.py +++ b/modules/qcnet_decoder.py -@@ -83,9 +83,10 @@ class QCNetDecoder(nn.Module): +@@ -17,8 +17,6 @@ from typing import Dict, List, Mapping, Optional + import torch + import torch.nn as nn + import torch.nn.functional as F +-from torch_cluster import radius +-from torch_cluster import radius_graph + from torch_geometric.data import Batch + from torch_geometric.data import HeteroData + from torch_geometric.utils import dense_to_sparse +@@ -26,6 +24,7 @@ from torch_geometric.utils import dense_to_sparse + from layers import AttentionLayer + from layers import FourierEmbedding + from layers import MLPLayer ++from layers import radius, radius_graph + from utils import angle_between_2d_vectors + from utils import bipartite_dense_to_sparse + from utils import weight_init +@@ -83,9 +82,10 @@ class QCNetDecoder(nn.Module): num_freq_bands=num_freq_bands) self.y_emb = FourierEmbedding(input_dim=output_dim + output_head, hidden_dim=hidden_dim, num_freq_bands=num_freq_bands) @@ -104,6 +657,16 @@ index 32066a5..1d2e8cc 100644 self.t2m_propose_attn_layers = nn.ModuleList( 
[AttentionLayer(hidden_dim=hidden_dim, num_heads=num_heads, head_dim=head_dim, dropout=dropout, bipartite=True, has_pos_emb=True) for _ in range(num_layers)] +@@ -221,7 +221,8 @@ class QCNetDecoder(nn.Module): + m = m.reshape(-1, self.num_modes, self.hidden_dim).transpose(0, 1).reshape(-1, self.hidden_dim) + m = self.pl2m_propose_attn_layers[i]((x_pl, m), r_pl2m, edge_index_pl2m) + m = self.a2m_propose_attn_layers[i]((x_a, m), r_a2m, edge_index_a2m) +- m = m.reshape(self.num_modes, -1, self.hidden_dim).transpose(0, 1).reshape(-1, self.hidden_dim) ++ m = m.reshape(self.num_modes, -1, self.hidden_dim).transpose(0, 1) ++ m = m.reshape(-1, self.hidden_dim) + m = self.m2m_propose_attn_layer(m, None, edge_index_m2m) + m = m.reshape(-1, self.num_modes, self.hidden_dim) + locs_propose_pos[t] = self.to_loc_propose_pos(m) @@ -252,7 +253,9 @@ class QCNetDecoder(nn.Module): self.num_future_steps, 1)) m = self.y_emb(loc_propose_pos.detach().view(-1, self.output_dim)) @@ -115,11 +678,29 @@ index 32066a5..1d2e8cc 100644 for i in range(self.num_layers): m = self.t2m_refine_attn_layers[i]((x_t, m), r_t2m, edge_index_t2m) m = m.reshape(-1, self.num_modes, self.hidden_dim).transpose(0, 1).reshape(-1, self.hidden_dim) +diff --git a/modules/qcnet_map_encoder.py b/modules/qcnet_map_encoder.py +index 19e3817..c9d8d59 100644 +--- a/modules/qcnet_map_encoder.py ++++ b/modules/qcnet_map_encoder.py +@@ -15,12 +15,12 @@ from typing import Dict + + import torch + import torch.nn as nn +-from torch_cluster import radius_graph + from torch_geometric.data import Batch + from torch_geometric.data import HeteroData + + from layers.attention_layer import AttentionLayer + from layers.fourier_embedding import FourierEmbedding ++from layers import radius, radius_graph + from utils import angle_between_2d_vectors + from utils import merge_edges + from utils import weight_init diff --git a/predictors/qcnet.py b/predictors/qcnet.py -index 35ee89e..4033139 100644 +index 35ee89e..8ae80a4 100644 --- a/predictors/qcnet.py +++ b/predictors/qcnet.py -@@ -11,12 +11,13 @@ +@@ -11,17 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
@@ -134,22 +715,38 @@ index 35ee89e..4033139 100644 import torch import torch.nn as nn import torch.nn.functional as F -@@ -152,6 +153,10 @@ class QCNet(pl.LightningModule): + from torch_geometric.data import Batch + from torch_geometric.data import HeteroData ++from torch_npu.optim import NpuFusedAdamW ++import torch.distributed as dist + + from losses import MixtureNLLLoss + from losses import NLLLoss +@@ -71,6 +74,7 @@ class QCNet(pl.LightningModule): + T_max: int, + submission_dir: str, + submission_file_name: str, ++ profiling_step: int, + **kwargs) -> None: + super(QCNet, self).__init__() + self.save_hyperparameters() +@@ -152,6 +156,10 @@ class QCNet(pl.LightningModule): self.MR = MR(max_guesses=6) self.test_predictions = dict() + # for evaluating training speed + self.init_time = time.time() -+ self.profiling_step = 100 ++ self.profiling_step = profiling_step + self.avg_train_time = 0.0 def forward(self, data: HeteroData): scene_enc = self.encoder(data) -@@ -200,6 +205,11 @@ class QCNet(pl.LightningModule): +@@ -200,6 +208,12 @@ class QCNet(pl.LightningModule): prob=pi, mask=reg_mask[:, -1:]) * cls_mask cls_loss = cls_loss.sum() / cls_mask.sum().clamp_(min=1) -+ if batch_idx > 0 and batch_idx % self.profiling_step == 0: ++ ++ if dist.get_rank() == 0 and batch_idx > 0 and batch_idx % self.profiling_step == 0: + self.avg_train_time = (time.time() - self.init_time) / self.profiling_step + self.init_time = time.time() + print(f"Average Training Time (step {batch_idx - self.profiling_step}-{batch_idx}): {self.avg_train_time:.3f}s") @@ -157,8 +754,24 @@ index 35ee89e..4033139 100644 self.log('train_reg_loss_propose', reg_loss_propose, prog_bar=False, on_step=True, on_epoch=True, batch_size=1) self.log('train_reg_loss_refine', reg_loss_refine, prog_bar=False, on_step=True, on_epoch=True, batch_size=1) self.log('train_cls_loss', cls_loss, prog_bar=False, on_step=True, on_epoch=True, batch_size=1) +@@ -367,7 +381,7 @@ class QCNet(pl.LightningModule): + "weight_decay": 0.0}, + ] + +- optimizer = torch.optim.AdamW(optim_groups, lr=self.lr, weight_decay=self.weight_decay) ++ optimizer = NpuFusedAdamW(optim_groups, lr=self.lr, weight_decay=self.weight_decay) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=self.T_max, eta_min=0.0) + return [optimizer], [scheduler] + +@@ -402,4 +416,6 @@ class QCNet(pl.LightningModule): + parser.add_argument('--T_max', type=int, default=64) + parser.add_argument('--submission_dir', type=str, default='./') + parser.add_argument('--submission_file_name', type=str, default='submission') ++ parser.add_argument('--profiling_step', type=int, default=500) ++ parser.add_argument('--dynamic_sort', action="store_true", default=True) + return parent_parser diff --git a/train_qcnet.py b/train_qcnet.py -index 092b41c..0fcc46c 100644 +index 092b41c..cf96b6a 100644 --- a/train_qcnet.py +++ b/train_qcnet.py @@ -13,16 +13,22 @@ @@ -194,8 +807,9 @@ index 092b41c..0fcc46c 100644 model_checkpoint = ModelCheckpoint(monitor='val_minFDE', save_top_k=5, mode='min') lr_monitor = LearningRateMonitor(logging_interval='epoch') - trainer = pl.Trainer(accelerator=args.accelerator, devices=args.devices, +- strategy=DDPStrategy(find_unused_parameters=False, gradient_as_bucket_view=True), + trainer = Trainer(accelerator=args.accelerator, devices=args.devices, - strategy=DDPStrategy(find_unused_parameters=False, gradient_as_bucket_view=True), ++ strategy=DDPStrategy(find_unused_parameters=False, gradient_as_bucket_view=False), callbacks=[model_checkpoint, 
lr_monitor], max_epochs=args.max_epochs)
 trainer.fit(model, datamodule)
diff --git a/val.py
diff --git a/model_examples/QCNet/patch/torch_cluster.patch b/model_examples/QCNet/patch/torch_cluster.patch
deleted file mode 100644
index 5dc32cf8e1c6cd690f33dcfc0d7b2065f67cd966..0000000000000000000000000000000000000000
--- a/model_examples/QCNet/patch/torch_cluster.patch
+++ /dev/null
@@ -1,38 +0,0 @@
-diff --git a/torch_cluster/radius.py b/torch_cluster/radius.py
-index fd73b75..3569058 100644
---- a/torch_cluster/radius.py
-+++ b/torch_cluster/radius.py
-@@ -1,9 +1,10 @@
- from typing import Optional
- 
- import torch
-+import torch_npu
- 
- 
--@torch.jit.script
-+# @torch.jit.script
- def radius(x: torch.Tensor, y: torch.Tensor, r: float,
-            batch_x: Optional[torch.Tensor] = None,
-            batch_y: Optional[torch.Tensor] = None, max_num_neighbors: int = 32,
-@@ -66,14 +67,16 @@ def radius(x: torch.Tensor, y: torch.Tensor, r: float,
-         assert batch_x is not None
-         assert batch_y is not None
-         arange = torch.arange(batch_size + 1, device=x.device)
--        ptr_x = torch.bucketize(arange, batch_x)
--        ptr_y = torch.bucketize(arange, batch_y)
-+        ptr_x = torch.bucketize(arange, batch_x).cpu()
-+        ptr_y = torch.bucketize(arange, batch_y).cpu()
- 
--        return torch.ops.torch_cluster.radius(x, y, ptr_x, ptr_y, r,
--                                              max_num_neighbors, num_workers)
-+        ori_device = x.device
-+
-+        return torch.ops.torch_cluster.radius(x.cpu(), y.cpu(), ptr_x, ptr_y, r,
-+                                              max_num_neighbors, num_workers).to(ori_device)
- 
- 
--@torch.jit.script
-+# @torch.jit.script
- def radius_graph(x: torch.Tensor, r: float,
-                  batch: Optional[torch.Tensor] = None, loop: bool = False,
-                  max_num_neighbors: int = 32, flow: str = 'source_to_target',
diff --git a/model_examples/QCNet/script/train_performance.sh b/model_examples/QCNet/script/train_performance.sh
index 6d12725859d66b4fa6c8ecc550c1e73a45b2be61..b7097497dbf7d1e6a88fb72d3c360f0214d90e1b 100644
--- a/model_examples/QCNet/script/train_performance.sh
+++ b/model_examples/QCNet/script/train_performance.sh
@@ -10,9 +10,11 @@ export CPU_AFFINITY_CONF=1
 # Set the operator cache size; valid range [1, 10000000], default 10000. The default is fine in most cases.
 export ACLNN_CACHE_LIMIT=500000
 
+cur_path=$(pwd)
+ASCEND_DEVICE_ID=0
 # Replace /path/to/datasets with the path where the data is stored
 python QCNet/train_qcnet.py --root /path/to/datasets --train_batch_size 4 \
 --val_batch_size 4 --test_batch_size 4 --devices 8 --num_workers 8 --dataset argoverse_v2 \
 --num_historical_steps 50 --num_future_steps 60 --num_recurrent_steps 3 \
 --pl2pl_radius 150 --time_span 10 --pl2a_radius 50 --a2a_radius 50 \
- --num_t2m_steps 30 --pl2m_radius 150 --a2m_radius 150 --T_max 1 --max_epochs 1
+ --num_t2m_steps 30 --pl2m_radius 150 --a2m_radius 150 --T_max 1 --max_epochs 1 >$cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
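
Note on the dynamic batching scheme added above: `DynamicDistributedSampler` balances per-rank NPU workloads by sorting the training samples by agent count (`agents_num`) and dealing them out to ranks in a snake (boustrophedon) order. The following is a minimal standalone sketch of that ordering, not code from the patch; `snake_order` and the toy `lengths` values are invented for illustration.

```python
# Minimal sketch of length-balanced snake ordering, assuming per-sample
# lengths (e.g. agent counts) are known up front.
import numpy as np

def snake_order(lengths, num_replicas):
    """Sort sample indices by length, fold them into a grid in snake order,
    and return one column of indices per replica so every rank sees a
    similar mix of small and large samples."""
    n = len(lengths) - len(lengths) % num_replicas  # drop the ragged tail
    order = np.argsort(np.asarray(lengths[:n]))     # indices sorted by length
    grid = order.reshape(n // num_replicas, num_replicas)
    grid[1::2] = grid[1::2, ::-1]                   # reverse every other row
    return [grid[:, r].tolist() for r in range(num_replicas)]

lengths = [3, 18, 7, 42, 11, 25, 5, 30]
for r, idxs in enumerate(snake_order(lengths, num_replicas=2)):
    print(f"rank {r}: indices {idxs}, lengths {[lengths[i] for i in idxs]}")
```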
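The sorting is only possible because the patched `ArgoverseV2Dataset` caches one agent count per sample in a `<split>_agents_num.json` file next to the processed data. A hedged sketch of that cache-or-compute pattern follows; the helper name and the toy `get_size` function are invented for the example.

```python
# Sketch: compute per-sample sizes once, persist them as JSON, reload later.
import json
import os

def load_or_build_sizes(cache_path, num_samples, get_size):
    """Return a list with one 'size' (e.g. agent count) per sample index."""
    if os.path.exists(cache_path):
        with open(cache_path) as fh:
            return json.load(fh)
    sizes = [get_size(i) for i in range(num_samples)]  # expensive pass
    with open(cache_path, "w") as fh:
        json.dump(sizes, fh)
    return sizes

sizes = load_or_build_sizes("train_agents_num.json", 4, lambda i: (i * 7) % 13 + 1)
print(sizes)
```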
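For the `torch_cluster` replacement, the patch's own docstrings document the intended call signatures. The usage sketch below simply mirrors those docstring examples against the new `layers.radius`/`layers.radius_graph`; it assumes an Ascend environment with `torch_npu` and `mx_driving` installed, as the patch requires.

```python
import torch
from layers import radius, radius_graph  # the NPU-backed versions added above

x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
batch_x = torch.tensor([0, 0, 0, 0])
y = torch.tensor([[-1., 0.], [1., 0.]])
batch_y = torch.tensor([0, 0])

# All (y_i, x_j) pairs with ||y_i - x_j|| <= 1.5 within the same batch element.
assign_index = radius(x, y, 1.5, batch_x, batch_y)

# Edges between points of x within radius 1.5, self-loops excluded.
edge_index = radius_graph(x, r=1.5, batch=batch_x, loop=False)
```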