From b917361ca55b034606bfafa1f0dac5834321e506 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 29 Apr 2025 18:03:38 +0800 Subject: [PATCH] [MISC] Clean up torch_npu (#688) torch_npu 2.5.1 supports autoload now. This patch does: 1. remove useless torch_npu imports 2. replace `torch_npu.npu` with `torch.npu`. Signed-off-by: wangxiyuan --- examples/dp_offline/data_parallel.py | 1 - tests/multicard/test_pyhccl_distributed.py | 1 - tests/ops/test_rotary_embedding.py | 1 - tests/singlecard/test_camem.py | 13 ++++--------- tests/singlecard/test_pyhccl.py | 1 - vllm_ascend/attention/attention.py | 9 ++------- vllm_ascend/device_allocator/camem.py | 16 +++++----------- .../distributed/device_communicators/pyhccl.py | 1 - vllm_ascend/ops/__init__.py | 3 +-- .../patch/worker/patch_0_8_4/patch_metrics.py | 3 +-- vllm_ascend/platform.py | 2 -- vllm_ascend/quantization/quant_config.py | 1 - vllm_ascend/utils.py | 1 - vllm_ascend/worker/model_runner.py | 7 +++---- vllm_ascend/worker/pooling_model_runner.py | 5 ++--- 15 files changed, 18 insertions(+), 47 deletions(-) diff --git a/examples/dp_offline/data_parallel.py b/examples/dp_offline/data_parallel.py index 1e94940..0299497 100644 --- a/examples/dp_offline/data_parallel.py +++ b/examples/dp_offline/data_parallel.py @@ -30,7 +30,6 @@ def main(): for i in range(local_rank * tp_size, (local_rank + 1) * tp_size)) import torch - import torch_npu # noqa from vllm import LLM, SamplingParams from vllm.distributed.parallel_state import ( destroy_distributed_environment, destroy_model_parallel) diff --git a/tests/multicard/test_pyhccl_distributed.py b/tests/multicard/test_pyhccl_distributed.py index 1b35d0f..42918c9 100644 --- a/tests/multicard/test_pyhccl_distributed.py +++ b/tests/multicard/test_pyhccl_distributed.py @@ -20,7 +20,6 @@ import multiprocessing import os import torch -import torch_npu # noqa: F401 from vllm.distributed.parallel_state import (get_world_group, init_distributed_environment) from vllm.utils import 
update_environment_variables diff --git a/tests/ops/test_rotary_embedding.py b/tests/ops/test_rotary_embedding.py index 2ab0420..2d5ec18 100644 --- a/tests/ops/test_rotary_embedding.py +++ b/tests/ops/test_rotary_embedding.py @@ -9,7 +9,6 @@ from typing import Optional, Tuple, Union import pytest import torch import torch.nn as nn -import torch_npu # noqa: F401 import vllm_ascend.platform # noqa: F401 diff --git a/tests/singlecard/test_camem.py b/tests/singlecard/test_camem.py index 7ebb70c..76e265c 100644 --- a/tests/singlecard/test_camem.py +++ b/tests/singlecard/test_camem.py @@ -25,11 +25,6 @@ from vllm.utils import GiB_bytes from tests.utils import fork_new_process_for_each_test from vllm_ascend.device_allocator.camem import CaMemAllocator -try: - import torch_npu # noqa: F401 -except ImportError: - print("Failed to import torch_npu.") - @fork_new_process_for_each_test def test_basic_camem(): @@ -53,9 +48,9 @@ def test_basic_camem(): output = x + y + z assert torch.allclose(output, torch.ones_like(output) * 3) - free_bytes = torch_npu.npu.mem_get_info()[0] + free_bytes = torch.npu.mem_get_info()[0] allocator.sleep() - free_bytes_after_sleep = torch_npu.npu.mem_get_info()[0] + free_bytes_after_sleep = torch.npu.mem_get_info()[0] assert free_bytes_after_sleep > free_bytes allocator.wake_up() @@ -67,7 +62,7 @@ def test_basic_camem(): @fork_new_process_for_each_test def test_end_to_end(): os.environ["VLLM_USE_V1"] = "0" - free, total = torch_npu.npu.mem_get_info() + free, total = torch.npu.mem_get_info() used_bytes_baseline = total - free # in case other process is running llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True) prompt = "How are you?" @@ -79,7 +74,7 @@ def test_end_to_end(): # test sleep level 1 here. 
llm.sleep(level=1) - free_gpu_bytes_after_sleep, total = torch_npu.npu.mem_get_info() + free_gpu_bytes_after_sleep, total = torch.npu.mem_get_info() used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline # now the memory usage should be less than the model weights # (0.5B model, 1GiB weights) diff --git a/tests/singlecard/test_pyhccl.py b/tests/singlecard/test_pyhccl.py index 8183b70..57621db 100644 --- a/tests/singlecard/test_pyhccl.py +++ b/tests/singlecard/test_pyhccl.py @@ -17,7 +17,6 @@ # limitations under the License. # import torch -import torch_npu # noqa: F401 from vllm_ascend.distributed.device_communicators.pyhccl_wrapper import \ HCCLLibrary diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index b179785..83ed446 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -20,14 +20,9 @@ from typing import Any, Dict, List, Optional, Tuple, Type import numpy as np import torch -from torch.nn.functional import scaled_dot_product_attention - -try: - import torch_npu # noqa: F401 -except ImportError: - print("Failed to import torch_npu.") - +import torch_npu import torchair._contrib.custom_torch_ops # type: ignore # noqa: F401 +from torch.nn.functional import scaled_dot_product_attention from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, AttentionMetadata, AttentionType, diff --git a/vllm_ascend/device_allocator/camem.py b/vllm_ascend/device_allocator/camem.py index 7cfb690..d34c319 100644 --- a/vllm_ascend/device_allocator/camem.py +++ b/vllm_ascend/device_allocator/camem.py @@ -24,12 +24,6 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union import torch from acl.rt import memcpy # type: ignore # noqa: F401 from vllm.logger import logger - -try: - import torch_npu # noqa: F401 -except ImportError: - print("Failed to import torch_npu.") - from vllm.utils import is_pin_memory_available @@ -95,10 +89,10 @@ def 
unmap_and_release(allocation_handle: HandleType) -> None: def get_pluggable_allocator( python_malloc_fn: Callable[[tuple[int, int, int, int]], None], python_free_func: Callable[[int], tuple[int, int, int, int]] -) -> torch_npu.npu.memory.NPUPluggableAllocator: +) -> torch.npu.memory.NPUPluggableAllocator: init_module(python_malloc_fn, python_free_func) - new_alloc = torch_npu.npu.memory.NPUPluggableAllocator( - lib_name, 'my_malloc', 'my_free') + new_alloc = torch.npu.memory.NPUPluggableAllocator(lib_name, 'my_malloc', + 'my_free') return new_alloc @@ -107,8 +101,8 @@ def use_memory_pool_with_allocator( python_malloc_fn: Callable[[tuple[int, int, int, int]], None], python_free_func: Callable[[int], tuple[int, int, int, int]]): new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func) - mem_pool = torch_npu.npu.memory.MemPool(new_alloc._allocator) - with torch_npu.npu.memory.use_mem_pool(mem_pool): + mem_pool = torch.npu.memory.MemPool(new_alloc._allocator) + with torch.npu.memory.use_mem_pool(mem_pool): yield mem_pool, new_alloc diff --git a/vllm_ascend/distributed/device_communicators/pyhccl.py b/vllm_ascend/distributed/device_communicators/pyhccl.py index 3c0ea87..984ece7 100644 --- a/vllm_ascend/distributed/device_communicators/pyhccl.py +++ b/vllm_ascend/distributed/device_communicators/pyhccl.py @@ -19,7 +19,6 @@ from typing import Optional, Union import torch import torch.distributed as dist -import torch_npu # noqa: F401 from torch.distributed import ProcessGroup, ReduceOp from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import logger diff --git a/vllm_ascend/ops/__init__.py b/vllm_ascend/ops/__init__.py index 317024f..990ffea 100644 --- a/vllm_ascend/ops/__init__.py +++ b/vllm_ascend/ops/__init__.py @@ -16,7 +16,6 @@ # import torch -import torch_npu # noqa: F401 import vllm_ascend.ops.activation # noqa import vllm_ascend.ops.common_fused_moe # noqa @@ -34,7 +33,7 @@ class dummyFusionOp: def register_dummy_fusion_op() 
-> None: - torch.cuda.CUDAGraph = torch_npu.npu.NPUGraph + torch.cuda.CUDAGraph = torch.npu.NPUGraph torch.ops._C.rms_norm = dummyFusionOp(name="rms_norm") torch.ops._C.fused_add_rms_norm = dummyFusionOp(name="fused_add_rms_norm") torch.ops._C.static_scaled_fp8_quant = dummyFusionOp( diff --git a/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py b/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py index 4ba223f..b3c98fc 100644 --- a/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py +++ b/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py @@ -18,7 +18,6 @@ from typing import Callable, Optional, Union import torch -import torch_npu from vllm.spec_decode.metrics import (AsyncMetricsCollector, SpecDecodeWorkerMetrics) @@ -36,7 +35,7 @@ def init_tensors(self, if isinstance(device_type, torch.device): device_type = device_type.type if device_type == 'npu': - self._copy_stream = torch_npu.npu.Stream() + self._copy_stream = torch.npu.Stream() def maybe_collect_rejsample_metrics( diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 4e4a397..6055a85 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -20,7 +20,6 @@ import os from typing import TYPE_CHECKING, Optional, Tuple import torch -import torch_npu # noqa: F401 import vllm.envs as envs from vllm.logger import logger from vllm.platforms import Platform, PlatformEnum @@ -244,7 +243,6 @@ class NPUPlatform(Platform): timeout) -> None: from torch.distributed import ProcessGroup, is_hccl_available assert is_hccl_available() - import torch_npu # noqa from torch_npu._C._distributed_c10d import ProcessGroupHCCL backend_options = ProcessGroupHCCL.Options() backend_options._timeout = timeout diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py index da8e96b..adedaa7 100644 --- a/vllm_ascend/quantization/quant_config.py +++ b/vllm_ascend/quantization/quant_config.py @@ -19,7 +19,6 @@ from types import MappingProxyType from typing import 
Any, Callable, Dict, List, Mapping, Optional import torch -import torch_npu # noqa: F401 from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 64e1425..778b129 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -17,7 +17,6 @@ # Adapted from vllm-project/vllm/vllm/worker/worker.py # import torch -import torch_npu # noqa: F401 from packaging.version import InvalidVersion, Version from vllm.logger import logger diff --git a/vllm_ascend/worker/model_runner.py b/vllm_ascend/worker/model_runner.py index f1425d4..e08fd08 100644 --- a/vllm_ascend/worker/model_runner.py +++ b/vllm_ascend/worker/model_runner.py @@ -28,7 +28,6 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, import numpy as np import torch import torch.nn as nn -import torch_npu import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.utils import CommonAttentionState @@ -1145,7 +1144,7 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]): device=self.device) self.execute_model(model_input, kv_caches, intermediate_tensors) - torch_npu.npu.synchronize() + torch.npu.synchronize() return def remove_all_loras(self): @@ -1357,8 +1356,8 @@ class NPUModelRunner(NPUModelRunnerBase[ModelInputForNPUWithSamplingMetadata]): if (self.observability_config is not None and self.observability_config.collect_model_forward_time): - model_forward_start = torch_npu.npu.Event(enable_timing=True) - model_forward_end = torch_npu.npu.Event(enable_timing=True) + model_forward_start = torch.npu.Event(enable_timing=True) + model_forward_end = torch.npu.Event(enable_timing=True) model_forward_start.record() if not bypass_model_exec: diff --git a/vllm_ascend/worker/pooling_model_runner.py b/vllm_ascend/worker/pooling_model_runner.py index 
f584a88..e1262fb 100644 --- a/vllm_ascend/worker/pooling_model_runner.py +++ b/vllm_ascend/worker/pooling_model_runner.py @@ -134,9 +134,8 @@ class NPUPoolingModelRunner( } if self.has_inner_state else {} if (self.observability_config is not None and self.observability_config.collect_model_forward_time): - import torch_npu - model_forward_start = torch_npu.npu.Event(enable_timing=True) - model_forward_end = torch_npu.npu.Event(enable_timing=True) + model_forward_start = torch.npu.Event(enable_timing=True) + model_forward_end = torch.npu.Event(enable_timing=True) model_forward_start.record() cross_enc_kwargs = {}