From b917361ca55b034606bfafa1f0dac5834321e506 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Tue, 29 Apr 2025 18:03:38 +0800
Subject: [PATCH] [MISC] Clean up torch_npu (#688)

torch_npu 2.5.1 now supports autoloading. This patch:
1. removes unused `torch_npu` imports
2. replaces `torch_npu.npu` with `torch.npu`.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 examples/dp_offline/data_parallel.py             |  1 -
 tests/multicard/test_pyhccl_distributed.py       |  1 -
 tests/ops/test_rotary_embedding.py               |  1 -
 tests/singlecard/test_camem.py                   | 13 ++++---------
 tests/singlecard/test_pyhccl.py                  |  1 -
 vllm_ascend/attention/attention.py               |  9 ++-------
 vllm_ascend/device_allocator/camem.py            | 16 +++++-----------
 .../distributed/device_communicators/pyhccl.py   |  1 -
 vllm_ascend/ops/__init__.py                      |  3 +--
 .../patch/worker/patch_0_8_4/patch_metrics.py    |  3 +--
 vllm_ascend/platform.py                          |  2 --
 vllm_ascend/quantization/quant_config.py         |  1 -
 vllm_ascend/utils.py                             |  1 -
 vllm_ascend/worker/model_runner.py               |  7 +++----
 vllm_ascend/worker/pooling_model_runner.py       |  5 ++---
 15 files changed, 18 insertions(+), 47 deletions(-)

diff --git a/examples/dp_offline/data_parallel.py b/examples/dp_offline/data_parallel.py
index 1e94940..0299497 100644
--- a/examples/dp_offline/data_parallel.py
+++ b/examples/dp_offline/data_parallel.py
@@ -30,7 +30,6 @@ def main():
         for i in range(local_rank * tp_size, (local_rank + 1) * tp_size))
 
     import torch
-    import torch_npu  # noqa
     from vllm import LLM, SamplingParams
     from vllm.distributed.parallel_state import (
         destroy_distributed_environment, destroy_model_parallel)
diff --git a/tests/multicard/test_pyhccl_distributed.py b/tests/multicard/test_pyhccl_distributed.py
index 1b35d0f..42918c9 100644
--- a/tests/multicard/test_pyhccl_distributed.py
+++ b/tests/multicard/test_pyhccl_distributed.py
@@ -20,7 +20,6 @@ import multiprocessing
 import os
 
 import torch
-import torch_npu  # noqa: F401
 from vllm.distributed.parallel_state import (get_world_group,
                                              init_distributed_environment)
 from vllm.utils import update_environment_variables
diff --git a/tests/ops/test_rotary_embedding.py b/tests/ops/test_rotary_embedding.py
index 2ab0420..2d5ec18 100644
--- a/tests/ops/test_rotary_embedding.py
+++ b/tests/ops/test_rotary_embedding.py
@@ -9,7 +9,6 @@ from typing import Optional, Tuple, Union
 import pytest
 import torch
 import torch.nn as nn
-import torch_npu  # noqa: F401
 
 import vllm_ascend.platform  # noqa: F401
 
diff --git a/tests/singlecard/test_camem.py b/tests/singlecard/test_camem.py
index 7ebb70c..76e265c 100644
--- a/tests/singlecard/test_camem.py
+++ b/tests/singlecard/test_camem.py
@@ -25,11 +25,6 @@ from vllm.utils import GiB_bytes
 from tests.utils import fork_new_process_for_each_test
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 
-try:
-    import torch_npu  # noqa: F401
-except ImportError:
-    print("Failed to import torch_npu.")
-
 
 @fork_new_process_for_each_test
 def test_basic_camem():
@@ -53,9 +48,9 @@ def test_basic_camem():
     output = x + y + z
     assert torch.allclose(output, torch.ones_like(output) * 3)
 
-    free_bytes = torch_npu.npu.mem_get_info()[0]
+    free_bytes = torch.npu.mem_get_info()[0]
     allocator.sleep()
-    free_bytes_after_sleep = torch_npu.npu.mem_get_info()[0]
+    free_bytes_after_sleep = torch.npu.mem_get_info()[0]
     assert free_bytes_after_sleep > free_bytes
     allocator.wake_up()
 
@@ -67,7 +62,7 @@ def test_basic_camem():
 @fork_new_process_for_each_test
 def test_end_to_end():
     os.environ["VLLM_USE_V1"] = "0"
-    free, total = torch_npu.npu.mem_get_info()
+    free, total = torch.npu.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
     llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)
     prompt = "How are you?"
@@ -79,7 +74,7 @@ def test_end_to_end():
     # test sleep level 1 here.
     llm.sleep(level=1)
 
-    free_gpu_bytes_after_sleep, total = torch_npu.npu.mem_get_info()
+    free_gpu_bytes_after_sleep, total = torch.npu.mem_get_info()
     used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
     # now the memory usage should be less than the model weights
     # (0.5B model, 1GiB weights)
diff --git a/tests/singlecard/test_pyhccl.py b/tests/singlecard/test_pyhccl.py
index 8183b70..57621db 100644
--- a/tests/singlecard/test_pyhccl.py
+++ b/tests/singlecard/test_pyhccl.py
@@ -17,7 +17,6 @@
 # limitations under the License.
 #
 import torch
-import torch_npu  # noqa: F401
 
 from vllm_ascend.distributed.device_communicators.pyhccl_wrapper import \
     HCCLLibrary
diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py
index b179785..83ed446 100644
--- a/vllm_ascend/attention/attention.py
+++ b/vllm_ascend/attention/attention.py
@@ -20,14 +20,9 @@ from typing import Any, Dict, List, Optional, Tuple, Type
 
 import numpy as np
 import torch
-from torch.nn.functional import scaled_dot_product_attention
-
-try:
-    import torch_npu  # noqa: F401
-except ImportError:
-    print("Failed to import torch_npu.")
-
+import torch_npu
 import torchair._contrib.custom_torch_ops  # type: ignore  # noqa: F401
+from torch.nn.functional import scaled_dot_product_attention
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer,
                                               AttentionMetadata, AttentionType,
diff --git a/vllm_ascend/device_allocator/camem.py b/vllm_ascend/device_allocator/camem.py
index 7cfb690..d34c319 100644
--- a/vllm_ascend/device_allocator/camem.py
+++ b/vllm_ascend/device_allocator/camem.py
@@ -24,12 +24,6 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union
 import torch
 from acl.rt import memcpy  # type: ignore # noqa: F401
 from vllm.logger import logger
-
-try:
-    import torch_npu  # noqa: F401
-except ImportError:
-    print("Failed to import torch_npu.")
-
 from vllm.utils import is_pin_memory_available
 
 
@@ -95,10 +89,10 @@ def unmap_and_release(allocation_handle: HandleType) -> None:
 def get_pluggable_allocator(
     python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
     python_free_func: Callable[[int], tuple[int, int, int, int]]
-) -> torch_npu.npu.memory.NPUPluggableAllocator:
+) -> torch.npu.memory.NPUPluggableAllocator:
     init_module(python_malloc_fn, python_free_func)
-    new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(
-        lib_name, 'my_malloc', 'my_free')
+    new_alloc = torch.npu.memory.NPUPluggableAllocator(lib_name, 'my_malloc',
+                                                       'my_free')
     return new_alloc
 
 
@@ -107,8 +101,8 @@ def use_memory_pool_with_allocator(
         python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
         python_free_func: Callable[[int], tuple[int, int, int, int]]):
     new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func)
-    mem_pool = torch_npu.npu.memory.MemPool(new_alloc._allocator)
-    with torch_npu.npu.memory.use_mem_pool(mem_pool):
+    mem_pool = torch.npu.memory.MemPool(new_alloc._allocator)
+    with torch.npu.memory.use_mem_pool(mem_pool):
         yield mem_pool, new_alloc
 
 
diff --git a/vllm_ascend/distributed/device_communicators/pyhccl.py b/vllm_ascend/distributed/device_communicators/pyhccl.py
index 3c0ea87..984ece7 100644
--- a/vllm_ascend/distributed/device_communicators/pyhccl.py
+++ b/vllm_ascend/distributed/device_communicators/pyhccl.py
@@ -19,7 +19,6 @@ from typing import Optional, Union
 
 import torch
 import torch.distributed as dist
-import torch_npu  # noqa: F401
 from torch.distributed import ProcessGroup, ReduceOp
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import logger
diff --git a/vllm_ascend/ops/__init__.py b/vllm_ascend/ops/__init__.py
index 317024f..990ffea 100644
--- a/vllm_ascend/ops/__init__.py
+++ b/vllm_ascend/ops/__init__.py
@@ -16,7 +16,6 @@
 #
 
 import torch
-import torch_npu  # noqa: F401
 
 import vllm_ascend.ops.activation  # noqa
 import vllm_ascend.ops.common_fused_moe  # noqa
@@ -34,7 +33,7 @@ class dummyFusionOp:
 
 
 def register_dummy_fusion_op() -> None:
-    torch.cuda.CUDAGraph = torch_npu.npu.NPUGraph
+    torch.cuda.CUDAGraph = torch.npu.NPUGraph
     torch.ops._C.rms_norm = dummyFusionOp(name="rms_norm")
     torch.ops._C.fused_add_rms_norm = dummyFusionOp(name="fused_add_rms_norm")
     torch.ops._C.static_scaled_fp8_quant = dummyFusionOp(
diff --git a/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py b/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py
index 4ba223f..b3c98fc 100644
--- a/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py
+++ b/vllm_ascend/patch/worker/patch_0_8_4/patch_metrics.py
@@ -18,7 +18,6 @@
 from typing import Callable, Optional, Union
 
 import torch
-import torch_npu
 from vllm.spec_decode.metrics import (AsyncMetricsCollector,
                                       SpecDecodeWorkerMetrics)
 
@@ -36,7 +35,7 @@ def init_tensors(self,
     if isinstance(device_type, torch.device):
         device_type = device_type.type
     if device_type == 'npu':
-        self._copy_stream = torch_npu.npu.Stream()
+        self._copy_stream = torch.npu.Stream()
 
 
 def maybe_collect_rejsample_metrics(
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 4e4a397..6055a85 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -20,7 +20,6 @@ import os
 from typing import TYPE_CHECKING, Optional, Tuple
 
 import torch
-import torch_npu  # noqa: F401
 import vllm.envs as envs
 from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum
@@ -244,7 +243,6 @@ class NPUPlatform(Platform):
                                   timeout) -> None:
         from torch.distributed import ProcessGroup, is_hccl_available
         assert is_hccl_available()
-        import torch_npu  # noqa
         from torch_npu._C._distributed_c10d import ProcessGroupHCCL
         backend_options = ProcessGroupHCCL.Options()
         backend_options._timeout = timeout
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index da8e96b..adedaa7 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -19,7 +19,6 @@ from types import MappingProxyType
 from typing import Any, Callable, Dict, List, Mapping, Optional
 
 import torch
-import torch_npu  # noqa: F401
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 64e1425..778b129 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -17,7 +17,6 @@
 # Adapted from vllm-project/vllm/vllm/worker/worker.py
 #
 import torch
-import torch_npu  # noqa: F401
 from packaging.version import InvalidVersion, Version
 from vllm.logger import logger
 
diff --git a/vllm_ascend/worker/model_runner.py b/vllm_ascend/worker/model_runner.py
index f1425d4..e08fd08 100644
--- a/vllm_ascend/worker/model_runner.py
+++ b/vllm_ascend/worker/model_runner.py
@@ -28,7 +28,6 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set,
 import numpy as np
 import torch
 import torch.nn as nn
-import torch_npu
 import vllm.envs as envs
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.attention.backends.utils import CommonAttentionState
@@ -1145,7 +1144,7 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]):
                     device=self.device)
 
             self.execute_model(model_input, kv_caches, intermediate_tensors)
-            torch_npu.npu.synchronize()
+            torch.npu.synchronize()
             return
 
     def remove_all_loras(self):
@@ -1357,8 +1356,8 @@ class NPUModelRunner(NPUModelRunnerBase[ModelInputForNPUWithSamplingMetadata]):
 
         if (self.observability_config is not None
                 and self.observability_config.collect_model_forward_time):
-            model_forward_start = torch_npu.npu.Event(enable_timing=True)
-            model_forward_end = torch_npu.npu.Event(enable_timing=True)
+            model_forward_start = torch.npu.Event(enable_timing=True)
+            model_forward_end = torch.npu.Event(enable_timing=True)
             model_forward_start.record()
 
         if not bypass_model_exec:
diff --git a/vllm_ascend/worker/pooling_model_runner.py b/vllm_ascend/worker/pooling_model_runner.py
index f584a88..e1262fb 100644
--- a/vllm_ascend/worker/pooling_model_runner.py
+++ b/vllm_ascend/worker/pooling_model_runner.py
@@ -134,9 +134,8 @@ class NPUPoolingModelRunner(
         } if self.has_inner_state else {}
         if (self.observability_config is not None
                 and self.observability_config.collect_model_forward_time):
-            import torch_npu
-            model_forward_start = torch_npu.npu.Event(enable_timing=True)
-            model_forward_end = torch_npu.npu.Event(enable_timing=True)
+            model_forward_start = torch.npu.Event(enable_timing=True)
+            model_forward_end = torch.npu.Event(enable_timing=True)
             model_forward_start.record()
 
         cross_enc_kwargs = {}