[MISC] Clean up torch_npu (#688)
torch_npu 2.5.1 now supports autoload. This patch: 1. removes the useless torch_npu imports; 2. replaces `torch_npu.npu` with `torch.npu`. Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -30,7 +30,6 @@ def main():
|
|||||||
for i in range(local_rank * tp_size, (local_rank + 1) * tp_size))
|
for i in range(local_rank * tp_size, (local_rank + 1) * tp_size))
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch_npu # noqa
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.distributed.parallel_state import (
|
from vllm.distributed.parallel_state import (
|
||||||
destroy_distributed_environment, destroy_model_parallel)
|
destroy_distributed_environment, destroy_model_parallel)
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ import multiprocessing
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch_npu # noqa: F401
|
|
||||||
from vllm.distributed.parallel_state import (get_world_group,
|
from vllm.distributed.parallel_state import (get_world_group,
|
||||||
init_distributed_environment)
|
init_distributed_environment)
|
||||||
from vllm.utils import update_environment_variables
|
from vllm.utils import update_environment_variables
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ from typing import Optional, Tuple, Union
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import torch_npu # noqa: F401
|
|
||||||
|
|
||||||
import vllm_ascend.platform # noqa: F401
|
import vllm_ascend.platform # noqa: F401
|
||||||
|
|
||||||
|
|||||||
@@ -25,11 +25,6 @@ from vllm.utils import GiB_bytes
|
|||||||
from tests.utils import fork_new_process_for_each_test
|
from tests.utils import fork_new_process_for_each_test
|
||||||
from vllm_ascend.device_allocator.camem import CaMemAllocator
|
from vllm_ascend.device_allocator.camem import CaMemAllocator
|
||||||
|
|
||||||
try:
|
|
||||||
import torch_npu # noqa: F401
|
|
||||||
except ImportError:
|
|
||||||
print("Failed to import torch_npu.")
|
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_basic_camem():
|
def test_basic_camem():
|
||||||
@@ -53,9 +48,9 @@ def test_basic_camem():
|
|||||||
output = x + y + z
|
output = x + y + z
|
||||||
assert torch.allclose(output, torch.ones_like(output) * 3)
|
assert torch.allclose(output, torch.ones_like(output) * 3)
|
||||||
|
|
||||||
free_bytes = torch_npu.npu.mem_get_info()[0]
|
free_bytes = torch.npu.mem_get_info()[0]
|
||||||
allocator.sleep()
|
allocator.sleep()
|
||||||
free_bytes_after_sleep = torch_npu.npu.mem_get_info()[0]
|
free_bytes_after_sleep = torch.npu.mem_get_info()[0]
|
||||||
assert free_bytes_after_sleep > free_bytes
|
assert free_bytes_after_sleep > free_bytes
|
||||||
allocator.wake_up()
|
allocator.wake_up()
|
||||||
|
|
||||||
@@ -67,7 +62,7 @@ def test_basic_camem():
|
|||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_end_to_end():
|
def test_end_to_end():
|
||||||
os.environ["VLLM_USE_V1"] = "0"
|
os.environ["VLLM_USE_V1"] = "0"
|
||||||
free, total = torch_npu.npu.mem_get_info()
|
free, total = torch.npu.mem_get_info()
|
||||||
used_bytes_baseline = total - free # in case other process is running
|
used_bytes_baseline = total - free # in case other process is running
|
||||||
llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)
|
llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)
|
||||||
prompt = "How are you?"
|
prompt = "How are you?"
|
||||||
@@ -79,7 +74,7 @@ def test_end_to_end():
|
|||||||
# test sleep level 1 here.
|
# test sleep level 1 here.
|
||||||
llm.sleep(level=1)
|
llm.sleep(level=1)
|
||||||
|
|
||||||
free_gpu_bytes_after_sleep, total = torch_npu.npu.mem_get_info()
|
free_gpu_bytes_after_sleep, total = torch.npu.mem_get_info()
|
||||||
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
|
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
|
||||||
# now the memory usage should be less than the model weights
|
# now the memory usage should be less than the model weights
|
||||||
# (0.5B model, 1GiB weights)
|
# (0.5B model, 1GiB weights)
|
||||||
|
|||||||
@@ -17,7 +17,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
import torch
|
import torch
|
||||||
import torch_npu # noqa: F401
|
|
||||||
|
|
||||||
from vllm_ascend.distributed.device_communicators.pyhccl_wrapper import \
|
from vllm_ascend.distributed.device_communicators.pyhccl_wrapper import \
|
||||||
HCCLLibrary
|
HCCLLibrary
|
||||||
|
|||||||
@@ -20,14 +20,9 @@ from typing import Any, Dict, List, Optional, Tuple, Type
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from torch.nn.functional import scaled_dot_product_attention
|
import torch_npu
|
||||||
|
|
||||||
try:
|
|
||||||
import torch_npu # noqa: F401
|
|
||||||
except ImportError:
|
|
||||||
print("Failed to import torch_npu.")
|
|
||||||
|
|
||||||
import torchair._contrib.custom_torch_ops # type: ignore # noqa: F401
|
import torchair._contrib.custom_torch_ops # type: ignore # noqa: F401
|
||||||
|
from torch.nn.functional import scaled_dot_product_attention
|
||||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||||
AttentionLayer,
|
AttentionLayer,
|
||||||
AttentionMetadata, AttentionType,
|
AttentionMetadata, AttentionType,
|
||||||
|
|||||||
@@ -24,12 +24,6 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union
|
|||||||
import torch
|
import torch
|
||||||
from acl.rt import memcpy # type: ignore # noqa: F401
|
from acl.rt import memcpy # type: ignore # noqa: F401
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
|
|
||||||
try:
|
|
||||||
import torch_npu # noqa: F401
|
|
||||||
except ImportError:
|
|
||||||
print("Failed to import torch_npu.")
|
|
||||||
|
|
||||||
from vllm.utils import is_pin_memory_available
|
from vllm.utils import is_pin_memory_available
|
||||||
|
|
||||||
|
|
||||||
@@ -95,10 +89,10 @@ def unmap_and_release(allocation_handle: HandleType) -> None:
|
|||||||
def get_pluggable_allocator(
|
def get_pluggable_allocator(
|
||||||
python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
|
python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
|
||||||
python_free_func: Callable[[int], tuple[int, int, int, int]]
|
python_free_func: Callable[[int], tuple[int, int, int, int]]
|
||||||
) -> torch_npu.npu.memory.NPUPluggableAllocator:
|
) -> torch.npu.memory.NPUPluggableAllocator:
|
||||||
init_module(python_malloc_fn, python_free_func)
|
init_module(python_malloc_fn, python_free_func)
|
||||||
new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(
|
new_alloc = torch.npu.memory.NPUPluggableAllocator(lib_name, 'my_malloc',
|
||||||
lib_name, 'my_malloc', 'my_free')
|
'my_free')
|
||||||
return new_alloc
|
return new_alloc
|
||||||
|
|
||||||
|
|
||||||
@@ -107,8 +101,8 @@ def use_memory_pool_with_allocator(
|
|||||||
python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
|
python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
|
||||||
python_free_func: Callable[[int], tuple[int, int, int, int]]):
|
python_free_func: Callable[[int], tuple[int, int, int, int]]):
|
||||||
new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func)
|
new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func)
|
||||||
mem_pool = torch_npu.npu.memory.MemPool(new_alloc._allocator)
|
mem_pool = torch.npu.memory.MemPool(new_alloc._allocator)
|
||||||
with torch_npu.npu.memory.use_mem_pool(mem_pool):
|
with torch.npu.memory.use_mem_pool(mem_pool):
|
||||||
yield mem_pool, new_alloc
|
yield mem_pool, new_alloc
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,6 @@ from typing import Optional, Union
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
import torch_npu # noqa: F401
|
|
||||||
from torch.distributed import ProcessGroup, ReduceOp
|
from torch.distributed import ProcessGroup, ReduceOp
|
||||||
from vllm.distributed.utils import StatelessProcessGroup
|
from vllm.distributed.utils import StatelessProcessGroup
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
|
|||||||
@@ -16,7 +16,6 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch_npu # noqa: F401
|
|
||||||
|
|
||||||
import vllm_ascend.ops.activation # noqa
|
import vllm_ascend.ops.activation # noqa
|
||||||
import vllm_ascend.ops.common_fused_moe # noqa
|
import vllm_ascend.ops.common_fused_moe # noqa
|
||||||
@@ -34,7 +33,7 @@ class dummyFusionOp:
|
|||||||
|
|
||||||
|
|
||||||
def register_dummy_fusion_op() -> None:
|
def register_dummy_fusion_op() -> None:
|
||||||
torch.cuda.CUDAGraph = torch_npu.npu.NPUGraph
|
torch.cuda.CUDAGraph = torch.npu.NPUGraph
|
||||||
torch.ops._C.rms_norm = dummyFusionOp(name="rms_norm")
|
torch.ops._C.rms_norm = dummyFusionOp(name="rms_norm")
|
||||||
torch.ops._C.fused_add_rms_norm = dummyFusionOp(name="fused_add_rms_norm")
|
torch.ops._C.fused_add_rms_norm = dummyFusionOp(name="fused_add_rms_norm")
|
||||||
torch.ops._C.static_scaled_fp8_quant = dummyFusionOp(
|
torch.ops._C.static_scaled_fp8_quant = dummyFusionOp(
|
||||||
|
|||||||
@@ -18,7 +18,6 @@
|
|||||||
from typing import Callable, Optional, Union
|
from typing import Callable, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch_npu
|
|
||||||
from vllm.spec_decode.metrics import (AsyncMetricsCollector,
|
from vllm.spec_decode.metrics import (AsyncMetricsCollector,
|
||||||
SpecDecodeWorkerMetrics)
|
SpecDecodeWorkerMetrics)
|
||||||
|
|
||||||
@@ -36,7 +35,7 @@ def init_tensors(self,
|
|||||||
if isinstance(device_type, torch.device):
|
if isinstance(device_type, torch.device):
|
||||||
device_type = device_type.type
|
device_type = device_type.type
|
||||||
if device_type == 'npu':
|
if device_type == 'npu':
|
||||||
self._copy_stream = torch_npu.npu.Stream()
|
self._copy_stream = torch.npu.Stream()
|
||||||
|
|
||||||
|
|
||||||
def maybe_collect_rejsample_metrics(
|
def maybe_collect_rejsample_metrics(
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ import os
|
|||||||
from typing import TYPE_CHECKING, Optional, Tuple
|
from typing import TYPE_CHECKING, Optional, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch_npu # noqa: F401
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
from vllm.platforms import Platform, PlatformEnum
|
from vllm.platforms import Platform, PlatformEnum
|
||||||
@@ -244,7 +243,6 @@ class NPUPlatform(Platform):
|
|||||||
timeout) -> None:
|
timeout) -> None:
|
||||||
from torch.distributed import ProcessGroup, is_hccl_available
|
from torch.distributed import ProcessGroup, is_hccl_available
|
||||||
assert is_hccl_available()
|
assert is_hccl_available()
|
||||||
import torch_npu # noqa
|
|
||||||
from torch_npu._C._distributed_c10d import ProcessGroupHCCL
|
from torch_npu._C._distributed_c10d import ProcessGroupHCCL
|
||||||
backend_options = ProcessGroupHCCL.Options()
|
backend_options = ProcessGroupHCCL.Options()
|
||||||
backend_options._timeout = timeout
|
backend_options._timeout = timeout
|
||||||
|
|||||||
@@ -19,7 +19,6 @@ from types import MappingProxyType
|
|||||||
from typing import Any, Callable, Dict, List, Mapping, Optional
|
from typing import Any, Callable, Dict, List, Mapping, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch_npu # noqa: F401
|
|
||||||
from vllm.distributed import get_tensor_model_parallel_rank
|
from vllm.distributed import get_tensor_model_parallel_rank
|
||||||
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
|
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
|
||||||
FusedMoeWeightScaleSupported)
|
FusedMoeWeightScaleSupported)
|
||||||
|
|||||||
@@ -17,7 +17,6 @@
|
|||||||
# Adapted from vllm-project/vllm/vllm/worker/worker.py
|
# Adapted from vllm-project/vllm/vllm/worker/worker.py
|
||||||
#
|
#
|
||||||
import torch
|
import torch
|
||||||
import torch_npu # noqa: F401
|
|
||||||
from packaging.version import InvalidVersion, Version
|
from packaging.version import InvalidVersion, Version
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
|
|
||||||
|
|||||||
@@ -28,7 +28,6 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set,
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import torch_npu
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.attention import AttentionMetadata, get_attn_backend
|
from vllm.attention import AttentionMetadata, get_attn_backend
|
||||||
from vllm.attention.backends.utils import CommonAttentionState
|
from vllm.attention.backends.utils import CommonAttentionState
|
||||||
@@ -1145,7 +1144,7 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]):
|
|||||||
device=self.device)
|
device=self.device)
|
||||||
|
|
||||||
self.execute_model(model_input, kv_caches, intermediate_tensors)
|
self.execute_model(model_input, kv_caches, intermediate_tensors)
|
||||||
torch_npu.npu.synchronize()
|
torch.npu.synchronize()
|
||||||
return
|
return
|
||||||
|
|
||||||
def remove_all_loras(self):
|
def remove_all_loras(self):
|
||||||
@@ -1357,8 +1356,8 @@ class NPUModelRunner(NPUModelRunnerBase[ModelInputForNPUWithSamplingMetadata]):
|
|||||||
|
|
||||||
if (self.observability_config is not None
|
if (self.observability_config is not None
|
||||||
and self.observability_config.collect_model_forward_time):
|
and self.observability_config.collect_model_forward_time):
|
||||||
model_forward_start = torch_npu.npu.Event(enable_timing=True)
|
model_forward_start = torch.npu.Event(enable_timing=True)
|
||||||
model_forward_end = torch_npu.npu.Event(enable_timing=True)
|
model_forward_end = torch.npu.Event(enable_timing=True)
|
||||||
model_forward_start.record()
|
model_forward_start.record()
|
||||||
|
|
||||||
if not bypass_model_exec:
|
if not bypass_model_exec:
|
||||||
|
|||||||
@@ -134,9 +134,8 @@ class NPUPoolingModelRunner(
|
|||||||
} if self.has_inner_state else {}
|
} if self.has_inner_state else {}
|
||||||
if (self.observability_config is not None
|
if (self.observability_config is not None
|
||||||
and self.observability_config.collect_model_forward_time):
|
and self.observability_config.collect_model_forward_time):
|
||||||
import torch_npu
|
model_forward_start = torch.npu.Event(enable_timing=True)
|
||||||
model_forward_start = torch_npu.npu.Event(enable_timing=True)
|
model_forward_end = torch.npu.Event(enable_timing=True)
|
||||||
model_forward_end = torch_npu.npu.Event(enable_timing=True)
|
|
||||||
model_forward_start.record()
|
model_forward_start.record()
|
||||||
|
|
||||||
cross_enc_kwargs = {}
|
cross_enc_kwargs = {}
|
||||||
|
|||||||
Reference in New Issue
Block a user