Fix the device error when using Ray as the vllm-ascend backend (#884)
1. Remove RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES. 2. Add lazy init for vllm_ascend_C. Signed-off-by: zhuo97 <1103045176@qq.com>
This commit is contained in:
@@ -19,9 +19,6 @@
|
|||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
import vllm_ascend.platform as pf
|
|
||||||
|
|
||||||
pf.CUSTOM_OP_ENABLED = True # set True for custom Ops of Multi-Step.
|
|
||||||
prompts = [
|
prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
"The president of the United States is",
|
"The president of the United States is",
|
||||||
|
|||||||
@@ -10,7 +10,9 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
|
||||||
import vllm_ascend.platform # noqa: F401
|
from vllm_ascend.utils import enable_custom_op
|
||||||
|
|
||||||
|
enable_custom_op()
|
||||||
|
|
||||||
# Only Neox style true scenario is supported for now
|
# Only Neox style true scenario is supported for now
|
||||||
IS_NEOX_STYLE = [True]
|
IS_NEOX_STYLE = [True]
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad
|
|||||||
|
|
||||||
from vllm_ascend.ascend_config import get_ascend_config
|
from vllm_ascend.ascend_config import get_ascend_config
|
||||||
from vllm_ascend.ops.cache import concat_and_cache_mla
|
from vllm_ascend.ops.cache import concat_and_cache_mla
|
||||||
from vllm_ascend.platform import CUSTOM_OP_ENABLED
|
from vllm_ascend.utils import enable_custom_op
|
||||||
from vllm_ascend.worker.model_runner import (
|
from vllm_ascend.worker.model_runner import (
|
||||||
ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
|
ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
|
||||||
|
|
||||||
@@ -462,7 +462,7 @@ class AscendMetadata(AttentionMetadata):
|
|||||||
for i in range(num_queries):
|
for i in range(num_queries):
|
||||||
self.seq_lens[i] += 1
|
self.seq_lens[i] += 1
|
||||||
self.max_decode_seq_len = max(self.seq_lens)
|
self.max_decode_seq_len = max(self.seq_lens)
|
||||||
if CUSTOM_OP_ENABLED:
|
if enable_custom_op():
|
||||||
#advance a step on NPU for existing inputs for a multi-step runner if custom ops is enabled
|
#advance a step on NPU for existing inputs for a multi-step runner if custom ops is enabled
|
||||||
torch.ops._C.advance_step_flashattn_ascendc(
|
torch.ops._C.advance_step_flashattn_ascendc(
|
||||||
num_seqs=num_seqs,
|
num_seqs=num_seqs,
|
||||||
|
|||||||
@@ -22,11 +22,12 @@ import torch
|
|||||||
from vllm.model_executor.layers.rotary_embedding import (
|
from vllm.model_executor.layers.rotary_embedding import (
|
||||||
DeepseekScalingRotaryEmbedding, RotaryEmbedding)
|
DeepseekScalingRotaryEmbedding, RotaryEmbedding)
|
||||||
|
|
||||||
from vllm_ascend.platform import CUSTOM_OP_ENABLED
|
from vllm_ascend.utils import enable_custom_op
|
||||||
|
|
||||||
|
|
||||||
def custom_rotary_embedding_enabled(query, neox_style, head_size):
|
def custom_rotary_embedding_enabled(query, neox_style, head_size):
|
||||||
return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and CUSTOM_OP_ENABLED
|
return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and enable_custom_op(
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def rope_forward_oot(
|
def rope_forward_oot(
|
||||||
|
|||||||
@@ -16,7 +16,6 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
import gc
|
import gc
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from typing import TYPE_CHECKING, Optional, Tuple
|
from typing import TYPE_CHECKING, Optional, Tuple
|
||||||
@@ -32,16 +31,6 @@ import vllm_ascend.envs as ascend_envs
|
|||||||
from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config
|
from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config
|
||||||
from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes
|
from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes
|
||||||
|
|
||||||
CUSTOM_OP_ENABLED = False
|
|
||||||
try:
|
|
||||||
# register custom ops into torch_library here
|
|
||||||
import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401
|
|
||||||
CUSTOM_OP_ENABLED = True
|
|
||||||
except ImportError as e:
|
|
||||||
logging.warning(
|
|
||||||
"Failed to import 'vllm_ascend.vllm_ascend_C': %s. All custom ops will be disabled. ",
|
|
||||||
e)
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import ModelConfig, VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
@@ -50,7 +39,6 @@ else:
|
|||||||
VllmConfig = None
|
VllmConfig = None
|
||||||
FlexibleArgumentParser = None
|
FlexibleArgumentParser = None
|
||||||
|
|
||||||
os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1"
|
|
||||||
os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE
|
os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -54,6 +54,8 @@ MAX_CAPTURE_SIZE = 1920
|
|||||||
|
|
||||||
ASCEND_QUATIZATION_METHOD = "ascend"
|
ASCEND_QUATIZATION_METHOD = "ascend"
|
||||||
|
|
||||||
|
CUSTOM_OP_ENABLED = None
|
||||||
|
|
||||||
|
|
||||||
def try_register_lib(lib_name: str, lib_info: str = ""):
|
def try_register_lib(lib_name: str, lib_info: str = ""):
|
||||||
import importlib
|
import importlib
|
||||||
@@ -68,6 +70,31 @@ def try_register_lib(lib_name: str, lib_info: str = ""):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def enable_custom_op():
|
||||||
|
"""
|
||||||
|
Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component.
|
||||||
|
Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device().
|
||||||
|
"""
|
||||||
|
global CUSTOM_OP_ENABLED
|
||||||
|
|
||||||
|
if CUSTOM_OP_ENABLED is not None:
|
||||||
|
return CUSTOM_OP_ENABLED
|
||||||
|
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
# register custom ops into torch_library here
|
||||||
|
import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401
|
||||||
|
CUSTOM_OP_ENABLED = True
|
||||||
|
|
||||||
|
except ImportError:
|
||||||
|
CUSTOM_OP_ENABLED = False
|
||||||
|
logger.warning(
|
||||||
|
"Warning: Failed to register custom ops, all custom ops will be disabled"
|
||||||
|
)
|
||||||
|
|
||||||
|
return CUSTOM_OP_ENABLED
|
||||||
|
|
||||||
|
|
||||||
def find_hccl_library() -> str:
|
def find_hccl_library() -> str:
|
||||||
"""
|
"""
|
||||||
We either use the library file specified by the `HCCL_SO_PATH`
|
We either use the library file specified by the `HCCL_SO_PATH`
|
||||||
|
|||||||
@@ -117,6 +117,11 @@ class NPUWorker(WorkerBase):
|
|||||||
allocator = CaMemAllocator.get_instance()
|
allocator = CaMemAllocator.get_instance()
|
||||||
allocator.wake_up(tags=tags)
|
allocator.wake_up(tags=tags)
|
||||||
|
|
||||||
|
def initialize_cache(self, num_gpu_blocks: int,
|
||||||
|
num_cpu_blocks: int) -> None:
|
||||||
|
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
||||||
|
self.cache_config.num_cpu_blocks = num_cpu_blocks
|
||||||
|
|
||||||
def init_device(self):
|
def init_device(self):
|
||||||
if self.device_config.device.type == "npu":
|
if self.device_config.device.type == "npu":
|
||||||
self.device = torch.device(f"npu:{self.local_rank_across_dp}")
|
self.device = torch.device(f"npu:{self.local_rank_across_dp}")
|
||||||
|
|||||||
Reference in New Issue
Block a user