### What this PR does / why we need it?

In the case where `backend = ray`, only the main process completes the `forward_oot` call, while the other worker processes fall back to `forward_native`. (This bug most likely also exists when `backend = mp`.)

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

**Environment:**
- CANN: 8.0.0
- PyTorch: 2.5.1
- Torch: 2.5.1rc1
- Python: 3.10
- vllm: branch main
- vllm-ascend: branch main

The current implementation avoids the Ray worker initialization issue addressed in [PR #92](https://github.com/vllm-project/vllm-ascend/pull/92). A debug log line was then added to the `forward_oot` call (a sketch of this instrumentation follows the results below).

**Script:**

```bash
python examples/offline_distributed_inference_npu.py
```

**Result:**

```bash
(NPURayWorkerWrapper pid=3984223) forward_oot run. #############################################
(NPURayWorkerWrapper pid=3984223) forward_oot run. #############################################
(NPURayWorkerWrapper pid=3984223) forward_oot run. #############################################
(NPURayWorkerWrapper pid=3984223) forward_oot run. #############################################
(NPURayWorkerWrapper pid=3984223) forward_oot run. #############################################
forward_oot run. #############################################
forward_oot run. #############################################
Processed prompts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00, 1.96s/it, est. speed input: 2.80 toks/s, output: 51.00 toks/s]
Prompt: 'Hello, my name is', Generated text: ' Alex and I am a 16 year old male. I have been diagnosed with a rare genetic disorder called X-linked recessive. I have been told that I will not be able to have children. I have been told that I will not be able to have children because of the X-linked recessive disorder. I have been told that I will not be able to have children because of the X-linked recessive disorder. I have been told that I will not be able to have children because of'
Prompt: 'The president of the United States is', Generated text: ' Statesman. He is the leader of the country. He is the one who makes the decisions. He is the one who makes the laws. He is the one who makes the rules. He is the one who makes the country strong. He is the one who makes the country happy. He is the one who makes the country safe. He is the one who makes the country free. He is the one who makes the country beautiful. He is the one who makes the country great. He is'
Prompt: 'The capital of France is', Generated text: ' the city of Paris. It is the largest city in France and the second largest city in Europe. It is located in the center of the country, in the south of the country. It is situated on the banks of the Seine River, which flows through the city. The city is surrounded by the Alps and the Pyrenees mountains. The city is also surrounded by the Mediterranean Sea. The city is known for its beautiful architecture, its museums, its parks, and its food. Paris is'
Prompt: 'The future of AI is', Generated text: ' following the path of the internet, and the internet is following the path of the web. The web is a network of interconnected web pages, and the internet is a network of interconnected computers. The web is a network of interconnected computers, and the internet is a network of interconnected computers. The web is a network of interconnected computers, and the internet is a network of interconnected computers. The web is a network of interconnected computers, and the internet is a network of interconnected computers. The web is a network'
```

---------

Signed-off-by: Chenguang Li <757486878@qq.com>
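For reference, here is a minimal sketch of the instrumentation behind the `forward_oot run.` lines above, assuming a `CustomOp`-style module with a native and an out-of-tree forward path. `DemoOp` and both method bodies are hypothetical stand-ins for illustration, not the actual vllm-ascend op:

```python
import torch
from torch import nn


class DemoOp(nn.Module):
    """Hypothetical stand-in for a vLLM CustomOp with an out-of-tree path."""

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        # Portable reference implementation used when no OOT kernel applies.
        return torch.nn.functional.silu(x)

    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
        # Debug print that produced the "forward_oot run." log entries; Ray
        # adds the "(NPURayWorkerWrapper pid=...)" prefix for worker stdout.
        print("forward_oot run. #############################################")
        # Placeholder: a real override would call an Ascend NPU kernel here.
        return self.forward_native(x)
```

With the fix, every worker process prints this line, not just the driver. The NPU platform module accompanying this change is reproduced below.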
```python
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from typing import TYPE_CHECKING, Optional, Tuple

import torch

try:
    import torch_npu  # noqa: F401
except ImportError:
    print("Failed to import torch_npu.")

from vllm.config import VllmConfig
from vllm.platforms import Platform, PlatformEnum

if TYPE_CHECKING:
    from vllm.utils import FlexibleArgumentParser
else:
    FlexibleArgumentParser = None
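
# Note added for clarity (not in the original file): like Ray's
# RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES escape hatch, this stops Ray
# from rewriting ASCEND_RT_VISIBLE_DEVICES in its worker processes, so
# device assignment stays under vLLM's control.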
os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1"


def _device_id_to_physical_device_id(device_id: int) -> int:
    if "ASCEND_RT_VISIBLE_DEVICES" in os.environ:
        device_ids = os.environ["ASCEND_RT_VISIBLE_DEVICES"].split(",")
        if device_ids == [""]:
            raise RuntimeError("ASCEND_RT_VISIBLE_DEVICES is set to empty "
                               "string, which means Ascend NPU support is "
                               "disabled.")
        physical_device_id = device_ids[device_id]
        return int(physical_device_id)
    else:
        return device_id
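
# Usage sketch added for illustration (not in the original file):
#   os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "4,5,6,7"
#   _device_id_to_physical_device_id(1)  # -> 5: logical id 1 is physical 5
# When the variable is unset, logical and physical device ids coincide.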


class NPUPlatform(Platform):

    _enum = PlatformEnum.OOT
    device_name: str = "npu"
    device_type: str = "npu"
    simple_compile_backend: str = "npu"
    ray_device_key: str = "NPU"
    device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
    dispatch_key: str = "PrivateUse1"

    supported_quantization: list[str] = ["ascend"]

    @classmethod
    def pre_register_and_update(cls,
                                parser: Optional[FlexibleArgumentParser] = None
                                ) -> None:
        from vllm_ascend.quantization.quant_config import \
            AscendQuantConfig  # noqa: F401

    @classmethod
    def get_device_capability(cls, device_id: int = 0):
        return None

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        physical_device_id = _device_id_to_physical_device_id(device_id)
        return torch.npu.get_device_name(physical_device_id)

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        return True

    @classmethod
    def inference_mode(cls):
        return torch.inference_mode()

    @classmethod
    def set_device(cls, device: torch.device):
        torch.npu.set_device(device)

    @classmethod
    def empty_cache(cls):
        torch.npu.empty_cache()

    @classmethod
    def synchronize(cls):
        torch.npu.synchronize()

    @classmethod
    def mem_get_info(cls) -> Tuple[int, int]:
        return torch.npu.mem_get_info()

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        parallel_config = vllm_config.parallel_config
        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm_ascend.worker.NPUWorker"
        cache_config = vllm_config.cache_config
        if cache_config and cache_config.block_size is None:
            # TODO: Setting block_size to 128 leads to an unexpected accuracy
            # issue in the MLA case; set it back to 128 once that is fixed.
            cache_config.block_size = 16

    @classmethod
    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                             kv_cache_dtype, block_size, use_v1, use_mla):
        if use_mla:
            return "vllm_ascend.attention.AscendMLAAttentionBackend"
        return "vllm_ascend.attention.AscendAttentionBackend"

    @classmethod
    def get_current_memory_usage(cls,
                                 device: Optional[torch.types.Device] = None
                                 ) -> float:
        torch.npu.reset_peak_memory_stats(device)
        return torch.npu.max_memory_allocated(device)

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        return "vllm_ascend.communicator.NPUCommunicator"
```
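For context, an out-of-tree platform like this is discovered through vLLM's plugin mechanism rather than imported directly. Below is a minimal sketch of the registration side, assuming the `vllm.platform_plugins` entry-point group; the file layout and names are illustrative, not a definitive description of this repository:

```python
# vllm_ascend/__init__.py (sketch)
def register() -> str:
    """Entry-point hook: vLLM imports the returned Platform subclass path
    when the plugin is active."""
    return "vllm_ascend.platform.NPUPlatform"


# Declared in the package metadata, e.g. in setup.py:
#   entry_points={
#       "vllm.platform_plugins": ["ascend = vllm_ascend:register"],
#   }
```

Once registered, vLLM treats the platform as out-of-tree (`PlatformEnum.OOT` above); that is the dispatch path that routes custom ops to `forward_oot`, which this PR fixes for Ray worker processes.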