From 07d44ade194b018ae2cc172482d55cb746c5fd0e Mon Sep 17 00:00:00 2001 From: zhiyuanzhang <100197139+zzy-ContiLearn@users.noreply.github.com> Date: Wed, 3 Sep 2025 22:25:08 +0800 Subject: [PATCH] bugfix: fix initialization error for mooncake in k8s (#2541) ### What this PR does / why we need it? The details are described in this issue: https://github.com/vllm-project/vllm-ascend/issues/2557 ### Does this PR introduce _any_ user-facing change? NO ### How was this patch tested? Easy to test because we just need to echo the variable - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/6997a25ac65ed6cc3c2be6d09ca45f633a345f63 --------- Signed-off-by: zzy-ContiLearn <1831242919@qq.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: LCAIZJ --- .../mooncake_connector_deployment_guide.md | 2 ++ tests/ut/kv_connector/test_mooncake_connector.py | 9 ++++++++- vllm_ascend/distributed/mooncake_connector.py | 13 +++++++++++-- vllm_ascend/envs.py | 4 ++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md b/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md index 614eca5..3e916cc 100644 --- a/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md +++ b/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md @@ -32,6 +32,7 @@ export GLOO_SOCKET_IFNAME="xxxxxx" export TP_SOCKET_IFNAME="xxxxxx" export HCCL_SOCKET_IFNAME="xxxxxx" export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 +export PHYSICAL_DEVICES=$(ls /dev/davinci* 2>/dev/null | grep -o '[0-9]\+' | sort -n | paste -sd',' -) vllm serve "/xxxxx/DeepSeek-V2-Lite-Chat" \ --host localhost \ @@ -100,6 +101,7 @@ export GLOO_SOCKET_IFNAME="xxxxxx" export TP_SOCKET_IFNAME="xxxxxx" export HCCL_SOCKET_IFNAME="xxxxxx" export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 +export PHYSICAL_DEVICES=$(ls /dev/davinci* 
2>/dev/null | grep -o '[0-9]\+' | sort -n | paste -sd',' -) vllm serve "/xxxxx/DeepSeek-V2-Lite-Chat" \ --host localhost \ diff --git a/tests/ut/kv_connector/test_mooncake_connector.py b/tests/ut/kv_connector/test_mooncake_connector.py index f6732a0..c7a20e0 100644 --- a/tests/ut/kv_connector/test_mooncake_connector.py +++ b/tests/ut/kv_connector/test_mooncake_connector.py @@ -1094,6 +1094,7 @@ class MockTransferEngine: class MockEnvsAscend: MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol" + PHYSICAL_DEVICES = "10,11" def mock_get_tensor_model_parallel_rank(): @@ -1122,7 +1123,7 @@ class TestMooncakeConnectorWorker(unittest.TestCase): self.mock_transfer_engine.register_memory.return_value = 0 self.patches = [ - patch('os.getenv', return_value="0,1"), + patch('os.getenv', return_value="10,11"), patch('torch.Tensor.size', return_value=(10, 16, 8, 16)), patch('torch.Tensor.element_size', return_value=4), patch('torch.Tensor.data_ptr', return_value=0x1000), @@ -1191,6 +1192,12 @@ class TestMooncakeConnectorWorker(unittest.TestCase): self.assertTrue(worker.use_mla) self.assertEqual(len(worker.block_len), 2) + def test_device_id_selection_with_physical_devices(self): + # Test with physical devices set + worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id) + # Default tp_rank is 0, so device_id should be 10 + self.assertEqual(worker.device_id, 10) + if __name__ == '__main__': unittest.main() diff --git a/vllm_ascend/distributed/mooncake_connector.py b/vllm_ascend/distributed/mooncake_connector.py index e223877..c527db3 100644 --- a/vllm_ascend/distributed/mooncake_connector.py +++ b/vllm_ascend/distributed/mooncake_connector.py @@ -2,7 +2,6 @@ import contextlib import hashlib import math -import os import queue import random import struct @@ -29,6 +28,8 @@ from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus +import vllm_ascend.envs as envs_ascend + 
if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext @@ -758,13 +759,21 @@ class MooncakeConnectorWorker: # get tp device id # TODO(kw): https://github.com/vllm-project/vllm-ascend/pull/940 # introducing some changes - device_ids_str = os.getenv("ASCEND_RT_VISIBLE_DEVICES", None) + device_ids_str = envs_ascend.PHYSICAL_DEVICES if device_ids_str is None: device_ids = list( range(self.dp_rank * self.tp_size, (self.dp_rank + 1) * self.tp_size)) else: device_ids = list(map(int, device_ids_str.split(','))) + start_index = self.dp_rank * self.tp_size + end_index = start_index + self.tp_size + if len(device_ids) < end_index: + raise ValueError( + f"Not enough physical devices available for DP rank {self.dp_rank}. " + f"Expected at least {end_index} devices, but found {len(device_ids)} " + "in PHYSICAL_DEVICES.") + device_ids = device_ids[start_index:end_index] assert len(device_ids) > self.tp_rank # type: ignore self.device_id = device_ids[self.tp_rank] # type: ignore diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 625b65a..04d94a9 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -135,6 +135,10 @@ env_variables: Dict[str, Callable[[], Any]] = { # this feature in eager mode will get better performance. "VLLM_ASCEND_ENABLE_MLP_OPTIMIZE": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))), + # Comma-separated physical device IDs (e.g. "0,1,2,3"), read by the Mooncake + # connector at initialization when not all devices on the node are in use. + "PHYSICAL_DEVICES": + lambda: os.getenv("PHYSICAL_DEVICES", None), } # end-env-vars-definition