bugfix: fix initialization error for mooncake in k8s (#2541)
### What this PR does / why we need it?
The details are explained in this issue:
https://github.com/vllm-project/vllm-ascend/issues/2557
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?
Easy to test, because we just need to echo the variable.
- vLLM version: v0.10.1.1
- vLLM main:
6997a25ac6
---------
Signed-off-by: zzy-ContiLearn <1831242919@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: LCAIZJ <leichao139636@163.com>
This commit is contained in:
@@ -32,6 +32,7 @@ export GLOO_SOCKET_IFNAME="xxxxxx"
|
|||||||
export TP_SOCKET_IFNAME="xxxxxx"
|
export TP_SOCKET_IFNAME="xxxxxx"
|
||||||
export HCCL_SOCKET_IFNAME="xxxxxx"
|
export HCCL_SOCKET_IFNAME="xxxxxx"
|
||||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
||||||
|
export PHYSICAL_DEVICES=$(ls /dev/davinci* 2>/dev/null | grep -o '[0-9]\+' | sort -n | paste -sd',' -)
|
||||||
|
|
||||||
vllm serve "/xxxxx/DeepSeek-V2-Lite-Chat" \
|
vllm serve "/xxxxx/DeepSeek-V2-Lite-Chat" \
|
||||||
--host localhost \
|
--host localhost \
|
||||||
@@ -100,6 +101,7 @@ export GLOO_SOCKET_IFNAME="xxxxxx"
|
|||||||
export TP_SOCKET_IFNAME="xxxxxx"
|
export TP_SOCKET_IFNAME="xxxxxx"
|
||||||
export HCCL_SOCKET_IFNAME="xxxxxx"
|
export HCCL_SOCKET_IFNAME="xxxxxx"
|
||||||
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
|
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
|
||||||
|
export PHYSICAL_DEVICES=$(ls /dev/davinci* 2>/dev/null | grep -o '[0-9]\+' | sort -n | paste -sd',' -)
|
||||||
|
|
||||||
vllm serve "/xxxxx/DeepSeek-V2-Lite-Chat" \
|
vllm serve "/xxxxx/DeepSeek-V2-Lite-Chat" \
|
||||||
--host localhost \
|
--host localhost \
|
||||||
|
|||||||
@@ -1094,6 +1094,7 @@ class MockTransferEngine:
|
|||||||
|
|
||||||
class MockEnvsAscend:
|
class MockEnvsAscend:
|
||||||
MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol"
|
MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol"
|
||||||
|
PHYSICAL_DEVICES = "10,11"
|
||||||
|
|
||||||
|
|
||||||
def mock_get_tensor_model_parallel_rank():
|
def mock_get_tensor_model_parallel_rank():
|
||||||
@@ -1122,7 +1123,7 @@ class TestMooncakeConnectorWorker(unittest.TestCase):
|
|||||||
self.mock_transfer_engine.register_memory.return_value = 0
|
self.mock_transfer_engine.register_memory.return_value = 0
|
||||||
|
|
||||||
self.patches = [
|
self.patches = [
|
||||||
patch('os.getenv', return_value="0,1"),
|
patch('os.getenv', return_value="10,11"),
|
||||||
patch('torch.Tensor.size', return_value=(10, 16, 8, 16)),
|
patch('torch.Tensor.size', return_value=(10, 16, 8, 16)),
|
||||||
patch('torch.Tensor.element_size', return_value=4),
|
patch('torch.Tensor.element_size', return_value=4),
|
||||||
patch('torch.Tensor.data_ptr', return_value=0x1000),
|
patch('torch.Tensor.data_ptr', return_value=0x1000),
|
||||||
@@ -1191,6 +1192,12 @@ class TestMooncakeConnectorWorker(unittest.TestCase):
|
|||||||
self.assertTrue(worker.use_mla)
|
self.assertTrue(worker.use_mla)
|
||||||
self.assertEqual(len(worker.block_len), 2)
|
self.assertEqual(len(worker.block_len), 2)
|
||||||
|
|
||||||
|
def test_device_id_selection_with_physical_devices(self):
|
||||||
|
# Test with physical devices set
|
||||||
|
worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id)
|
||||||
|
# Default tp_rank is 0, so device_id should be 10
|
||||||
|
self.assertEqual(worker.device_id, 10)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
import contextlib
|
import contextlib
|
||||||
import hashlib
|
import hashlib
|
||||||
import math
|
import math
|
||||||
import os
|
|
||||||
import queue
|
import queue
|
||||||
import random
|
import random
|
||||||
import struct
|
import struct
|
||||||
@@ -29,6 +28,8 @@ from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
|
|||||||
from vllm.v1.core.sched.output import SchedulerOutput
|
from vllm.v1.core.sched.output import SchedulerOutput
|
||||||
from vllm.v1.request import RequestStatus
|
from vllm.v1.request import RequestStatus
|
||||||
|
|
||||||
|
import vllm_ascend.envs as envs_ascend
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.attention.backends.abstract import AttentionMetadata
|
from vllm.attention.backends.abstract import AttentionMetadata
|
||||||
from vllm.forward_context import ForwardContext
|
from vllm.forward_context import ForwardContext
|
||||||
@@ -758,13 +759,21 @@ class MooncakeConnectorWorker:
|
|||||||
# get tp device id
|
# get tp device id
|
||||||
# TODO(kw): https://github.com/vllm-project/vllm-ascend/pull/940
|
# TODO(kw): https://github.com/vllm-project/vllm-ascend/pull/940
|
||||||
# introducing some changes
|
# introducing some changes
|
||||||
device_ids_str = os.getenv("ASCEND_RT_VISIBLE_DEVICES", None)
|
device_ids_str = envs_ascend.PHYSICAL_DEVICES
|
||||||
if device_ids_str is None:
|
if device_ids_str is None:
|
||||||
device_ids = list(
|
device_ids = list(
|
||||||
range(self.dp_rank * self.tp_size,
|
range(self.dp_rank * self.tp_size,
|
||||||
(self.dp_rank + 1) * self.tp_size))
|
(self.dp_rank + 1) * self.tp_size))
|
||||||
else:
|
else:
|
||||||
device_ids = list(map(int, device_ids_str.split(',')))
|
device_ids = list(map(int, device_ids_str.split(',')))
|
||||||
|
start_index = self.dp_rank * self.tp_size
|
||||||
|
end_index = start_index + self.tp_size
|
||||||
|
if len(device_ids) < end_index:
|
||||||
|
raise ValueError(
|
||||||
|
f"Not enough physical devices available for DP rank {self.dp_rank}. "
|
||||||
|
f"Expected at least {end_index} devices, but found {len(device_ids)} "
|
||||||
|
"in PHYSICAL_DEVICES.")
|
||||||
|
device_ids = device_ids[start_index:end_index]
|
||||||
assert len(device_ids) > self.tp_rank # type: ignore
|
assert len(device_ids) > self.tp_rank # type: ignore
|
||||||
self.device_id = device_ids[self.tp_rank] # type: ignore
|
self.device_id = device_ids[self.tp_rank] # type: ignore
|
||||||
|
|
||||||
|
|||||||
@@ -135,6 +135,10 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
|||||||
# this feature in eager mode will get better performance.
|
# this feature in eager mode will get better performance.
|
||||||
"VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
|
"VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
|
||||||
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))),
|
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))),
|
||||||
|
# Determine the number of physical devices in a non-full-use scenario
|
||||||
|
# caused by the initialization of the Mooncake connector.
|
||||||
|
"PHYSICAL_DEVICES":
|
||||||
|
lambda: os.getenv("PHYSICAL_DEVICES", None),
|
||||||
}
|
}
|
||||||
|
|
||||||
# end-env-vars-definition
|
# end-env-vars-definition
|
||||||
|
|||||||
Reference in New Issue
Block a user