[BugFix] Require kv producer for layer sharding (#8563)
### What this PR does / why we need it?
This PR introduces stricter validation of Ascend `additional_config.layer_sharding` on the 0.18 release branch, so that it is only accepted on PD-disaggregated P nodes with `kv_role="kv_producer"`.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
E2E test

---------

Signed-off-by: chenchuw886 <chenchuw@huawei.com>
Co-authored-by: chenchuw886 <chenchuw@huawei.com>
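As context for the change, here is a minimal sketch of a P-node configuration that the stricter check accepts; the model name and sharded layer list are illustrative, not taken from this PR:

```python
from vllm import LLM
from vllm.config import KVTransferConfig

# Hypothetical PD-disaggregated producer (P) node: layer_sharding is now
# only accepted together with kv_role="kv_producer".
llm = LLM(
    model="Qwen/Qwen3-30B-A3B",  # illustrative model choice
    additional_config={"layer_sharding": ["o_proj"]},
    kv_transfer_config=KVTransferConfig(kv_role="kv_producer"),
)
# Omitting kv_transfer_config, or using kv_role="kv_consumer",
# now raises a ValueError during platform validation.
```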
```diff
@@ -20,7 +20,7 @@ On multi‑socket ARM systems, the OS scheduler may place vLLM threads on CPUs f
 | Device type | Default mode | Description |
 | ----------- | ------------ | ------------ |
 | A3 (No Affinity) | `global_slice` | Splits the allowed CPU list evenly based on the **total number of global logical NPUs**, ensuring each NPU is assigned a contiguous segment of CPU cores. This prevents CPU core overlap across multiple process groups. |
-| A2 / 310P / Others | `topo_affinity` | Allocates CPUs based on NPU topology affinity (`npu‑smi info -t topo`). If multiple NPUs are assigned to a single NUMA node (which may cause bandwidth contention), the CPU allocation extends to adjacent NUMA nodes. |
+| A2 / Atlas 300 inference products / Others | `topo_affinity` | Allocates CPUs based on NPU topology affinity (`npu‑smi info -t topo`). If multiple NPUs are assigned to a single NUMA node (which may cause bandwidth contention), the CPU allocation extends to adjacent NUMA nodes. |
 
 - **Default**: enabled (enable_cpu_binding = true).
 - **Fallback**: If NPU topo affinity is unavailable, global_slice is used.
```
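For intuition, here is a rough sketch of the `global_slice` behaviour the table describes: an even, contiguous split of the allowed CPU list across all global logical NPUs. This is an illustration only, not the vllm-ascend implementation:

```python
def global_slice(allowed_cpus: list[int], num_npus: int) -> list[list[int]]:
    """Split the allowed CPU list into contiguous, non-overlapping
    per-NPU segments (illustrative sketch only)."""
    per_npu = len(allowed_cpus) // num_npus
    return [allowed_cpus[i * per_npu:(i + 1) * per_npu]
            for i in range(num_npus)]

# 20 allowed cores across 4 NPUs: NPU 2 gets cores 10-14 and NPU 3 gets
# cores 15-19, matching the example rows in the next hunk of this doc.
print(global_slice(list(range(20)), 4))
```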
```diff
@@ -156,7 +156,7 @@ With the current `global_slice` strategy, some CPU/NPU layouts cannot avoid cros
 |2|10-14|`IRQ`: 10-11, `Main`: 12, `ACL`: 13, `Release`: 14|
 |3|15-19|`IRQ`: 15-16, `Main`: 17, `ACL`: 18, `Release`: 19|
 
-### Example 5: A2/310P topo_affinity with NUMA extension
+### Example 5: A2/Atlas 300 inference products topo_affinity with NUMA extension
 
 **Inputs**:
 
```
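The per-NPU role split in the example rows above follows a simple pattern: the last three cores of each slice serve the `Main`, `ACL`, and `Release` threads, and the remaining cores handle `IRQ`. A sketch of that pattern, inferred from the two table rows rather than taken from the actual code:

```python
def assign_roles(slice_cores: list[int]) -> dict[str, list[int]]:
    """Assign thread roles within one NPU's contiguous core slice,
    mirroring the example rows above (illustrative only)."""
    n_irq = len(slice_cores) - 3  # reserve 3 cores for the named threads
    return {
        "IRQ": slice_cores[:n_irq],
        "Main": [slice_cores[n_irq]],
        "ACL": [slice_cores[n_irq + 1]],
        "Release": [slice_cores[n_irq + 2]],
    }

print(assign_roles([10, 11, 12, 13, 14]))
# {'IRQ': [10, 11], 'Main': [12], 'ACL': [13], 'Release': [14]}
```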
```diff
@@ -26,6 +26,7 @@ from unittest.mock import patch
 
 import pytest
 from vllm import SamplingParams
+from vllm.config import KVTransferConfig
 
 from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
 from tests.e2e.model_utils import check_outputs_equal
```
```diff
@@ -180,6 +181,7 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
             enable_expert_parallel=True,
             enforce_eager=True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
             additional_config={"layer_sharding": ["o_proj"]},
+            kv_transfer_config=KVTransferConfig(kv_role="kv_producer"),
     ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
 
```
```diff
@@ -261,7 +263,6 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
             max_model_len=163840,
             compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
             speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
-            additional_config={"layer_sharding": ["q_b_proj", "o_proj"]},
             reasoning_parser="deepseek_v3",
             tokenizer_mode="deepseek_v32",
             gpu_memory_utilization=0.8,
```
```diff
@@ -290,7 +291,7 @@ def test_deepseek3_2_w8a8c8_pruning_mtp_tp2_ep():
             max_model_len=163840,
             compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
             speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
-            additional_config={"layer_sharding": ["q_b_proj", "o_proj"], "enable_sparse_c8": True},
+            additional_config={"enable_sparse_c8": True},
             reasoning_parser="deepseek_v3",
             tokenizer_mode="deepseek_v32",
             gpu_memory_utilization=0.8,
```
```diff
@@ -689,12 +689,13 @@ class TestNPUPlatform(TestBase):
 
         self.assertEqual(vllm_config.cache_config.block_size, 512)
 
-    def test_validate_layer_sharding_config_accepts_single_node(self):
+    def test_validate_layer_sharding_config_rejects_missing_kv_transfer_config(self):
         vllm_config = TestNPUPlatform.mock_vllm_config()
         vllm_config.additional_config = {"layer_sharding": ["q_b_proj", "o_proj"]}
         vllm_config.kv_transfer_config = None
 
-        self.platform._validate_layer_sharding_config(vllm_config)
+        with pytest.raises(ValueError, match="layer_sharding is only supported on P nodes"):
+            self.platform._validate_layer_sharding_config(vllm_config)
 
     def test_validate_layer_sharding_config_accepts_kv_producer(self):
         vllm_config = TestNPUPlatform.mock_vllm_config()
```
```diff
@@ -241,7 +241,7 @@ class NPUPlatform(Platform):
             return
 
         kv_transfer_config = vllm_config.kv_transfer_config
-        if kv_transfer_config is not None and kv_transfer_config.kv_role != "kv_producer":
+        if kv_transfer_config is None or kv_transfer_config.kv_role != "kv_producer":
             raise ValueError(
                 "additional_config.layer_sharding is only supported on P nodes "
                 "(kv_role='kv_producer') when KV transfer is enabled."
```
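The core of the fix is the flipped predicate: the old `is not None and` form skipped validation entirely when no KV transfer config was present, while the new `is None or` form rejects that case as well. A standalone sketch of the two conditions, simplified out of the platform class for illustration:

```python
class _Cfg:
    """Stand-in for vllm.config.KVTransferConfig in this sketch."""
    def __init__(self, kv_role: str) -> None:
        self.kv_role = kv_role

def should_reject_old(cfg) -> bool:
    # Pre-fix predicate: a missing config slipped through (no error raised).
    return cfg is not None and cfg.kv_role != "kv_producer"

def should_reject_new(cfg) -> bool:
    # Post-fix predicate: a missing config is rejected too.
    return cfg is None or cfg.kv_role != "kv_producer"

assert not should_reject_old(None)                 # the bug: silently accepted
assert should_reject_new(None)                     # fixed: ValueError raised upstream
assert not should_reject_new(_Cfg("kv_producer"))  # producers still pass
assert should_reject_new(_Cfg("kv_consumer"))      # consumers rejected
```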