From a4ba82e1380e3d2bda0cb6cb61a27a65b07727f9 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Thu, 23 Apr 2026 16:06:53 +0800 Subject: [PATCH] [BugFix] Require kv producer for layer sharding (#8563) ### What this PR does / why we need it? This PR introduces stricter Ascend `additional_config.layer_sharding` validation to the 0.18 release branch so it is only accepted on PD-disaggregated P nodes with `kv_role="kv_producer"`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? E2E test --------- Signed-off-by: chenchuw886 Co-authored-by: chenchuw886 --- docs/source/developer_guide/Design_Documents/cpu_binding.md | 4 ++-- .../multicard/2-cards/test_offline_inference_distributed.py | 5 +++-- tests/ut/test_platform.py | 5 +++-- vllm_ascend/platform.py | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/source/developer_guide/Design_Documents/cpu_binding.md b/docs/source/developer_guide/Design_Documents/cpu_binding.md index 4fdff3e7..5aed6f35 100644 --- a/docs/source/developer_guide/Design_Documents/cpu_binding.md +++ b/docs/source/developer_guide/Design_Documents/cpu_binding.md @@ -20,7 +20,7 @@ On multi‑socket ARM systems, the OS scheduler may place vLLM threads on CPUs f | Device type | Default mode | Description | | ----------- | ------------ | ------------ | | A3 (No Affinity) | `global_slice` | Splits the allowed CPU list evenly based on the **total number of global logical NPUs**, ensuring each NPU is assigned a contiguous segment of CPU cores. This prevents CPU core overlap across multiple process groups. | - | A2 / 310P / Others | `topo_affinity` | Allocates CPUs based on NPU topology affinity (`npu‑smi info -t topo`). If multiple NPUs are assigned to a single NUMA node (which may cause bandwidth contention), the CPU allocation extends to adjacent NUMA nodes. | + | A2 / Atlas 300 inference products / Others | `topo_affinity` | Allocates CPUs based on NPU topology affinity (`npu‑smi info -t topo`). 
If multiple NPUs are assigned to a single NUMA node (which may cause bandwidth contention), the CPU allocation extends to adjacent NUMA nodes. | - **Default**: enabled (enable_cpu_binding = true). - **Fallback**: If NPU topo affinity is unavailable, global_slice is used. @@ -156,7 +156,7 @@ With the current `global_slice` strategy, some CPU/NPU layouts cannot avoid cros |2|10-14|`IRQ`: 10-11, `Main`: 12, `ACL`: 13, `Release`: 14| |3|15-19|`IRQ`: 15-16, `Main`: 17, `ACL`: 18, `Release`: 19| -### Example 5: A2/310P topo_affinity with NUMA extension +### Example 5: A2/Atlas 300 inference products topo_affinity with NUMA extension **Inputs**: diff --git a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py index cf54e686..55e2765a 100644 --- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py @@ -26,6 +26,7 @@ from unittest.mock import patch import pytest from vllm import SamplingParams +from vllm.config import KVTransferConfig from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free from tests.e2e.model_utils import check_outputs_equal @@ -180,6 +181,7 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None: enable_expert_parallel=True, enforce_eager=True, # TODO(Levi-JQ): support graph mode for fc2 in Qwen additional_config={"layer_sharding": ["o_proj"]}, + kv_transfer_config=KVTransferConfig(kv_role="kv_producer"), ) as vllm_model: vllm_model.generate(example_prompts, sampling_params) @@ -261,7 +263,6 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep(): max_model_len=163840, compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"}, speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"}, - additional_config={"layer_sharding": ["q_b_proj", "o_proj"]}, reasoning_parser="deepseek_v3", tokenizer_mode="deepseek_v32", 
gpu_memory_utilization=0.8, @@ -290,7 +291,7 @@ def test_deepseek3_2_w8a8c8_pruning_mtp_tp2_ep(): max_model_len=163840, compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"}, speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"}, - additional_config={"layer_sharding": ["q_b_proj", "o_proj"], "enable_sparse_c8": True}, + additional_config={"enable_sparse_c8": True}, reasoning_parser="deepseek_v3", tokenizer_mode="deepseek_v32", gpu_memory_utilization=0.8, diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index d5a8d7cd..a5f73ea9 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -689,12 +689,13 @@ class TestNPUPlatform(TestBase): self.assertEqual(vllm_config.cache_config.block_size, 512) - def test_validate_layer_sharding_config_accepts_single_node(self): + def test_validate_layer_sharding_config_rejects_missing_kv_transfer_config(self): vllm_config = TestNPUPlatform.mock_vllm_config() vllm_config.additional_config = {"layer_sharding": ["q_b_proj", "o_proj"]} vllm_config.kv_transfer_config = None - self.platform._validate_layer_sharding_config(vllm_config) + with pytest.raises(ValueError, match="layer_sharding is only supported on P nodes"): + self.platform._validate_layer_sharding_config(vllm_config) def test_validate_layer_sharding_config_accepts_kv_producer(self): vllm_config = TestNPUPlatform.mock_vllm_config() diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 23df1669..f15f1a0a 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -241,7 +241,7 @@ class NPUPlatform(Platform): return kv_transfer_config = vllm_config.kv_transfer_config - if kv_transfer_config is not None and kv_transfer_config.kv_role != "kv_producer": + if kv_transfer_config is None or kv_transfer_config.kv_role != "kv_producer": raise ValueError( "additional_config.layer_sharding is only supported on P nodes " "(kv_role='kv_producer') when KV 
transfer is enabled."