[BugFix] Require kv producer for layer sharding (#8563)

### What this PR does / why we need it?

This PR introduces stricter validation of the Ascend
`additional_config.layer_sharding` option on the 0.18 release branch, so
that it is only accepted on PD-disaggregated P nodes configured with
`kv_role="kv_producer"`.


### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

E2E test

---------

Signed-off-by: chenchuw886 <chenchuw@huawei.com>
Co-authored-by: chenchuw886 <chenchuw@huawei.com>
This commit is contained in:
Frank Chen
2026-04-23 16:06:53 +08:00
committed by GitHub
parent 4a254ba59a
commit a4ba82e138
4 changed files with 9 additions and 7 deletions

View File

@@ -26,6 +26,7 @@ from unittest.mock import patch
import pytest
from vllm import SamplingParams
from vllm.config import KVTransferConfig
from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
from tests.e2e.model_utils import check_outputs_equal
@@ -180,6 +181,7 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
enable_expert_parallel=True,
enforce_eager=True, # TODO(Levi-JQ): support graph mode for fc2 in Qwen
additional_config={"layer_sharding": ["o_proj"]},
kv_transfer_config=KVTransferConfig(kv_role="kv_producer"),
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -261,7 +263,6 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
max_model_len=163840,
compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
additional_config={"layer_sharding": ["q_b_proj", "o_proj"]},
reasoning_parser="deepseek_v3",
tokenizer_mode="deepseek_v32",
gpu_memory_utilization=0.8,
@@ -290,7 +291,7 @@ def test_deepseek3_2_w8a8c8_pruning_mtp_tp2_ep():
max_model_len=163840,
compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
additional_config={"layer_sharding": ["q_b_proj", "o_proj"], "enable_sparse_c8": True},
additional_config={"enable_sparse_c8": True},
reasoning_parser="deepseek_v3",
tokenizer_mode="deepseek_v32",
gpu_memory_utilization=0.8,

View File

@@ -689,12 +689,13 @@ class TestNPUPlatform(TestBase):
self.assertEqual(vllm_config.cache_config.block_size, 512)
def test_validate_layer_sharding_config_accepts_single_node(self):
def test_validate_layer_sharding_config_rejects_missing_kv_transfer_config(self):
vllm_config = TestNPUPlatform.mock_vllm_config()
vllm_config.additional_config = {"layer_sharding": ["q_b_proj", "o_proj"]}
vllm_config.kv_transfer_config = None
self.platform._validate_layer_sharding_config(vllm_config)
with pytest.raises(ValueError, match="layer_sharding is only supported on P nodes"):
self.platform._validate_layer_sharding_config(vllm_config)
def test_validate_layer_sharding_config_accepts_kv_producer(self):
vllm_config = TestNPUPlatform.mock_vllm_config()