[bugfix] align max_num_batched_tokens with tp*pcp when using FLASHCOMM1 (#6000)

### What this PR does / why we need it?

Align `max_num_batched_tokens` with `tp*pcp` when using FLASHCOMM1 to avoid an assertion error in `NPUModelRunner._dummy_run`.

- vLLM version: v0.13.0
- vLLM main: 2c24bc6996

---------

Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
2026-01-23 14:19:49 +08:00
parent f8d03d21f1
commit 749e24f81e
4 changed files with 33 additions and 3 deletions
--- a/tests/ut/attention/test_mla_cp.py
+++ b/tests/ut/attention/test_mla_cp.py
@@ -176,12 +176,16 @@ class TestAscendMLAImpl(TestBase):
        vllm_config = MagicMock()
        speculative_config = MagicMock()
        model_config = MagicMock()
+        parallel_config = MagicMock()
+        parallel_config.prefill_context_parallel_size = 1
+        parallel_config.tensor_parallel_size = 2
        speculative_config.num_speculative_tokens = 4
        vllm_config.speculative_config = speculative_config
        model_config.dtype = torch.float16
        vllm_config.model_config = model_config
        get_current_vllm_config.return_value = vllm_config
        vllm_config.additional_config = {"refresh": True}
+        vllm_config.parallel_config = parallel_config
        init_ascend_config(vllm_config)

        num_heads = 256
--- a/tests/ut/attention/test_mla_v1.py
+++ b/tests/ut/attention/test_mla_v1.py
@@ -757,12 +757,15 @@ class TestAscendMLAImpl(TestBase):
        vllm_config = MagicMock()
        speculative_config = MagicMock()
        model_config = MagicMock()
+        parallel_config = MagicMock()
+        parallel_config.prefill_context_parallel_size = 1
        speculative_config.num_speculative_tokens = 4
        vllm_config.speculative_config = speculative_config
        model_config.dtype = torch.float16
        vllm_config.model_config = model_config
        get_current_vllm_config.return_value = vllm_config
        vllm_config.additional_config = {"refresh": True}
+        vllm_config.parallel_config = parallel_config
        init_ascend_config(vllm_config)

        num_heads = 256
--- a/tests/ut/attention/test_sfa_v1.py
+++ b/tests/ut/attention/test_sfa_v1.py
@@ -5,6 +5,7 @@ import torch

 from tests.ut.base import TestBase
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
+from vllm.distributed.parallel_state import GroupCoordinator

 if 'torch_npu._inductor' not in sys.modules:
    sys.modules['torch_npu._inductor'] = MagicMock()
@@ -81,7 +82,13 @@ class TestAscendSFAMetadata(TestBase):

 class TestAscendSFAMetadataBuilder(TestBase):

-    def setUp(self):
+    @patch('vllm.distributed.parallel_state._TP',
+           new_callable=lambda: MagicMock(spec=GroupCoordinator))
+    def setUp(self, mock_tp):
+        mock_tp.world_size = 2
+        mock_tp.rank_in_group = MagicMock()
+        mock_tp.device_group = MagicMock()
+
        self.mock_cfg = MagicMock()

        self.mock_cfg.parallel_config = MagicMock()