[Fix] Fix resource limit error when applying speculative decoding together with aclgraph (#2472)

### What this PR does / why we need it? When speculative decoding and aclgraph are both enabled and cudagraph_capture_sizes is left at its default value, an error is reported that the stream resources are insufficient. This PR fixes that error. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: 9c99e4871f Signed-off-by: withHades <244036962@qq.com>
2025-09-04 11:50:43 +08:00
parent 0c0789be74
commit 7d47d8f4f6
2 changed files with 24 additions and 4 deletions
--- a/tests/ut/test_utils.py
+++ b/tests/ut/test_utils.py
@@ -261,6 +261,20 @@ class TestUtils(TestBase):
        self.assertEqual(
            147,
            len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
+
+        test_vllm_config.speculative_config = mock.MagicMock()
+        test_vllm_config.speculative_config.draft_model_config = mock.MagicMock(
+        )
+        test_vllm_config.speculative_config.draft_model_config.hf_config = mock.MagicMock(
+        )
+        test_vllm_config.speculative_config.draft_model_config.hf_config.num_hidden_layers = 2
+        os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
+        utils.update_aclgraph_sizes(test_vllm_config)
+        del os.environ['HCCL_OP_EXPANSION_MODE']
+        self.assertEqual(
+            120,
+            len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
+
        # max_num_batch_sizes >= len(original_sizes)
        test_compilation_config = CompilationConfig(
            cudagraph_capture_sizes=[1, 2, 3])