[Feature] support aclgraph for model runner v2 (#7110)
### What this PR does / why we need it?
This PR aims to support aclgraph for model runner v2; please see RFC
#5208 for details. The PR contains these modifications:
- adapt to newest commit of vllm main branch.
- provide a unified interface for the extra forward context, shared by
both model runner v1 and model runner v2.
- implement graph mode for main model.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
This commit is contained in:
@@ -386,10 +386,11 @@ class TestEagleProposerDummyRun(TestBase):
|
||||
set_current_vllm_config(None)
|
||||
|
||||
# cpu does not support parallel-group, let alone `sp`
|
||||
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context",
|
||||
**{"return_value.flash_comm_v1_enabled": False})
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
|
||||
def test_dummy_run_basic(self, mock_context, mock_get_context):
|
||||
def test_dummy_run_basic(self, mock_context, mock_get_context, mock_get_context_2):
|
||||
num_tokens = 32
|
||||
with_prefill = False
|
||||
|
||||
@@ -402,10 +403,11 @@ class TestEagleProposerDummyRun(TestBase):
|
||||
self.assertTrue(self.proposer._runnable.call_count == 1)
|
||||
|
||||
# cpu does not support parallel-group, let alone `sp`
|
||||
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context",
|
||||
**{"return_value.flash_comm_v1_enabled": False})
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
|
||||
def test_dummy_run_with_prefill(self, mock_context, mock_get_context):
|
||||
def test_dummy_run_with_prefill(self, mock_context, mock_get_context, mock_get_context_2):
|
||||
mock_context.return_value.__enter__.return_value = None
|
||||
# cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
|
||||
with set_current_vllm_config(self.vllm_config):
|
||||
@@ -413,11 +415,12 @@ class TestEagleProposerDummyRun(TestBase):
|
||||
self.proposer.dummy_run(num_tokens=64, with_prefill=True, num_reqs=4)
|
||||
self.assertTrue(self.proposer._runnable.call_count == 1)
|
||||
|
||||
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.update_full_graph_params")
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context")
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
|
||||
def test_dummy_run_in_graph_capture(self, mock_context, mock_get_context,
|
||||
mock_update_full_graph_params):
|
||||
mock_update_full_graph_params, mock_get_context_2):
|
||||
last_use_cuda_graph = self.proposer.use_cuda_graph
|
||||
mock_return_context = MagicMock()
|
||||
mock_return_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
|
||||
@@ -425,6 +428,7 @@ class TestEagleProposerDummyRun(TestBase):
|
||||
# cpu does not support parallel-group, let alone `sp`
|
||||
mock_return_context.flash_comm_v1_enabled = False
|
||||
mock_get_context.return_value = mock_return_context
|
||||
mock_get_context_2.return_value = mock_return_context
|
||||
self.proposer.use_cuda_graph = True
|
||||
# cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
|
||||
with set_current_vllm_config(self.vllm_config):
|
||||
@@ -435,12 +439,13 @@ class TestEagleProposerDummyRun(TestBase):
|
||||
self.assertTrue(self.proposer._runnable.call_count == 1)
|
||||
mock_update_full_graph_params.assert_not_called()
|
||||
self.proposer.use_cuda_graph = last_use_cuda_graph
|
||||
|
||||
|
||||
@patch('vllm_ascend.ascend_forward_context.get_forward_context')
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.update_full_graph_params")
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context")
|
||||
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
|
||||
def test_dummy_run_in_graph_run(self, mock_context, mock_get_context,
|
||||
mock_update_full_graph_params):
|
||||
mock_update_full_graph_params, mock_get_context_2):
|
||||
last_use_cuda_graph = self.proposer.use_cuda_graph
|
||||
mock_return_context = MagicMock()
|
||||
mock_return_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
|
||||
@@ -448,6 +453,7 @@ class TestEagleProposerDummyRun(TestBase):
|
||||
# cpu does not support parallel-group, let alone `sp`
|
||||
mock_return_context.flash_comm_v1_enabled = False
|
||||
mock_get_context.return_value = mock_return_context
|
||||
mock_get_context_2.return_value = mock_return_context
|
||||
self.proposer.use_cuda_graph = True
|
||||
# cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
|
||||
with set_current_vllm_config(self.vllm_config):
|
||||
|
||||
Reference in New Issue
Block a user