diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 16c0f68d..00014c80 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -571,26 +571,6 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
             vllm_config.model_config.architectures[0], num_hidden_layers,
             len(original_sizes))
 
-    # default or defined cudagraph_capture_sizes may not consider num_speculative_tokens>1 scenario
-    # the maximum size cudagraph_capture_sizes[0] should be greater or equal than
-    # (num_speculative_tokens+1)*max_num_seqs, otherwise draft model will run in eager mode
-    if vllm_config.speculative_config is not None and \
-        vllm_config.speculative_config.num_speculative_tokens > 1:
-        num_speculative_tokens = vllm_config.speculative_config.num_speculative_tokens
-        max_num_seqs = vllm_config.scheduler_config.max_num_seqs
-        original_sizes, compilation_config.cudagraph_capture_sizes = \
-            compilation_config.cudagraph_capture_sizes, None
-        assert len(original_sizes) > 0
-        if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
-            enlarged_sizes = [(num_speculative_tokens + 1) * size
-                              for size in original_sizes]
-            update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
-            logger.info(
-                "Adjusted ACL graphs: %s → %s for speculative decoding",
-                original_sizes, enlarged_sizes)
-        else:
-            compilation_config.cudagraph_capture_sizes = original_sizes
-
 
 # TODO(wxy): Move to ops module
 def dispose_tensor(x: torch.Tensor):
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 907a6e07..04ff0287 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -4027,6 +4027,16 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
                     "; please try cudagraph_mode=PIECEWISE, "
                     "and make sure compilation level is piecewise")
 
+        if (aclgraph_mode.decode_mode() == CUDAGraphMode.FULL
+                and aclgraph_mode.separate_routine()
+                and self.uniform_decode_query_len > 1):
+            self.compilation_config.adjust_cudagraph_sizes_for_spec_decode(
+                self.uniform_decode_query_len,
+                self.parallel_config.tensor_parallel_size)
+            capture_sizes = self.compilation_config.cudagraph_capture_sizes
+            self.aclgraph_batch_sizes = (capture_sizes
+                                         if capture_sizes is not None else [])
+
         self.aclgraph_dispatcher.initialize_cudagraph_keys(
             self.compilation_config.cudagraph_mode,
             self.uniform_decode_query_len)
@@ -4122,17 +4132,8 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
                 x for x in self.aclgraph_batch_sizes
                 if x <= max_num_tokens and x >= self.uniform_decode_query_len
             ]
-            compilation_cases_decode = sorted(decode_cudagraph_batch_sizes)
-            # TODO: refactor this when vLLM supports mtp>1
-            if not all(x % self.uniform_decode_query_len == 0
-                       for x in decode_cudagraph_batch_sizes):
-                raise ValueError(
-                    "In the MTP fullgraph scenario, each graph size must be an integer multiple of "
-                    f"(num_speculative_tokens + 1): {self.uniform_decode_query_len}. "
-                    f"Please modify the cudagraph_capture_sizes variable to be integer multiple of {self.uniform_decode_query_len}, "
-                    f"while ensuring the maximum cudagraph_capture_sizes does not exceed max_num_seqs * (num_speculative_tokens + 1): {max_num_tokens}. "
-                    "For example, with MTP=2 and max_num_seqs=16, we recommend setting cudagraph_capture_sizes to [48]."
-                )
+            compilation_cases_decode = list(
+                reversed(decode_cudagraph_batch_sizes))
             self._capture_aclgraphs(
                 compilation_cases=compilation_cases_decode,
                 aclgraph_runtime_mode=CUDAGraphMode.FULL,
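
For context on what replaces the deleted enlargement logic in update_aclgraph_sizes: the patch now calls vLLM's CompilationConfig.adjust_cudagraph_sizes_for_spec_decode(uniform_decode_query_len, tensor_parallel_size) and rereads cudagraph_capture_sizes afterwards. Below is a minimal sketch of what that helper is assumed to do; the rounding rule and the TP handling are inferred from the call site and the deleted divisibility check, not taken from the upstream source.

# Sketch only (assumed behavior, hypothetical helper name): every FULL-graph
# decode capture size must be a multiple of uniform_decode_query_len
# (i.e. num_speculative_tokens + 1), so each size is rounded down to the
# nearest such multiple; undersized entries and duplicates are dropped.
import math


def adjust_sizes_for_spec_decode(capture_sizes: list[int],
                                 uniform_decode_query_len: int,
                                 tensor_parallel_size: int = 1) -> list[int]:
    # Hypothetical TP handling: keep sizes divisible by both factors.
    multiple_of = math.lcm(uniform_decode_query_len, tensor_parallel_size)
    rounded = {size // multiple_of * multiple_of
               for size in capture_sizes if size >= multiple_of}
    return sorted(rounded)


# Example with MTP=2 (uniform_decode_query_len = 3):
# adjust_sizes_for_spec_decode([1, 2, 4, 8, 16, 32, 48], 3) -> [3, 6, 15, 30, 48]

Rounding down (rather than multiplying every size up, as the deleted vllm_ascend code did) keeps the adjusted sizes within the originally configured maximum, which is why the hard ValueError in _capture_aclgraphs is no longer needed.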