diff --git a/examples/external_online_dp/run_dp_template.sh b/examples/external_online_dp/run_dp_template.sh
index 0243ea3e..08f263ad 100644
--- a/examples/external_online_dp/run_dp_template.sh
+++ b/examples/external_online_dp/run_dp_template.sh
@@ -29,4 +29,4 @@ vllm serve model_path \
     --trust-remote-code \
     --gpu-memory-utilization 0.9 \
     --quantization ascend \
-    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
\ No newline at end of file
+    --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
diff --git a/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
index 880b44ae..7a782258 100644
--- a/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
@@ -74,10 +74,7 @@ async def test_models(model: str, mode: str) -> None:
         "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
     }
     additional_config: dict[str, Any] = {}
-    speculative_config = {
-        "num_speculative_tokens": 2,
-        "method": "deepseek_mtp"
-    }
+    speculative_config = {"num_speculative_tokens": 2, "method": "mtp"}
     compilation_config = {
         "cudagraph_capture_sizes": [56],
         "cudagraph_mode": "FULL_DECODE_ONLY"
diff --git a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
index 80157588..3776e49c 100644
--- a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
@@ -84,10 +84,7 @@ async def test_models(model: str) -> None:
         "chunked_prefill_for_mla": True,
         "enable_weight_nz_layout": True
     }
-    speculative_config = {
-        "num_speculative_tokens": 1,
-        "method": "deepseek_mtp"
-    }
+    speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
     server_args = [
         "--quantization", "ascend", "--data-parallel-size", "2",
         "--tensor-parallel-size", "8", "--enable-expert-parallel", "--port",
diff --git a/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
index 35082edb..7a76a4a1 100644
--- a/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
@@ -76,10 +76,7 @@ async def test_models(model: str, mode: str) -> None:
         "HCCL_BUFFSIZE": "1024",
         "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True"
     }
-    speculative_config = {
-        "num_speculative_tokens": 1,
-        "method": "deepseek_mtp"
-    }
+    speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
     additional_config = {
         "torchair_graph_config": {
             "enabled": True,
diff --git a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
index 6413aba0..3f504ae9 100644
--- a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
@@ -62,10 +62,7 @@ async def test_models(model: str) -> None:
         "DISABLE_L2_CACHE": "1",
         "DYNAMIC_EPLB": "true",
     }
-    speculative_config = {
-        "num_speculative_tokens": 1,
-        "method": "deepseek_mtp"
-    }
+    speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
     compilation_config = {
         "cudagraph_capture_sizes": [24],
         "cudagraph_mode": "FULL_DECODE_ONLY"
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
index 7bfe3f5e..6754bdc8 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
@@ -29,7 +29,7 @@ deployment:
       --trust-remote-code
       --quantization ascend
       --gpu-memory-utilization 0.9
-      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
+      --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
       --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'

    -
@@ -50,7 +50,7 @@ deployment:
      --trust-remote-code
      --quantization ascend
      --gpu-memory-utilization 0.9
-     --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
+     --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
 benchmarks:
   acc:
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
index 01100f29..f672dde5 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
@@ -30,7 +30,7 @@ deployment:
      --quantization ascend
      --gpu-memory-utilization 0.9
      --enforce-eager
-     --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
+     --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'

    -
@@ -52,6 +52,6 @@ deployment:
      --quantization ascend
      --gpu-memory-utilization 0.9
      --enforce-eager
-     --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
+     --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
 benchmarks:
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
index 6ca189c4..fd093735 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
@@ -39,7 +39,7 @@ deployment:
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.9
-     --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
+     --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
      --kv-transfer-config
      '{"kv_connector": "LLMDataDistCMgrConnector",
      "kv_buffer_device": "npu",
@@ -69,7 +69,7 @@ deployment:
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.9
-     --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
+     --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
      --kv-transfer-config
      '{"kv_connector": "LLMDataDistCMgrConnector",
      "kv_buffer_device": "npu",
@@ -100,7 +100,7 @@ deployment:
      --max-num-batched-tokens 256
      --trust-remote-code
      --gpu-memory-utilization 0.9
-     --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
+     --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
      --kv-transfer-config
      '{"kv_connector": "LLMDataDistCMgrConnector",
      "kv_buffer_device": "npu",
@@ -130,7 +130,7 @@ deployment:
      --max-num-batched-tokens 256
      --trust-remote-code
      --gpu-memory-utilization 0.9
'{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' + --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config '{"kv_connector": "LLMDataDistCMgrConnector", "kv_buffer_device": "npu", diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml index 37a024b9..8b7723f5 100644 --- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml +++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml @@ -38,7 +38,7 @@ deployment: --max-num-batched-tokens 16384 --trust-remote-code --gpu-memory-utilization 0.9 - --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' + --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config '{"kv_connector": "LLMDataDistCMgrConnector", "kv_buffer_device": "npu", @@ -68,7 +68,7 @@ deployment: --max-num-batched-tokens 16384 --trust-remote-code --gpu-memory-utilization 0.9 - --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' + --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config '{"kv_connector": "LLMDataDistCMgrConnector", "kv_buffer_device": "npu", @@ -99,7 +99,7 @@ deployment: --max-num-batched-tokens 256 --trust-remote-code --gpu-memory-utilization 0.9 - --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' + --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config '{"kv_connector": "LLMDataDistCMgrConnector", "kv_buffer_device": "npu", @@ -129,7 +129,7 @@ deployment: --max-num-batched-tokens 256 --trust-remote-code --gpu-memory-utilization 0.9 - --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' + --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config '{"kv_connector": "LLMDataDistCMgrConnector", "kv_buffer_device": "npu", diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py index 6b90ec36..99c7d51c 100644 --- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py @@ -56,7 +56,7 @@ def mtp_correctness(sampling_config: SamplingParams, enable_expert_parallel=True, speculative_config={ "method": - "deepseek_mtp", + "mtp", "num_speculative_tokens": num_speculative_tokens, "disable_padded_drafter_batch": diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py index d5096717..ddaeeab9 100644 --- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py @@ -58,7 +58,7 @@ def mtp_torchair_correctness( distributed_executor_backend="mp", enable_expert_parallel=True, speculative_config={ - "method": "deepseek_mtp", + "method": "mtp", "num_speculative_tokens": 1, }, enforce_eager=False, diff --git a/tests/ut/torchair/test_torchair_model_runner.py b/tests/ut/torchair/test_torchair_model_runner.py index a11726f2..bbbe82bb 100644 --- a/tests/ut/torchair/test_torchair_model_runner.py +++ b/tests/ut/torchair/test_torchair_model_runner.py @@ -21,7 +21,7 @@ class TestNPUTorchairModelRunner(PytestBase): runner.vllm_config = MagicMock(spec=VllmConfig) runner.speculative_config = MagicMock( - method="deepseek_mtp", + method="mtp", 
             num_speculative_tokens=4,
             disable_padded_drafter_batch=False)

diff --git a/tests/ut/torchair/test_torchair_mtp_proposer.py b/tests/ut/torchair/test_torchair_mtp_proposer.py
index 50745226..ec2dc425 100644
--- a/tests/ut/torchair/test_torchair_mtp_proposer.py
+++ b/tests/ut/torchair/test_torchair_mtp_proposer.py
@@ -19,7 +19,7 @@ class TestTorchairMtpProposer(PytestBase):
         vllm_config.speculative_config = MagicMock()
         vllm_config.speculative_config.draft_model_config = MagicMock()
         vllm_config.speculative_config.draft_model_config.dtype = torch.float16
-        vllm_config.speculative_config.method = "deepseek_mtp"
+        vllm_config.speculative_config.method = "mtp"
         vllm_config.speculative_config.num_speculative_tokens = 5
         vllm_config.load_config = MagicMock()
         cache_config = CacheConfig(block_size=16)
diff --git a/vllm_ascend/compilation/acl_graph.py b/vllm_ascend/compilation/acl_graph.py
index 025ff3c1..4ddc1d85 100644
--- a/vllm_ascend/compilation/acl_graph.py
+++ b/vllm_ascend/compilation/acl_graph.py
@@ -257,7 +257,7 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
      softmax_lse) = param
     seq_lens_list = forward_context.attn_metadata[
         key].decode.seq_lens_list
-    if speculative_config and speculative_config.method == "deepseek_mtp" \
+    if speculative_config and speculative_config.method == "mtp" \
         and not forward_context.is_mtp_model:
         actual_seq_lengths = forward_context.attn_metadata[
             key].decode.actual_seq_lengths_q
diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index 60a54e51..26c4dc86 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -16,7 +16,6 @@
 import os

-import vllm_ascend.patch.platform.patch_config  # noqa
 import vllm_ascend.patch.platform.patch_distributed  # noqa
 import vllm_ascend.patch.platform.patch_ec_connector  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
diff --git a/vllm_ascend/patch/platform/patch_config.py b/vllm_ascend/patch/platform/patch_config.py
deleted file mode 100644
index b798fda3..00000000
--- a/vllm_ascend/patch/platform/patch_config.py
+++ /dev/null
@@ -1,234 +0,0 @@
-import ast
-
-from vllm.config.speculative import SpeculativeConfig
-from vllm.logger import logger
-
-
-def __post_init__(self):
-
-    # Note: "method" is a new parameter that helps to extend the
-    # configuration of non-model-based proposers, and the "model" parameter
-    # will be used to set the draft model, eagle head, or additional weight
-    # when needed. If users do not specify "method", the speculative method
-    # will be detected automatically if possible. If the speculative method
-    # can not be detected, it will be considered as the "draft_model" by
-    # default.
-
-    if self.model is None and self.num_speculative_tokens is not None:
-        # TODO(Shangming): Refactor mtp configuration logic when supporting
-        if (self.target_model_config
-                and self.target_model_config.hf_text_config.model_type
-                in ("deepseek_v3", "deepseek_v32", "mimo", "ernie4_5_moe",
-                    "qwen3_next")):
-            # use the draft model from the same model:
-            self.model = self.target_model_config.model
-            # Align the quantization of draft model for cases such as
-            # --quantization fp8 with a bf16 checkpoint.
-            if not self.quantization:
-                self.quantization = self.target_model_config.quantization
-        elif self.method in ("ngram", "[ngram]"):
-            self.model = "ngram"
-        elif self.method == "suffix":
-            self.model = "suffix"
-        else:
-            raise ValueError("num_speculative_tokens was provided but without "
-                             "speculative model.")
-
-    # Automatically configure the method for ngram when "model" is used
-    # instead of "method"
-    if self.method is None and (self.model is not None
-                                and self.model in ("ngram", "[ngram]")):
-        self.method = "ngram"
-
-    if self.method in ("ngram", "[ngram]"):
-        # Unified to "ngram" internally
-        self.method = "ngram"
-        # Set default values if not provided
-        if (self.prompt_lookup_min is None and self.prompt_lookup_max is None):
-            # TODO(woosuk): Tune these values. They are arbitrarily chosen.
-            self.prompt_lookup_min = 5
-            self.prompt_lookup_max = 5
-        elif self.prompt_lookup_min is None:
-            assert self.prompt_lookup_max is not None
-            self.prompt_lookup_min = self.prompt_lookup_max
-        elif self.prompt_lookup_max is None:
-            assert self.prompt_lookup_min is not None
-            self.prompt_lookup_max = self.prompt_lookup_min
-
-        # Validate values
-        if self.prompt_lookup_min < 1:
-            raise ValueError(
-                f"prompt_lookup_min={self.prompt_lookup_min} must be > 0")
-        if self.prompt_lookup_max < 1:
-            raise ValueError(
-                f"prompt_lookup_max={self.prompt_lookup_max} must be > 0")
-        if self.prompt_lookup_min > self.prompt_lookup_max:
-            raise ValueError(
-                f"prompt_lookup_min={self.prompt_lookup_min} must "
-                f"be <= prompt_lookup_max={self.prompt_lookup_max}")
-
-        # TODO: current we still need extract vocab_size from target model
-        # config, in future, we may try refactor it out, and set
-        # draft related config as None here.
-        self.draft_model_config = self.target_model_config
-        self.draft_parallel_config = self.target_parallel_config
-    elif self.method == "suffix":
-        self.draft_model_config = self.target_model_config
-        self.draft_parallel_config = self.target_parallel_config
-        self._validate_suffix_decoding()
-    else:
-        self.prompt_lookup_max = 0
-        self.prompt_lookup_min = 0
-
-        if self.model is not None:
-            # TODO: Move this import to the top once `ModelConfig`
-            # lives in `vllm.config.model`.
-            from vllm.config import ModelConfig
-            self.draft_model_config = ModelConfig(
-                model=self.model,
-                runner="draft",
-                tokenizer=self.target_model_config.tokenizer,
-                tokenizer_mode=self.target_model_config.tokenizer_mode,
-                trust_remote_code=self.target_model_config.trust_remote_code,
-                allowed_local_media_path=self.target_model_config.
-                allowed_local_media_path,
-                allowed_media_domains=self.target_model_config.
-                allowed_media_domains,
-                dtype=self.target_model_config.dtype,
-                seed=self.target_model_config.seed,
-                revision=self.revision,
-                code_revision=self.code_revision,
-                tokenizer_revision=self.target_model_config.tokenizer_revision,
-                spec_target_max_model_len=self.target_model_config.
-                max_model_len,
-                quantization=self.quantization,
-                enforce_eager=self.target_model_config.enforce_eager,
-                max_logprobs=self.target_model_config.max_logprobs,
-                hf_overrides=SpeculativeConfig.hf_config_override,
-            )
-
-            # Automatically detect the method
-            if self.method in ('eagle', 'eagle3'):
-                pass
-            # examples:
-            # yuhuili/EAGLE-LLaMA3-Instruct-8B
-            # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
-            # AngelSlim/Qwen3-8B_eagle3
-            elif "eagle-" in self.draft_model_config.model.lower():
-                self.method = "eagle"
-            elif "eagle3" in self.draft_model_config.model.lower():
-                self.method = "eagle3"
-            elif self.draft_model_config.hf_config.model_type == "medusa":
-                self.method = "medusa"
-            elif (self.draft_model_config.hf_config.model_type ==
-                  "mlp_speculator"):
-                self.method = "mlp_speculator"
-            elif (self.draft_model_config.hf_config.model_type
-                  in ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp")):
-                self.method = "deepseek_mtp"
-                if self.num_speculative_tokens > 1:
-                    logger.warning(
-                        "All Deepseek MTP models only have " \
-                        "one layer. Might need some code changes " \
-                        "to support multiple layers."
-                    )
-            elif (self.draft_model_config.hf_config.model_type == "ernie_mtp"):
-                self.method = "ernie_mtp"
-                if self.num_speculative_tokens > 1:
-                    logger.warning(
-                        "All Ernie MTP models only have " \
-                        "one layer. Might need some code changes " \
-                        "to support multiple layers."
-                    )
-            elif (self.draft_model_config.hf_config.model_type ==
-                  "qwen3_next_mtp"):
-                self.method = "qwen3_next_mtp"
-                if self.num_speculative_tokens > 1:
-                    logger.warning(
-                        "All Qwen3Next MTP models only have " \
-                        "one layer. Might need some code changes " \
-                        "to support multiple layers."
-                    )
-            elif (self.draft_model_config.hf_config.model_type
-                  in ("longcat_flash_mtp")):
-                self.method = "longcat_flash_mtp"
-                if self.num_speculative_tokens > 1:
-                    logger.warning(
-                        "LongCat MTP models only have " \
-                        "one layer. Might need some code changes " \
-                        "to support multiple layers."
-                    )
-            else:
-                self.method = "draft_model"
-                raise NotImplementedError(
-                    "Speculative decoding with draft model is not "
-                    "supported yet. Please consider using other "
-                    "speculative decoding methods such as ngram, medusa, "
-                    "eagle, or deepseek_mtp.")
-
-            # Replace hf_config for EAGLE draft_model
-            if self.method in ("eagle", "eagle3"):
-                from vllm.transformers_utils.configs import SpeculatorsConfig
-                from vllm.transformers_utils.configs.eagle import EAGLEConfig
-
-                if isinstance(self.draft_model_config.hf_config,
-                              (EAGLEConfig, SpeculatorsConfig)):
-                    pass
-                else:
-                    eagle_config = EAGLEConfig(
-                        self.draft_model_config.hf_config,
-                        method=self.method,
-                        model_type="eagle")
-                    self.draft_model_config.hf_config = eagle_config
-
-            if (self.num_speculative_tokens is not None
-                    and hasattr(self.draft_model_config.hf_config,
-                                "num_lookahead_tokens")):
-                self.draft_model_config.hf_config.num_lookahead_tokens = \
-                    self.num_speculative_tokens
-
-            n_predict = getattr(self.draft_model_config.hf_config, "n_predict",
-                                None)
-            if n_predict is not None:
-                if self.num_speculative_tokens is None:
-                    # Default to max value defined in draft model config.
-                    self.num_speculative_tokens = n_predict
-                elif self.num_speculative_tokens > n_predict and \
-                        self.num_speculative_tokens % n_predict != 0:
-                    # Ensure divisibility for MTP module reuse.
-                    raise ValueError(
-                        f"num_speculative_tokens:{self.num_speculative_tokens}"
-                        f" must be divisible by {n_predict=}")
-
-            if self.speculative_token_tree is None:
-                # Generate chain of tokens.
-                self.speculative_token_tree = str([
-                    (i + 1) * (0, ) for i in range(self.num_speculative_tokens)
-                ])
-            else:
-                # Sort the token tree breadth-first.
-                tree_choices = ast.literal_eval(self.speculative_token_tree)
-                self.speculative_token_tree = str(
-                    sorted(tree_choices, key=lambda t: (len(t), t)))
-
-            self.draft_tensor_parallel_size = \
-                SpeculativeConfig._verify_and_get_draft_tp(
-                    self.target_parallel_config,
-                    self.draft_tensor_parallel_size,
-                    self.draft_model_config.hf_config
-                )
-
-            self.draft_model_config.max_model_len = (
-                SpeculativeConfig._maybe_override_draft_max_model_len(
-                    self.max_model_len,
-                    self.draft_model_config.max_model_len,
-                    self.target_model_config.max_model_len,
-                ))
-
-        self.draft_parallel_config = (
-            SpeculativeConfig.create_draft_parallel_config(
-                self.target_parallel_config,
-                self.draft_tensor_parallel_size))
-
-
-SpeculativeConfig.__post_init__ = __post_init__
diff --git a/vllm_ascend/spec_decode/__init__.py b/vllm_ascend/spec_decode/__init__.py
index a8d44875..50f65de7 100644
--- a/vllm_ascend/spec_decode/__init__.py
+++ b/vllm_ascend/spec_decode/__init__.py
@@ -32,7 +32,7 @@ def get_spec_decode_method(method,
         return NgramProposer(vllm_config, device, runner)
     elif method in ("eagle", "eagle3"):
         return EagleProposer(vllm_config, device, runner)
-    elif method in ('deepseek_mtp', 'qwen3_next_mtp'):
+    elif method == "mtp":
         if is_torchair_graph:
             return TorchairMtpProposer(vllm_config, device, runner)
         return MtpProposer(vllm_config, device, runner)
diff --git a/vllm_ascend/torchair/torchair_mla.py b/vllm_ascend/torchair/torchair_mla.py
index b1ed979c..5846bbd2 100644
--- a/vllm_ascend/torchair/torchair_mla.py
+++ b/vllm_ascend/torchair/torchair_mla.py
@@ -317,7 +317,7 @@ class AscendMLATorchairMetadataBuilder:
                 dtype=self.model_config.dtype,
                 device=device)
             if self.vllm_config.speculative_config is not None and\
-                    self.vllm_config.speculative_config.method == 'deepseek_mtp':
+                    self.vllm_config.speculative_config.method == 'mtp':
                 attn_state = AscendAttentionState.SpecDecoding
                 num_decode_tokens = 2
             else:
diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py
index d7c55c6e..012183e2 100644
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -501,7 +501,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
     def update_torchair_graph_batch_sizes(self):
         # return graph_batch_sizes according to the max number of tokens
         # first pad according to the number of requests
-        if self.is_kv_consumer and self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+        if self.is_kv_consumer and self.speculative_config and self.speculative_config.method == 'mtp':
             # pd disaggregation scenario may incorrectly calculate the batch in mtp scenario, so we force set it to max_num_reqs
             self.torchair_graph_batch_sizes = [self.max_num_reqs]
             logger.warning(
diff --git a/vllm_ascend/torchair/torchair_sfa.py b/vllm_ascend/torchair/torchair_sfa.py
index 7e1fe325..19e88017 100644
--- a/vllm_ascend/torchair/torchair_sfa.py
+++ b/vllm_ascend/torchair/torchair_sfa.py
@@ -319,7 +319,7 @@ class AscendSFATorchairMetadataBuilder:
                 device=device)

             if self.vllm_config.speculative_config is not None and\
-                    self.vllm_config.speculative_config.method == 'deepseek_mtp':
+                    self.vllm_config.speculative_config.method == 'mtp':
                 attn_state = AscendAttentionState.SpecDecoding
                 num_decode_tokens = 2
             else:
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 04ff0287..f6bd7f08 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2044,13 +2044,13 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
             # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
             elif np.all(num_scheduled_tokens == 1):
                 attn_state = AscendAttentionState.DecodeOnly
-                if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+                if self.speculative_config and self.speculative_config.method == 'mtp':
                     # SpecDecoding now supports seq_len=1 and seq_len=2
                     # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
                     attn_state = AscendAttentionState.SpecDecoding
             # Speculative decoding.
             elif np.all(num_valid_tokens == 1):
-                if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+                if self.speculative_config and self.speculative_config.method == 'mtp':
                     attn_state = AscendAttentionState.SpecDecoding
                 else:
                     attn_state = AscendAttentionState.ChunkedPrefill
@@ -2701,7 +2701,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
         with ProfileExecuteDuration().capture_async("Draft"):
             if self.speculative_config:
                 use_padded_batch_for_eagle = self.speculative_config and \
-                    self.speculative_config.method in ("deepseek_mtp", "qwen3_next_mtp") and \
+                    self.speculative_config.method == "mtp" and \
                     not self.speculative_config.disable_padded_drafter_batch
                 if use_padded_batch_for_eagle:
                     # EAGLE speculative decoding can use the GPU sampled tokens
@@ -2900,7 +2900,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
                 block_table_tensor[:num_reqs * self.decode_threshold]
             attn_state = AscendAttentionState.DecodeOnly
             if self.speculative_config and \
-                self.speculative_config.method == "deepseek_mtp":
+                self.speculative_config.method == "mtp":
                 attn_state = AscendAttentionState.SpecDecoding

             common_metadata = CommonAttentionMetadata(