remove useless patch (#4699)

patch_config is useless now. Let's remove it.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
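What the change amounts to, as a short runnable sketch (assumption: SpecConfig and uses_mtp below are hypothetical stand-ins, not code from vllm-ascend or vLLM): upstream vLLM now detects MTP draft models itself and reports the single unified method name "mtp", so the patched SpeculativeConfig.__post_init__ is dead weight, and every Ascend call site that matched per-model names like "deepseek_mtp" or "qwen3_next_mtp" collapses to one comparison.

# Illustrative sketch only -- SpecConfig and uses_mtp are hypothetical
# stand-ins, not code from this repo.
from dataclasses import dataclass
from typing import Optional


@dataclass
class SpecConfig:
    method: Optional[str] = None


def uses_mtp(cfg: Optional[SpecConfig]) -> bool:
    # Before this PR: cfg.method in ("deepseek_mtp", "qwen3_next_mtp")
    # After this PR, upstream reports one unified name:
    return cfg is not None and cfg.method == "mtp"


assert uses_mtp(SpecConfig(method="mtp"))
assert not uses_mtp(SpecConfig(method="deepseek_mtp"))  # legacy name is gone
assert not uses_mtp(None)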
@@ -257,7 +257,7 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
          softmax_lse) = param
     seq_lens_list = forward_context.attn_metadata[
         key].decode.seq_lens_list
-    if speculative_config and speculative_config.method == "deepseek_mtp" \
+    if speculative_config and speculative_config.method == "mtp" \
             and not forward_context.is_mtp_model:
         actual_seq_lengths = forward_context.attn_metadata[
             key].decode.actual_seq_lengths_q
@@ -16,7 +16,6 @@
 import os
 
-import vllm_ascend.patch.platform.patch_config  # noqa
 import vllm_ascend.patch.platform.patch_distributed  # noqa
 import vllm_ascend.patch.platform.patch_ec_connector  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 
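The import deleted above is what applied the patch: in vllm_ascend's patch framework, merely importing a patch module rebinds upstream attributes as a side effect. A minimal self-contained sketch of that pattern (Upstream and _patched_greet are hypothetical stand-ins; only the mechanism mirrors patch_config.py):

class Upstream:
    def greet(self) -> str:
        return "upstream"


def _patched_greet(self) -> str:
    return "patched"


# patch_config.py ended with the same kind of assignment
# (SpeculativeConfig.__post_init__ = __post_init__), so importing the
# module was enough to apply the override.
Upstream.greet = _patched_greet

assert Upstream().greet() == "patched"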
@@ -1,234 +0,0 @@
-import ast
-
-from vllm.config.speculative import SpeculativeConfig
-from vllm.logger import logger
-
-
-def __post_init__(self):
-
-    # Note: "method" is a new parameter that helps to extend the
-    # configuration of non-model-based proposers, and the "model" parameter
-    # will be used to set the draft model, eagle head, or additional weight
-    # when needed. If users do not specify "method", the speculative method
-    # will be detected automatically if possible. If the speculative method
-    # can not be detected, it will be considered as the "draft_model" by
-    # default.
-
-    if self.model is None and self.num_speculative_tokens is not None:
-        # TODO(Shangming): Refactor mtp configuration logic when supporting
-        if (self.target_model_config
-                and self.target_model_config.hf_text_config.model_type
-                in ("deepseek_v3", "deepseek_v32", "mimo", "ernie4_5_moe",
-                    "qwen3_next")):
-            # use the draft model from the same model:
-            self.model = self.target_model_config.model
-            # Align the quantization of draft model for cases such as
-            # --quantization fp8 with a bf16 checkpoint.
-            if not self.quantization:
-                self.quantization = self.target_model_config.quantization
-        elif self.method in ("ngram", "[ngram]"):
-            self.model = "ngram"
-        elif self.method == "suffix":
-            self.model = "suffix"
-        else:
-            raise ValueError("num_speculative_tokens was provided but without "
-                             "speculative model.")
-
-    # Automatically configure the method for ngram when "model" is used
-    # instead of "method"
-    if self.method is None and (self.model is not None
-                                and self.model in ("ngram", "[ngram]")):
-        self.method = "ngram"
-
-    if self.method in ("ngram", "[ngram]"):
-        # Unified to "ngram" internally
-        self.method = "ngram"
-        # Set default values if not provided
-        if (self.prompt_lookup_min is None and self.prompt_lookup_max is None):
-            # TODO(woosuk): Tune these values. They are arbitrarily chosen.
-            self.prompt_lookup_min = 5
-            self.prompt_lookup_max = 5
-        elif self.prompt_lookup_min is None:
-            assert self.prompt_lookup_max is not None
-            self.prompt_lookup_min = self.prompt_lookup_max
-        elif self.prompt_lookup_max is None:
-            assert self.prompt_lookup_min is not None
-            self.prompt_lookup_max = self.prompt_lookup_min
-
-        # Validate values
-        if self.prompt_lookup_min < 1:
-            raise ValueError(
-                f"prompt_lookup_min={self.prompt_lookup_min} must be > 0")
-        if self.prompt_lookup_max < 1:
-            raise ValueError(
-                f"prompt_lookup_max={self.prompt_lookup_max} must be > 0")
-        if self.prompt_lookup_min > self.prompt_lookup_max:
-            raise ValueError(
-                f"prompt_lookup_min={self.prompt_lookup_min} must "
-                f"be <= prompt_lookup_max={self.prompt_lookup_max}")
-
-        # TODO: current we still need extract vocab_size from target model
-        # config, in future, we may try refactor it out, and set
-        # draft related config as None here.
-        self.draft_model_config = self.target_model_config
-        self.draft_parallel_config = self.target_parallel_config
-    elif self.method == "suffix":
-        self.draft_model_config = self.target_model_config
-        self.draft_parallel_config = self.target_parallel_config
-        self._validate_suffix_decoding()
-    else:
-        self.prompt_lookup_max = 0
-        self.prompt_lookup_min = 0
-
-        if self.model is not None:
-            # TODO: Move this import to the top once `ModelConfig`
-            # lives in `vllm.config.model`.
-            from vllm.config import ModelConfig
-            self.draft_model_config = ModelConfig(
-                model=self.model,
-                runner="draft",
-                tokenizer=self.target_model_config.tokenizer,
-                tokenizer_mode=self.target_model_config.tokenizer_mode,
-                trust_remote_code=self.target_model_config.trust_remote_code,
-                allowed_local_media_path=self.target_model_config.
-                allowed_local_media_path,
-                allowed_media_domains=self.target_model_config.
-                allowed_media_domains,
-                dtype=self.target_model_config.dtype,
-                seed=self.target_model_config.seed,
-                revision=self.revision,
-                code_revision=self.code_revision,
-                tokenizer_revision=self.target_model_config.tokenizer_revision,
-                spec_target_max_model_len=self.target_model_config.
-                max_model_len,
-                quantization=self.quantization,
-                enforce_eager=self.target_model_config.enforce_eager,
-                max_logprobs=self.target_model_config.max_logprobs,
-                hf_overrides=SpeculativeConfig.hf_config_override,
-            )
-
-            # Automatically detect the method
-            if self.method in ('eagle', 'eagle3'):
-                pass
-            # examples:
-            # yuhuili/EAGLE-LLaMA3-Instruct-8B
-            # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
-            # AngelSlim/Qwen3-8B_eagle3
-            elif "eagle-" in self.draft_model_config.model.lower():
-                self.method = "eagle"
-            elif "eagle3" in self.draft_model_config.model.lower():
-                self.method = "eagle3"
-            elif self.draft_model_config.hf_config.model_type == "medusa":
-                self.method = "medusa"
-            elif (self.draft_model_config.hf_config.model_type ==
-                  "mlp_speculator"):
-                self.method = "mlp_speculator"
-            elif (self.draft_model_config.hf_config.model_type
-                  in ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp")):
-                self.method = "deepseek_mtp"
-                if self.num_speculative_tokens > 1:
-                    logger.warning(
-                        "All Deepseek MTP models only have " \
-                        "one layer. Might need some code changes " \
-                        "to support multiple layers."
-                    )
-            elif (self.draft_model_config.hf_config.model_type == "ernie_mtp"):
-                self.method = "ernie_mtp"
-                if self.num_speculative_tokens > 1:
-                    logger.warning(
-                        "All Ernie MTP models only have " \
-                        "one layer. Might need some code changes " \
-                        "to support multiple layers."
-                    )
-            elif (self.draft_model_config.hf_config.model_type ==
-                  "qwen3_next_mtp"):
-                self.method = "qwen3_next_mtp"
-                if self.num_speculative_tokens > 1:
-                    logger.warning(
-                        "All Qwen3Next MTP models only have " \
-                        "one layer. Might need some code changes " \
-                        "to support multiple layers."
-                    )
-            elif (self.draft_model_config.hf_config.model_type
-                  in ("longcat_flash_mtp")):
-                self.method = "longcat_flash_mtp"
-                if self.num_speculative_tokens > 1:
-                    logger.warning(
-                        "LongCat MTP models only have " \
-                        "one layer. Might need some code changes " \
-                        "to support multiple layers."
-                    )
-            else:
-                self.method = "draft_model"
-                raise NotImplementedError(
-                    "Speculative decoding with draft model is not "
-                    "supported yet. Please consider using other "
-                    "speculative decoding methods such as ngram, medusa, "
-                    "eagle, or deepseek_mtp.")
-
-            # Replace hf_config for EAGLE draft_model
-            if self.method in ("eagle", "eagle3"):
-                from vllm.transformers_utils.configs import SpeculatorsConfig
-                from vllm.transformers_utils.configs.eagle import EAGLEConfig
-
-                if isinstance(self.draft_model_config.hf_config,
-                              (EAGLEConfig, SpeculatorsConfig)):
-                    pass
-                else:
-                    eagle_config = EAGLEConfig(
-                        self.draft_model_config.hf_config,
-                        method=self.method,
-                        model_type="eagle")
-                    self.draft_model_config.hf_config = eagle_config
-
-            if (self.num_speculative_tokens is not None
-                    and hasattr(self.draft_model_config.hf_config,
-                                "num_lookahead_tokens")):
-                self.draft_model_config.hf_config.num_lookahead_tokens = \
-                    self.num_speculative_tokens
-
-            n_predict = getattr(self.draft_model_config.hf_config, "n_predict",
-                                None)
-            if n_predict is not None:
-                if self.num_speculative_tokens is None:
-                    # Default to max value defined in draft model config.
-                    self.num_speculative_tokens = n_predict
-                elif self.num_speculative_tokens > n_predict and \
-                        self.num_speculative_tokens % n_predict != 0:
-                    # Ensure divisibility for MTP module reuse.
-                    raise ValueError(
-                        f"num_speculative_tokens:{self.num_speculative_tokens}"
-                        f" must be divisible by {n_predict=}")
-
-            if self.speculative_token_tree is None:
-                # Generate chain of tokens.
-                self.speculative_token_tree = str([
-                    (i + 1) * (0, ) for i in range(self.num_speculative_tokens)
-                ])
-            else:
-                # Sort the token tree breadth-first.
-                tree_choices = ast.literal_eval(self.speculative_token_tree)
-                self.speculative_token_tree = str(
-                    sorted(tree_choices, key=lambda t: (len(t), t)))
-
-            self.draft_tensor_parallel_size = \
-                SpeculativeConfig._verify_and_get_draft_tp(
-                    self.target_parallel_config,
-                    self.draft_tensor_parallel_size,
-                    self.draft_model_config.hf_config
-                )
-
-            self.draft_model_config.max_model_len = (
-                SpeculativeConfig._maybe_override_draft_max_model_len(
-                    self.max_model_len,
-                    self.draft_model_config.max_model_len,
-                    self.target_model_config.max_model_len,
-                ))
-
-            self.draft_parallel_config = (
-                SpeculativeConfig.create_draft_parallel_config(
-                    self.target_parallel_config,
-                    self.draft_tensor_parallel_size))
-
-
-SpeculativeConfig.__post_init__ = __post_init__
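With the whole monkey-patch gone, MTP speculative decoding on Ascend is configured purely through upstream vLLM. A hedged usage sketch (assumptions: vLLM's offline LLM API accepts a speculative_config dict, the unified "mtp" method name is user-selectable, and the model name is a placeholder):

from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V3",  # placeholder target model
    speculative_config={
        "method": "mtp",              # unified name; previously "deepseek_mtp"
        "num_speculative_tokens": 1,  # MTP checkpoints ship a single draft layer
    },
)
outputs = llm.generate("Hello, my name is")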
@@ -32,7 +32,7 @@ def get_spec_decode_method(method,
         return NgramProposer(vllm_config, device, runner)
     elif method in ("eagle", "eagle3"):
         return EagleProposer(vllm_config, device, runner)
-    elif method in ('deepseek_mtp', 'qwen3_next_mtp'):
+    elif method == "mtp":
         if is_torchair_graph:
             return TorchairMtpProposer(vllm_config, device, runner)
         return MtpProposer(vllm_config, device, runner)
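For context, the branch structure after this hunk, as a minimal runnable stand-in (the proposer classes below are hypothetical placeholders; only the dispatch shape mirrors the diff):

class MtpProposer:
    pass


class TorchairMtpProposer:
    pass


def pick_mtp_proposer(method: str, is_torchair_graph: bool):
    # One unified "mtp" branch replaces the ('deepseek_mtp', 'qwen3_next_mtp') pair.
    if method == "mtp":
        return TorchairMtpProposer() if is_torchair_graph else MtpProposer()
    raise ValueError(f"unsupported speculative method: {method}")


assert isinstance(pick_mtp_proposer("mtp", True), TorchairMtpProposer)
assert isinstance(pick_mtp_proposer("mtp", False), MtpProposer)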
@@ -317,7 +317,7 @@ class AscendMLATorchairMetadataBuilder:
                 dtype=self.model_config.dtype,
                 device=device)
         if self.vllm_config.speculative_config is not None and\
-                self.vllm_config.speculative_config.method == 'deepseek_mtp':
+                self.vllm_config.speculative_config.method == 'mtp':
             attn_state = AscendAttentionState.SpecDecoding
             num_decode_tokens = 2
         else:
@@ -501,7 +501,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
     def update_torchair_graph_batch_sizes(self):
         # return graph_batch_sizes according to the max number of tokens
         # first pad according to the number of requests
-        if self.is_kv_consumer and self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+        if self.is_kv_consumer and self.speculative_config and self.speculative_config.method == 'mtp':
             # pd disaggregation scenario may incorrectly calculate the batch in mtp scenario, so we force set it to max_num_reqs
             self.torchair_graph_batch_sizes = [self.max_num_reqs]
             logger.warning(
@@ -319,7 +319,7 @@ class AscendSFATorchairMetadataBuilder:
                 device=device)
 
         if self.vllm_config.speculative_config is not None and\
-                self.vllm_config.speculative_config.method == 'deepseek_mtp':
+                self.vllm_config.speculative_config.method == 'mtp':
             attn_state = AscendAttentionState.SpecDecoding
             num_decode_tokens = 2
         else:
@@ -2044,13 +2044,13 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
         # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
         elif np.all(num_scheduled_tokens == 1):
             attn_state = AscendAttentionState.DecodeOnly
-            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+            if self.speculative_config and self.speculative_config.method == 'mtp':
                 # SpecDecoding now supports seq_len=1 and seq_len=2
                 # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
                 attn_state = AscendAttentionState.SpecDecoding
         # Speculative decoding.
         elif np.all(num_valid_tokens == 1):
-            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+            if self.speculative_config and self.speculative_config.method == 'mtp':
                 attn_state = AscendAttentionState.SpecDecoding
             else:
                 attn_state = AscendAttentionState.ChunkedPrefill
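The hunk above gates on batch shape; a simplified runnable sketch of that classification (assumption: the arrays stand in for scheduler output, and the real code also inspects num_valid_tokens):

import numpy as np


def classify(num_scheduled_tokens: np.ndarray, mtp_enabled: bool) -> str:
    # One scheduled token per request means pure decode; with MTP the
    # drafter can append a token, so SpecDecoding (seq_len 1 or 2) applies.
    if np.all(num_scheduled_tokens == 1):
        return "SpecDecoding" if mtp_enabled else "DecodeOnly"
    return "ChunkedPrefill"


assert classify(np.array([1, 1, 1]), mtp_enabled=True) == "SpecDecoding"
assert classify(np.array([1, 4, 1]), mtp_enabled=False) == "ChunkedPrefill"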
@@ -2701,7 +2701,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
         with ProfileExecuteDuration().capture_async("Draft"):
             if self.speculative_config:
                 use_padded_batch_for_eagle = self.speculative_config and \
-                    self.speculative_config.method in ("deepseek_mtp", "qwen3_next_mtp") and \
+                    self.speculative_config.method == "mtp" and \
                     not self.speculative_config.disable_padded_drafter_batch
                 if use_padded_batch_for_eagle:
                     # EAGLE speculative decoding can use the GPU sampled tokens
@@ -2900,7 +2900,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
             block_table_tensor[:num_reqs * self.decode_threshold]
         attn_state = AscendAttentionState.DecodeOnly
         if self.speculative_config and \
-                self.speculative_config.method == "deepseek_mtp":
+                self.speculative_config.method == "mtp":
             attn_state = AscendAttentionState.SpecDecoding
 
         common_metadata = CommonAttentionMetadata(