[Model] GLM5 adaptation (#6642)

### What this PR does / why we need it? GLM5 adaptation: 1. Use torch_npu.npu_lightning_indexer for GLM5. 2. Forbid the eagle proposer when fullgraph mode is enabled, because of known bugs. 3. Add quantization config for GLM5. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? By CI. - vLLM main: 978a37c823 --------- Signed-off-by: yydyzr <liuyuncong1@huawei.com> Signed-off-by: shenchuxiaofugui <1311027364@qq.com> Co-authored-by: shenchuxiaofugui <1311027364@qq.com>
2026-02-11 22:22:22 +08:00
parent 140fcaffc3
commit ff3a50d011
17 changed files with 77 additions and 34 deletions
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -36,7 +36,14 @@ class MtpProposer(EagleProposer):
        dummy_compute_logits=lambda hidden_states: None,
        is_profile=False,
    ) -> None:
-        if self.pcp_size * self.dcp_size == 1 and not self.speculative_config.disable_padded_drafter_batch:
+        # Currently, both GLM and DS encounter issues when enabling the fullgraph mode and running on EagleProposer.
+        # Therefore, we temporarily bypass this problem by adding a conditional check for fullgraph.
+        # TODO: this conditional check should be removed after bug fixing.
+        if (
+            self.pcp_size * self.dcp_size == 1
+            and not self.speculative_config.disable_padded_drafter_batch
+            and not self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        ):
            super().dummy_run(
                num_tokens,
                with_prefill,
@@ -166,7 +173,14 @@ class MtpProposer(EagleProposer):
        scheduler_output: SchedulerOutput = None,
        num_scheduled_tokens: int = 0,
    ) -> torch.Tensor:
-        if self.pcp_size * self.dcp_size == 1 and not self.speculative_config.disable_padded_drafter_batch:
+        # Currently, both GLM and DS encounter issues when enabling the fullgraph mode and running on EagleProposer.
+        # Therefore, we temporarily bypass this problem by adding a conditional check for fullgraph.
+        # TODO: this conditional check should be removed after bug fixing.
+        if (
+            self.pcp_size * self.dcp_size == 1
+            and not self.speculative_config.disable_padded_drafter_batch
+            and not self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        ):
            draft_token_ids = super()._propose(
                target_token_ids,
                target_positions,