cleanup useless torchair logic (#4856)

This PR cleans up useless torchair logic in the model runner. The moge doc is
only for torchair, so it can be removed as well.

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
wangxiyuan
2025-12-11 11:21:13 +08:00
committed by GitHub
parent c12eb22cbe
commit bb76f7962c
7 changed files with 22 additions and 307 deletions

View File

@@ -607,7 +607,6 @@ class MtpProposer(Proposer):
attn_mask=self.runner.attn_mask,
spec_attn_mask=self.runner.spec_attn_mask,
attn_state=self.runner.attn_state,
graph_pad_size=self.runner.graph_pad_size,
decode_token_per_req=self.runner.decode_token_per_req,
)
return spec_common_attn_metadata, token_indices
@@ -762,8 +761,7 @@ class MtpProposer(Proposer):
) and aclgraph_runtime_mode == CUDAGraphMode.FULL:
graph_pad_size = num_input_tokens
else:
# Currently, runner.graph_pad_size will always be -1.
graph_pad_size = self.runner.graph_pad_size
graph_pad_size = -1
# If use fullgraph and disable_padded_drafter_batch=True, We need to
# update the graph_pad_size in common_attn_metadata, to tell the
@@ -1135,7 +1133,6 @@ class MtpProposer(Proposer):
attn_mask=self.runner.attn_mask,
spec_attn_mask=self.runner.spec_attn_mask,
attn_state=self.runner.attn_state,
graph_pad_size=self.runner.graph_pad_size,
decode_token_per_req=self.runner.decode_token_per_req,
num_computed_tokens_cpu=common_attn_metadata.
num_computed_tokens_cpu,