diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index 8e14b4d2..f28bb0f4 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -19,6 +19,7 @@ from vllm_ascend.utils import ( is_drafter_moe_model, is_moe_model, speculative_enable_dispatch_gmm_combine_decode, + vllm_version_is, ) @@ -152,6 +153,9 @@ def set_ascend_forward_context( mc2_mask[num_actual_tokens:] = False forward_context.mc2_mask = mc2_mask + if is_draft_model and vllm_version_is("0.15.0"): + forward_context.remaining_moe_layers = None + try: yield finally: diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 17abdc42..054469c5 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -553,17 +553,6 @@ class EagleProposer(VllmEagleProposer): model_hidden_states, model_positions = self.maybe_pad_and_reduce(model_hidden_states, model_positions) - # Expend the remaining moe layers for suiting vllm. - forward_context = get_forward_context() - if forward_context and hasattr(forward_context, "remaining_moe_layers"): - if self.num_speculative_tokens > 1: - moe_layers_needed = len(forward_context.remaining_moe_layers) * self.num_speculative_tokens - if len(forward_context.remaining_moe_layers) < moe_layers_needed: - original_layers = list(forward_context.remaining_moe_layers) - repeat_count = (moe_layers_needed + len(original_layers) - 1) // len(original_layers) - expanded_layers = original_layers * repeat_count - forward_context.remaining_moe_layers = expanded_layers - ret_hidden_states = self.model( input_ids=model_input_ids, positions=model_positions,