diff --git a/vllm_ascend/ops/register_custom_ops.py b/vllm_ascend/ops/register_custom_ops.py
index 06a52ae1..b7100991 100644
--- a/vllm_ascend/ops/register_custom_ops.py
+++ b/vllm_ascend/ops/register_custom_ops.py
@@ -116,7 +116,7 @@ def _maybe_prefetch_mlp_gate_up_proj_impl(x_dependency: torch.Tensor,
     except AssertionError:
         return
 
-    if not forward_context.prefetch_mlp_enabled:
+    if not getattr(forward_context, 'prefetch_mlp_enabled', False):
         return
     model_instance = forward_context.model_instance
     prefetch_stream = forward_context.prefetch_stream
@@ -173,7 +173,7 @@ def _maybe_prefetch_mlp_down_proj_impl(x_dependency: torch.Tensor) -> None:
     except AssertionError:
         return
 
-    if not forward_context.prefetch_mlp_enabled:
+    if not getattr(forward_context, 'prefetch_mlp_enabled', False):
         return
     forward_context.prefetch_mlp_down_proj = True
     model_instance = forward_context.model_instance
@@ -202,7 +202,7 @@ def _maybe_wait_prefetch_done_impl(x: torch.Tensor) -> None:
     except AssertionError:
         return
 
-    if not forward_context.prefetch_mlp_enabled:
+    if not getattr(forward_context, 'prefetch_mlp_enabled', False):
         return
     if forward_context.prefetch_mlp_gate_up_proj or \
             forward_context.prefetch_mlp_down_proj:
diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_omni.py b/vllm_ascend/patch/worker/patch_qwen2_5_omni.py
index f52d1a1d..c272edb3 100644
--- a/vllm_ascend/patch/worker/patch_qwen2_5_omni.py
+++ b/vllm_ascend/patch/worker/patch_qwen2_5_omni.py
@@ -18,8 +18,7 @@
 import torch
 import torch.nn as nn
 from vllm.model_executor.models.qwen2_5_omni_thinker import (
-    Qwen2_5_VLImageInputs, Qwen2_5_VLVideoInputs,
-    Qwen2_5OmniThinkerForConditionalGeneration)
+    Qwen2_5_VLImageInputs, Qwen2_5_VLVideoInputs)
 
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
 
@@ -65,8 +64,3 @@ class AscendQwen2_5OmniThinkerForConditionalGeneration(nn.Module):
         sizes = grid_thw.prod(-1) // merge_size // merge_size
 
         return video_embeds.split(sizes.tolist())
-
-
-# NOTE: These will be removed after ascend_forward_context is refactored.
-Qwen2_5OmniThinkerForConditionalGeneration._process_image_input = AscendQwen2_5OmniThinkerForConditionalGeneration._process_image_input
-Qwen2_5OmniThinkerForConditionalGeneration._process_video_input = AscendQwen2_5OmniThinkerForConditionalGeneration._process_video_input
diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
index 7db4323d..62a1e67e 100644
--- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
+++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
@@ -20,9 +20,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch_npu
-from vllm.model_executor.models.qwen2_5_vl import (
-    Qwen2_5_VisionAttention, Qwen2_5_VLForConditionalGeneration,
-    Qwen2_5_VLImageInputs, Qwen2_5_VLVideoInputs)
+from vllm.model_executor.models.qwen2_5_vl import (Qwen2_5_VisionAttention,
+                                                   Qwen2_5_VLImageInputs,
+                                                   Qwen2_5_VLVideoInputs)
 from vllm.model_executor.models.qwen2_vl import Qwen2VisionAttention
 from vllm.model_executor.models.vision import run_dp_sharded_mrope_vision_model
 
@@ -169,7 +169,3 @@ class AscendQwen2_5_VLForConditionalGeneration(nn.Module):
 # NOTE: This will be removed after MMEncoderAttention has been extract as a CustomOp in vllm.
 Qwen2VisionAttention.forward = AscendQwen2_5_VisionAttention.forward
 Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward
-
-# NOTE: These will be removed after ascend_forward_context is refactored.
-Qwen2_5_VLForConditionalGeneration._process_image_input = AscendQwen2_5_VLForConditionalGeneration._process_image_input
-Qwen2_5_VLForConditionalGeneration._process_video_input = AscendQwen2_5_VLForConditionalGeneration._process_video_input