diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index 4e50526c..08b1c7a4 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -30,18 +30,9 @@
 # --------------------------------
 # * Platform Patch:
 # =================
-# ** File: platform/patch_distributed.py**
+# ** 1. File: platform/patch_distributed.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.config.ParallelConfig.get_next_dp_init_port`
-# Why:
-# vllm doesn't support get port from environment.
-# How:
-# Add the logic to get port from environment.
-# Related PR (if no, explain why):
-# Need a PR to vllm to support get port from environment.
-# Future Plan:
-# Remove those patch when vllm merged them
-# 2. `torch.distributed.all_reduce`, `torch.distributed.broadcast`
+# 1. `torch.distributed.all_reduce`, `torch.distributed.broadcast`
 # Why:
 # tensor alignment for 310p
 # How:
@@ -51,21 +42,85 @@
 # Future Plan:
 # Find a better way to support tensor alignment for 310p without this patch.
 #
-# ** File: worker/patch_multimodal_merge.py**
+# ** 2. File: platform/patch_ec_connector.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.models.utils._merge_multimodal_embeddings`
+# 1. `vllm.distributed.ec_transfer.ec_connector.shared_storage_connector.ECSharedStorageConnector.start_load_caches`
 # Why:
-# '_merge_multimodal_embeddings' func of vllm is incompatible with Ascend.
+# The device is hard-coded to cuda.
 # How:
-# Replace with CPU operation that can be executed asynchronously.
+# Change the hard-coded cuda device to npu.
 # Related PR (if no, explain why):
-# This is a bug by Ascend only. It can' be fixed in vLLM.
+# https://github.com/vllm-project/vllm/pull/30225
 # Future Plan:
-# Identify this pattern in torch-npu and remove this patch.
+# Remove this patch when vLLM merges the PR.
+#
+# ** 3. File: platform/patch_mamba_config.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.model_executor.models.config.HybridAttentionMambaModelConfig.verify_and_update_config`
+# Why:
+# The block size is set to 16 in vLLM, which is not supported by Ascend.
+# How:
+# Set the block size to 128 on npu.
+# Related PR (if no, explain why):
+# No related PR yet; we'll fix this in vLLM soon.
+# Future Plan:
+# Remove this patch when vLLM merges the PR.
+#
+# ** 4. File: platform/patch_multiproc_executor.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.v1.executor.multiproc_executor.MultiprocExecutor`
+# Why:
+# vLLM creates its child processes with daemon=True, which breaks the EPLB case, since EPLB spawns
+# another process and daemonic processes are not allowed to have children.
+# How:
+# Set daemon=False in MultiprocExecutor.
+# Related PR (if no, explain why):
+# No. We need to find a way to support daemon=False in vLLM.
+# Future Plan:
+# Remove this patch when vLLM fixes the issue.
+#
+# ** 5. File: platform/patch_sched_yield.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.distributed.utils.USE_SCHED_YIELD`
+# Why:
+# os.sched_yield() doesn't work on Arm systems.
+# How:
+# Avoid using os.sched_yield() on Arm systems.
+# Related PR (if no, explain why):
+# https://github.com/vllm-project/vllm/pull/30228
+# Future Plan:
+# Remove this patch when vLLM merges the PR.
+#
 #
 # * Worker Patch:
 # ===============
-# ** File: worker/patch_minicpm.py **
+#
+# ** 1. File: worker/patch_deepseek.py **
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `DeepseekV2Model.forward`
+# Why:
+# getattr(self.config, "llama_4_scaling", None) raises an AttributeError
+# on npu in graph mode.
+# How:
+# Catch the AttributeError and set llama_4_scaling to None.
+# Related PR (if no, explain why):
+# No, this is a bug in vLLM Ascend.
+# Future Plan:
+# Find the root cause of this bug and fix it in vLLM Ascend.
+#
+# ** 2. File: worker/patch_distributed.py **
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.distributed.parallel_state.GroupCoordinator`
+# Why:
+# vllm doesn't support all_to_all for GroupCoordinator.
+# How:
+# Add an all_to_all implementation for GroupCoordinator.
+# Related PR (if no, explain why):
+# No, we should use the vLLM all2all manager to support all_to_all for npu.
+# Future Plan:
+# Remove this patch when the refactor of the all2all manager is done.
+#
+# ** 3. File: worker/patch_minicpm.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.minicpm.MiniCPMAttention.forward`
 # Why:
@@ -79,32 +134,65 @@
 # Future Plan:
 # Keep this patch in vllm-ascend.
 #
-# ** File: worker/patch_distributed.py **
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.distributed.parallel_state.GroupCoordinator`
-# (1) __init__()
+# ** 4. File: worker/patch_multimodal_merge.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.model_executor.models.utils._merge_multimodal_embeddings`
 # Why:
-# The original GroupCoordinator initialization lacks pg_options to generate new
-# process group with customized options.
-# How:
-# Inject HCCL options during process group initialization.
-# Related PR (if no, explain why):
-# Need a PR to vllm to support a dictionary as input while initializing distributed
-# environment (e.g., Dict[str, torch.distributed.ProcessGroupHCCL.Options])
-# https://github.com/vllm-project/vllm/pull/25417
-# Future Plan:
-# Remove this patch when vllm merges this PR.
-# (2) all_to_all()
-# Why:
-# vllm doesn't support all_to_all for GroupCoordinator.
+# '_merge_multimodal_embeddings' func of vllm is incompatible with Ascend.
 # How:
-# Add all_to_all implementation for GroupCoordinator.
+# Replace with CPU operation that can be executed asynchronously.
 # Related PR (if no, explain why):
-# Need a PR to vllm to support all_to_all for GroupCoordinator.
+# This is a bug by Ascend only. It can't be fixed in vLLM.
 # Future Plan:
-# Remove this patch when vllm merged them.
+# Identify this pattern in torch-npu and remove this patch.
 #
-# ** File: worker/patch_roberta.py **
+# ** 5. File: worker/patch_qwen2_5_omni.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.model_executor.models.qwen2_5_omni_thinker.Qwen2_5OmniThinkerForConditionalGeneration`
+# Why:
+# We have an Ascend forward context which doesn't work with upstream.
+# How:
+# Override forward_context in the model file.
+# Related PR (if no, explain why):
+# This is a bug by Ascend only. We should drop set_ascend_forward_context.
+# Future Plan:
+# Remove this patch once forward_context is refactored.
+#
+# ** 6. File: worker/patch_qwen2_5_vl.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration`
+# Why:
+# We have an Ascend forward context which doesn't work with upstream.
+# How:
+# Override forward_context in the model file.
+# Related PR (if no, explain why):
+# This is a bug by Ascend only. We should drop set_ascend_forward_context.
+# Future Plan:
+# Remove this patch once forward_context is refactored.
+#
+# 2. `vllm.model_executor.models.qwen2_vl.Qwen2VisionAttention.forward`
+# Why:
+# The attention is not a custom op.
+# How:
+# Make it a custom op so it is pluggable.
+# Related PR (if no, explain why):
+# https://github.com/vllm-project/vllm/pull/30125
+# Future Plan:
+# Remove this patch once the PR is merged into vLLM.
+#
+# ** 7. File: worker/patch_qwen3_vl.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.forward`
+# Why:
+# The attention is not a custom op.
+# How:
+# Make it a custom op so it is pluggable.
+# Related PR (if no, explain why):
+# https://github.com/vllm-project/vllm/pull/30125
+# Future Plan:
+# Remove this patch once the PR is merged into vLLM.
+#
+# ** 8. File: worker/patch_roberta.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.bert `
 # Why:
@@ -116,18 +204,29 @@
 # Future Plan:
 # Revert this when CANN support shift aclnn operation
 #
-# ** File: worker/patch_deepseek_mtp.py**
+# ** 9. File: worker/patch_triton.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer.__init__`
+# 1. `vllm.model_executor.layers.mamba.ops`, `vllm.model_executor.layers.fla.ops`
 # Why:
-# '__init__' func of DeepSeekMultiTokenPredictorLayer didn't pass prefix to SharedHead.
+# Triton ops in vLLM do not perform well on NPU, and there is no dispatch mechanism for triton ops.
 # How:
-# Replace with a new __init__.
-# Use a new SharedHead which passes prefix to ParallelLMHead.
+# Override the triton ops in vLLM with the Ascend implementation.
 # Related PR (if no, explain why):
-# https://github.com/vllm-project/vllm/pull/25805
+# No, vLLM needs to support dispatch for triton ops first.
 # Future Plan:
-# Remove this patch when adapted vllm version contains the above PR.
+# Remove this patch when vLLM supports the dispatch mechanism.
+#
+# ** 10. File: worker/patch_weight_loader.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.model_executor.layers.linear.UnquantizedLinearMethod`
+# Why:
+# vLLM Ascend doesn't work with weight loader v2.
+# How:
+# Patch it to fix the bug.
+# Related PR (if no, explain why):
+# This is a bug by Ascend only. We should fix it soon.
+# Future Plan:
+# Remove this patch when the bug is fixed.
 #
 # ** File: worker/patch_qwen3_next_mtp.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_omni.py b/vllm_ascend/patch/worker/patch_qwen2_5_omni.py
index bd91a33e..f52d1a1d 100644
--- a/vllm_ascend/patch/worker/patch_qwen2_5_omni.py
+++ b/vllm_ascend/patch/worker/patch_qwen2_5_omni.py
@@ -67,6 +67,6 @@ class AscendQwen2_5OmniThinkerForConditionalGeneration(nn.Module):
         return video_embeds.split(sizes.tolist())
 
 
-# NOTE: These will be removed after https://github.com/vllm-project/vllm/pull/29388 is merged.
+# NOTE: These will be removed after ascend_forward_context is refactored.
 Qwen2_5OmniThinkerForConditionalGeneration._process_image_input = AscendQwen2_5OmniThinkerForConditionalGeneration._process_image_input
 Qwen2_5OmniThinkerForConditionalGeneration._process_video_input = AscendQwen2_5OmniThinkerForConditionalGeneration._process_video_input
diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
index 1c2f356b..7db4323d 100644
--- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
+++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
@@ -170,6 +170,6 @@ class AscendQwen2_5_VLForConditionalGeneration(nn.Module):
 Qwen2VisionAttention.forward = AscendQwen2_5_VisionAttention.forward
 Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward
 
-# NOTE: These will be removed after https://github.com/vllm-project/vllm/pull/29388 is merged.
+# NOTE: These will be removed after ascend_forward_context is refactored.
 Qwen2_5_VLForConditionalGeneration._process_image_input = AscendQwen2_5_VLForConditionalGeneration._process_image_input
 Qwen2_5_VLForConditionalGeneration._process_video_input = AscendQwen2_5_VLForConditionalGeneration._process_video_input
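
For readers skimming this diff, the worker patches take effect through plain attribute rebinding, exactly as the closing lines of the two hunks above show. The following self-contained sketch illustrates that pattern; UpstreamModel and AscendModel are hypothetical stand-ins, not the real vLLM classes.

# Illustrative only: stand-in classes, not the real vLLM model classes.
class UpstreamModel:
    """Plays the role of a vLLM class such as Qwen2_5_VLForConditionalGeneration."""

    def _process_image_input(self, image_input):
        return f"upstream path: {image_input}"


class AscendModel:
    """Plays the role of the Ascend override defined in a patch module."""

    def _process_image_input(self, image_input):
        # A real patch would run NPU-friendly ops here instead of the CUDA-only path.
        return f"ascend path: {image_input}"


# Importing a patch module performs this rebinding once, at module level, so every
# later instance of the upstream class transparently uses the Ascend method.
UpstreamModel._process_image_input = AscendModel._process_image_input

assert UpstreamModel()._process_image_input("img") == "ascend path: img"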
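The platform patches documented in __init__.py mostly flip a single module-level switch rather than replacing methods. Below is a hypothetical, self-contained illustration in the spirit of platform/patch_sched_yield.py; the real patch targets vllm.distributed.utils.USE_SCHED_YIELD, while the stand-in namespace and the architecture check are assumptions made only for this sketch.

# Hypothetical sketch of a flag-override platform patch; not the verbatim
# vllm-ascend implementation. The real target is
# vllm.distributed.utils.USE_SCHED_YIELD; fake_dist_utils stands in for that
# module so the snippet runs without vLLM installed.
import platform
import types

fake_dist_utils = types.SimpleNamespace(USE_SCHED_YIELD=True)

# os.sched_yield() behaves poorly on Arm hosts, so turn the spin-wait yield off
# there and keep the upstream default everywhere else.
if platform.machine().lower() in ("aarch64", "arm64"):
    fake_dist_utils.USE_SCHED_YIELD = False

print("USE_SCHED_YIELD =", fake_dist_utils.USE_SCHED_YIELD)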