# vllm_vacc patch module — registers VACC replacement implementations for vLLM.
# (Recovered from a web listing: original file reported as 679 lines / 30 KiB, Python.)
from vllm_vacc.patch_util import PatchManager
def patch_block_manager():
    """Rebind vLLM's block-management classes to their VACC counterparts.

    Each upstream module keeps its public attribute name; only the object
    bound to that name is swapped for the ``vllm_vacc`` implementation.
    """
    from vllm_vacc.vllm.core.block.block_table import BlockTable
    from vllm_vacc.vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
    from vllm_vacc.vllm.core.block.naive_block import NaiveBlockAllocator
    from vllm_vacc.vllm.core.block_manager import SelfAttnBlockSpaceManager

    import vllm.core.block.block_table
    import vllm.core.block.cpu_gpu_block_allocator
    import vllm.core.block.naive_block
    import vllm.core.block_manager

    vllm.core.block.block_table.BlockTable = BlockTable
    vllm.core.block.cpu_gpu_block_allocator.CpuGpuBlockAllocator = CpuGpuBlockAllocator
    vllm.core.block.naive_block.NaiveBlockAllocator = NaiveBlockAllocator
    vllm.core.block_manager.SelfAttnBlockSpaceManager = SelfAttnBlockSpaceManager

    # NOTE: vllm.core.block.prefix_caching_block (PrefixCachingBlockAllocator /
    # PrefixCachingBlock) is currently left unpatched.
def patch_vllm_envs():
    """Override vLLM environment-derived settings.

    Raises the execute-model timeout to 3600 s (1 hour) by default so that
    long prefill passes do not trip the stock timeout; the
    ``VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS`` environment variable still wins
    when it is set.

    Raises:
        ValueError: if the environment variable is set to a non-integer.
    """
    import os
    import vllm.envs as env

    # NOTE: the previous comment here claimed "1000s" while the code set
    # 3600 — the value below (1 hour) is the actual default.
    env.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS = int(
        os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "3600")
    )
# patch_block_manager()  # disabled: block-manager patching is not applied here
patch_vllm_envs()  # run at module import so the env override is in place early
# Flag read by patch_vllm(): when True, the DeepseekV2 MLP/MoE __init__
# patches for the "transposed w2" weight layout are registered as well.
TRANSPOSE_MLP_MOE_W2 = True
class VllmPatchManager(PatchManager):
    """vLLM-specific PatchManager that keeps its own patch registry."""

    # Class-level registry of vLLM patches, separate from any registry the
    # PatchManager base class may hold.
    vllm_patch_info: dict = {}

    @classmethod
    def get_patch_info(cls):
        # Override to return this subclass's own registry (child_patch_info)
        # rather than the parent's.
        return cls.vllm_patch_info
def patch_torch():
    """Neutralize ``torch.compile`` so patched models always run eagerly.

    ``torch.compile`` supports two call forms:

    * direct:  ``torch.compile(fn, ...)`` or bare ``@torch.compile`` —
      the callable is the first positional argument;
    * factory: ``torch.compile(...)`` / ``@torch.compile(mode=...)`` —
      returns a decorator that is then applied to the function.

    The previous stub only handled the factory form: for the direct form it
    returned a decorator instead of the function, so bare ``@torch.compile``
    produced a broken callable.  This replacement returns the target
    function unchanged in both forms (and without a pass-through wrapper,
    preserving the function's identity and metadata).
    """
    import torch

    def _noop_compile(*args, **kwargs):
        # Direct form: first positional argument is the target callable.
        if args and callable(args[0]):
            return args[0]

        # Factory form: hand back a decorator that leaves the function as-is.
        def _decorator(func):
            return func

        return _decorator

    # Unregister torch.compile by rebinding it to the no-op.
    torch.compile = _noop_compile
def regist_mock_module():
    """Install a stub ``flash_attn`` module so imports of it succeed.

    The stub's attributes are callable placeholders: any attribute access
    fabricates a new placeholder, and calling one prints a trace line and
    returns 0.
    """
    import sys
    import types

    class _MockAttr(object):
        """Callable placeholder that fabricates attributes on demand."""

        def __init__(self, name=""):
            self.name = name

        def __getattr__(self, item):
            # Missing attributes become nested placeholders with a dotted name.
            return _MockAttr(f"{self.name}.{item}")

        def __call__(self, *args, **kwargs):
            print(f"mock module: {self.name}")
            return 0

    stub = types.ModuleType('flash_attn')
    # NOTE(review): __spec__ = False looks like a deliberate sentinel to
    # satisfy spec checks on the fake module — confirm before changing.
    stub.__spec__ = False
    stub.flash_attn_varlen_func = _MockAttr("flash_attn_varlen_func")
    sys.modules['flash_attn'] = stub
def patch_vllm(vpm: VllmPatchManager) -> None:
    """Register every VACC patch for the vLLM v0 engine with *vpm*.

    Two registration styles are used:

    * ``batch_register_patch([...])`` for targets whose replacement
      presumably lives at the mirrored ``vllm_vacc.vllm...`` path and is
      resolved by the manager from the dotted name — confirm in
      ``PatchManager``.
    * explicit ``register_patch(target, replacement, ...)`` where the
      replacement is imported by hand; ``create_dummy=True`` /
      ``allow_create=True`` is passed when the target attribute must be
      created on the upstream class rather than replaced.

    Args:
        vpm: patch manager that records (and later applies) the patches.
    """
    # --- bulk patches: replacement looked up at the mirrored vacc path ---
    vpm.batch_register_patch(
        [
            "vllm.model_executor.custom_op.CustomOp.dispatch_forward",
            "vllm.distributed.parallel_state.GroupCoordinator.all_reduce",
            "vllm.distributed.parallel_state.GroupCoordinator.broadcast_tensor_dict",
            "vllm.distributed.parallel_state.GroupCoordinator.all_gather",
            "vllm.distributed.parallel_state.GroupCoordinator.recv_tensor_dict",
            "vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod.apply",
            "vllm.sequence.SequenceData.append_token_id",
            # "vllm.attention.backends.mla.utils.MLACommonImpl.process_weights_after_loading",
            "vllm.attention.backends.mla.common.MLACommonImpl.__init__",
            "vllm.attention.backends.mla.common.MLACommonImpl.forward",
            "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.determine_num_available_blocks",
            "vllm.spec_decode.top1_proposer.Top1Proposer.get_spec_proposals",
            "vllm.spec_decode.multi_step_worker.MultiStepWorker.sampler_output",
            "vllm.inputs.preprocess.InputPreprocessor._process_embeds",
            "vllm.inputs.data.EmbedsPrompt",
            "vllm.inputs.data.EmbedsInputs",
            "vllm.inputs.data.embeds_inputs",
            "vllm.entrypoints.renderer.BaseRenderer.load_prompt_embeds",
            "vllm.entrypoints.renderer.CompletionRenderer.render_prompt_and_embeds",
            # "vllm.model_executor.layers.sampler.Sampler.forward",
            "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.create_weights",
            "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.apply",
            "vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod.create_weights",
            "vllm.model_executor.layers.linear.MergedColumnParallelLinear.weight_loader_v2",
            "vllm.model_executor.layers.linear.UnquantizedLinearMethod.apply",
            "vllm.model_executor.layers.quantization.gptq.GPTQConfig.get_supported_act_dtypes",
            "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.process_weights_after_loading",
            "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.apply",
            "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.create_weights",
            "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.process_weights_after_loading",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE._load_w13",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE._load_w2",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE.weight_loader",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE._load_model_weight_or_group_weight_scale",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MoE.forward",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MLP.forward",
            # "vllm.model_executor.models.deepseek_v2.DeepseekV2DecoderLayer.forward",
            "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2ForCausalLM.load_weights",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2ForCausalLM.forward",
            "vllm.model_executor.models.deepseek_mtp.DeepSeekMTP.load_weights",
            "vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer.forward",
            "vllm.model_executor.models.deepseek_mtp.SharedHead.forward",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2Model.forward",
            "vllm.core.block.common.BlockList.append_token_ids",
            "vllm.core.block.naive_block.NaiveBlock.append_token_ids",
            "vllm.model_executor.layers.sampler.SamplerOutput",
            # "vllm.core.scheduler.Scheduler._schedule_prefills",
            "vllm.engine.multiprocessing.engine.run_mp_engine",
            "vllm.engine.llm_engine.LLMEngine.from_engine_args",
            "vllm.engine.multiprocessing.engine.MQLLMEngine._handle_process_request",
            "vllm.engine.multiprocessing.engine.MQLLMEngine._handle_abort_request",
            "vllm.engine.metrics.LoggingStatLogger.log",
            "vllm.entrypoints.openai.serving_engine.OpenAIServing._validate_input",
            "vllm.entrypoints.openai.serving_engine.OpenAIServing._log_inputs",
            "vllm.entrypoints.openai.serving_engine.EmbedsPrompt",
            "vllm.reasoning.deepseek_r1_reasoning_parser.DeepSeekR1ReasoningParser",
            "vllm.model_executor.layers.fused_moe.fused_moe.fused_topk",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeDecoderLayer.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeAttention.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeModel.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeForCausalLM.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeForCausalLM.load_weights",
            "vllm.model_executor.models.qwen2.Qwen2DecoderLayer.forward",
            "vllm.model_executor.models.qwen2.Qwen2MLP.forward",
            "vllm.model_executor.models.qwen2.Qwen2Attention.forward",
            "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionAttention.split_qkv",
            "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionPatchEmbed.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionPatchEmbed.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.rot_pos_emb",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.fast_pos_embed_interpolate",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionBlock.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionPatchMerger.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionMLP.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3VLForConditionalGeneration.get_input_embeddings",
            "vllm.model_executor.models.qwen3_vl.Qwen3VLForConditionalGeneration._clear_deepstack_input_embeds",
            # "vllm.model_executor.models.qwen3_vl.Qwen3VLProcessingInfo.get_hf_processor",
            # "vllm.model_executor.models.qwen3_vl.Qwen3VLProcessingInfo.get_image_processor",
            # "vllm.model_executor.models.qwen2_vl.Qwen2VLProcessingInfo.get_hf_processor",
            # "vllm.model_executor.models.qwen2_vl.Qwen2VLProcessingInfo.get_image_processor",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionAttention.__init__",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionAttention.split_qkv",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionMLP.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionPatchMerger.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionPatchEmbed.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionTransformer.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionBlock.forward",
            "vllm.model_executor.layers.rotary_embedding.mrope.MRotaryEmbedding._qwen3vl_get_input_positions_tensor",
            "vllm.spec_decode.metrics.AsyncMetricsCollector._copy_rejsample_metrics_async",
            "vllm.model_executor.layers.quantization.moe_wna16.MoeWNA16Method.create_weights",
            "vllm.model_executor.layers.quantization.moe_wna16.MoeWNA16Method.process_weights_after_loading",
            "vllm.model_executor.layers.pooler.ClassifierPooler.forward",
            "vllm.model_executor.layers.pooler.PoolerNormalize.forward_chunk",
            "vllm.model_executor.models.roberta.RobertaEmbedding.forward",
            "vllm.model_executor.models.bert.BertLayer.forward",
            "vllm.model_executor.models.qwen3.Qwen3DecoderLayer.forward",
            "vllm.model_executor.models.qwen3.Qwen3Attention.forward",
        ]
    )

    # --- explicit patches: replacement name differs from the target ---
    from vllm_vacc.vllm.model_executor.models.qwen3_moe import Qwen3MoeModel
    vpm.register_patch(
        "vllm.model_executor.models.qwen3_vl_moe.Qwen3MoeLLMModel.forward",
        Qwen3MoeModel.forward,
    )
    from vllm_vacc.vllm.attention.backends.mla.utils import MLACommonImpl
    vpm.register_patch(
        # "vllm.v1.attention.backends.mla.common.MLACommonImpl.process_weights_after_loading",
        "vllm.v1.attention.backends.mla.common.MLACommonImpl.process_weights_after_loading",
        MLACommonImpl.process_weights_after_loading,
    )

    # --- rotary embedding patches ---
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _compute_inv_freq_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.DeepseekScalingRotaryEmbedding._compute_inv_freq",
        _compute_inv_freq_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _deepseek_compute_cos_sin_cache_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.DeepseekScalingRotaryEmbedding._compute_cos_sin_cache",
        _deepseek_compute_cos_sin_cache_vacc,
    )
    # (re-import of _compute_inv_freq_vacc — already in scope from above)
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _compute_inv_freq_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.YaRNScalingRotaryEmbedding._compute_inv_freq",
        _compute_inv_freq_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _yarn_compute_cos_sin_cache_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.YaRNScalingRotaryEmbedding._compute_cos_sin_cache",
        _yarn_compute_cos_sin_cache_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _compute_cos_sin_cache_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.RotaryEmbedding._compute_cos_sin_cache",
        _compute_cos_sin_cache_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import RotaryEmbedding_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.RotaryEmbedding.forward",
        RotaryEmbedding_forward_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import RotaryEmbedding_init_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.RotaryEmbedding.__init__",
        RotaryEmbedding_init_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import ScalingRotaryEmbedding_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.DeepseekScalingRotaryEmbedding.forward",
        ScalingRotaryEmbedding_forward_vacc,
    )

    # --- embedding / activation / norm layer patches ---
    from vllm_vacc.vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding_forward
    vpm.register_patch(
        "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.forward",
        VocabParallelEmbedding_forward,
    )
    from vllm_vacc.vllm.model_executor.layers.activation import SiluAndMul_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.activation.SiluAndMul.forward",
        SiluAndMul_forward_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.layernorm import RMSNorm_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.layernorm.RMSNorm.forward",
        RMSNorm_forward_vacc,
    )

    # --- fused MoE / quantization constructor patches ---
    from vllm_vacc.vllm.model_executor.layers.fused_moe.layer import FusedMoE_init_
    vpm.register_patch(
        "vllm.model_executor.layers.fused_moe.layer.FusedMoE.__init__",
        FusedMoE_init_
    )
    from vllm_vacc.vllm.model_executor.layers.quantization.fp8 import Fp8MoEMethod_init_
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod.__init__",
        Fp8MoEMethod_init_
    )
    from vllm_vacc.vllm.model_executor.layers.quantization.fp8 import moe_fp8_apply
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod.apply",
        moe_fp8_apply
    )
    from vllm_vacc.vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod__init
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.__init__",
        Fp8LinearMethod__init
    )
    from vllm_vacc.vllm.config import ModelConfig___verify_quantization
    vpm.register_patch(
        "vllm.config.ModelConfig._verify_quantization",
        ModelConfig___verify_quantization
    )
    from vllm_vacc.vllm.config import _get_head_dtype
    vpm.register_patch(
        "vllm.config.model._get_head_dtype",
        _get_head_dtype
    )
    from vllm_vacc.vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.QUANTIZATION_METHODS",
        QUANTIZATION_METHODS
    )
    from vllm_vacc.vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod__init
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.__init__",
        GPTQLinearMethod__init
    )

    # --- linear layer patches ---
    from vllm_vacc.vllm.model_executor.layers.linear import ReplicatedLinear__init__
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ReplicatedLinear.__init__",
        ReplicatedLinear__init__
    )
    from vllm_vacc.vllm.model_executor.layers.linear import ReplicatedLinear_weight_loader
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ReplicatedLinear.weight_loader",
        ReplicatedLinear_weight_loader
    )
    from vllm_vacc.vllm.model_executor.layers.linear import ColumnParallelLinear__init__
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ColumnParallelLinear.__init__",
        ColumnParallelLinear__init__
    )
    from vllm_vacc.vllm.model_executor.layers.linear import ColumnParallelLinear_weight_loader_v2
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ColumnParallelLinear.weight_loader_v2",
        ColumnParallelLinear_weight_loader_v2
    )
    from vllm_vacc.vllm.model_executor.layers.linear import RowParallelLinear__init__
    vpm.register_patch(
        "vllm.model_executor.layers.linear.RowParallelLinear.__init__",
        RowParallelLinear__init__
    )
    from vllm_vacc.vllm.model_executor.layers.linear import RowParallelLinear_weight_loader_v2_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.linear.RowParallelLinear.weight_loader_v2",
        RowParallelLinear_weight_loader_v2_vacc
    )

    # --- DeepseekV2 fused-attention patches ---
    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2MLAAttention_init__
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2MLAAttention.__init__",
        DeepseekV2MLAAttention_init__
    )
    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2MLAAttention_forward
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2MLAAttention.forward",
        DeepseekV2MLAAttention_forward
    )
    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import merge_qkv_weights
    # merge_qkv_weights does not exist upstream; create it on the class.
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2MLAAttention.merge_qkv_weights",
        merge_qkv_weights,
        create_dummy=True,
        allow_create=True
    )
    # for input_layernorm fuse
    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2DecoderLayer_forward
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2DecoderLayer.forward",
        DeepseekV2DecoderLayer_forward
    )
    if TRANSPOSE_MLP_MOE_W2:
        # __init__ variants that load the w2 projection weights transposed.
        from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2MLP__init__, DeepseekV2MoE__init__
        vpm.register_patch(
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MLP.__init__",
            DeepseekV2MLP__init__
        )
        vpm.register_patch(
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MoE.__init__",
            DeepseekV2MoE__init__
        )
        # NOTE(review): source indentation was lost in transit; the Qwen2MLP
        # patch is grouped under the transpose flag here because it serves
        # the same purpose — confirm against the original file.
        from vllm_vacc.vllm.model_executor.models.qwen2 import Qwen2MLP__init__
        vpm.register_patch(
            "vllm.model_executor.models.qwen2.Qwen2MLP.__init__",
            Qwen2MLP__init__
        )

    # --- custom op patches ---
    from vllm_vacc.vllm._custom_ops import cutlass_scaled_mm_vacc
    vpm.register_patch(
        "vllm._custom_ops.cutlass_scaled_mm",
        cutlass_scaled_mm_vacc
    )
    from vllm_vacc.vllm._custom_ops import concat_and_cache_mla
    vpm.register_patch(
        "vllm._custom_ops.concat_and_cache_mla",
        concat_and_cache_mla
    )
    from vllm_vacc.vllm.model_executor.layers.quantization.utils.fp8_utils import _apply_w8a8_block_fp8_linear
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.utils.fp8_utils.apply_w8a8_block_fp8_linear",
        _apply_w8a8_block_fp8_linear
    )
    from vllm_vacc.vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk_with_itype
    vpm.register_patch(
        "vllm.model_executor.layers.fused_moe.fused_moe.grouped_topk",
        grouped_topk_with_itype
    )

    # --- sampler / spec-decode patches (currently disabled) ---
    # from vllm_vacc.vllm.model_executor.sampling_metadata import SamplingMetadata_prepare
    # vpm.register_patch(
    #     "vllm.v1.sample.sampling_metadata.SamplingMetadata.prepare",
    #     SamplingMetadata_prepare
    # )
    # from vllm_vacc.vllm.model_executor.sampling_metadata import SamplingTensors_from_lists
    # vpm.register_patch(
    #     "vllm.model_executor.sampling_metadata.SamplingTensors.from_lists",
    #     SamplingTensors_from_lists
    # )
    # from vllm_vacc.vllm.model_executor.sampling_metadata import SamplingMetadata_from_sampling_metadata
    # vpm.register_patch(
    #     "vllm.model_executor.sampling_metadata.SamplingTensors.from_sampling_metadata",
    #     SamplingMetadata_from_sampling_metadata
    # )
    # from vllm_vacc.vllm.model_executor.layers.sampler import Sampler_forward
    # vpm.register_patch(
    #     "vllm.model_executor.layers.sampler.Sampler.forward",
    #     Sampler_forward
    # )
    # from vllm_vacc.vllm.model_executor.layers.sampler import rejection_forward
    # vpm.register_patch(
    #     "vllm.model_executor.layers.rejection_sampler.RejectionSampler.forward",
    #     rejection_forward
    # )
    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _verify_tokens
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker._verify_tokens",
    #     _verify_tokens
    # )
    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _maybe_log_stage_times
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker._maybe_log_stage_times",
    #     _maybe_log_stage_times
    # )
    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _run_no_spec
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker._run_no_spec",
    #     _run_no_spec
    # )
    # from vllm_vacc.vllm.model_executor.layers.sampler import _apply_top_k_top_p_vacc
    # vpm.register_patch(
    #     "vllm.model_executor.layers.sampler._apply_top_k_top_p",
    #     _apply_top_k_top_p_vacc
    # )

    # --- distributed communication patches (new attributes are created) ---
    from vllm_vacc.vllm.distributed.parallel_state import all_gather_to_rank0
    vpm.register_patch(
        "vllm.distributed.parallel_state.GroupCoordinator.all_gather_to_rank0",
        all_gather_to_rank0,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.distributed.device_communicators.base_device_communicator import all_gather_into_tensor
    vpm.register_patch(
        "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase.all_gather_into_tensor",
        all_gather_into_tensor,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.distributed.parallel_state import generate_group_id
    vpm.register_patch(
        "vllm.distributed.parallel_state.GroupCoordinator.generate_group_id",
        generate_group_id,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.distributed.parallel_state import generate_rank_device_infos
    vpm.register_patch(
        "vllm.distributed.parallel_state.GroupCoordinator.generate_rank_device_infos",
        generate_rank_device_infos,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.distributed.communication_op import tensor_model_parallel_all_reduce_with_odsp
    vpm.register_patch(
        "vllm.distributed.communication_op.tensor_model_parallel_all_reduce",
        tensor_model_parallel_all_reduce_with_odsp
    )

    # --- misc model constructor patches ---
    from vllm_vacc.vllm.model_executor.models.qwen3_moe import Qwen3MoeSparseMoeBlock__init__
    vpm.register_patch(
        "vllm.model_executor.models.qwen3_moe.Qwen3MoeSparseMoeBlock.__init__",
        Qwen3MoeSparseMoeBlock__init__
    )
    from vllm_vacc.vllm.model_executor.models.deepseek_mtp import DeepSeekMultiTokenPredictorLayer__init__
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer.__init__",
        DeepSeekMultiTokenPredictorLayer__init__
    )
    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _prepare_prefill_hidden_states
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.prepare_prefill_hidden_states",
    #     _prepare_prefill_hidden_states
    # )
    from vllm_vacc.vllm.executor.executor_base import execute_model_async
    vpm.register_patch(
        "vllm.executor.mp_distributed_executor.MultiprocessingDistributedExecutor.execute_model_async",
        execute_model_async,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.model_executor.layers.activation import QuickGELU_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.activation.QuickGELU.forward",
        QuickGELU_forward_vacc,
    )
def patch_vllm_v1(vpm: VllmPatchManager) -> None:
vpm.batch_register_patch(
[
"vllm.v1.core.block_pool.BlockPool.__init__",
"vllm.v1.core.block_pool.BlockPool.get_usage",
"vllm.v1.worker.block_table.BlockTable.__init__",
"vllm.v1.worker.gpu_input_batch.CachedRequestState",
"vllm.v1.core.single_type_kv_cache_manager.SingleTypeKVCacheManager.free",
"vllm.v1.core.single_type_kv_cache_manager.SingleTypeKVCacheManager.allocate_new_blocks",
"vllm.entrypoints.llm.LLM._validate_and_add_requests",
"vllm.entrypoints.openai.serving_completion.OpenAIServingCompletion.__init__",
"vllm.entrypoints.openai.serving_completion.OpenAIServingCompletion.create_completion",
"vllm.v1.engine.EngineCoreRequestType",
"vllm.v1.engine.EngineCoreRequest",
"vllm.v1.engine.EngineCoreOutputs",
"vllm.v1.engine.async_llm.AsyncLLM.from_vllm_config",
"vllm.v1.engine.async_llm.AsyncLLM.from_engine_args",
"vllm.v1.engine.async_llm.AsyncLLM._add_request",
"vllm.v1.engine.llm_engine.LLMEngine.from_vllm_config",
"vllm.v1.engine.llm_engine.LLMEngine.from_engine_args",
"vllm.v1.engine.processor.Processor.process_inputs",
"vllm.v1.engine.core.EngineCoreProc.process_input_sockets",
"vllm.v1.engine.core.preprocess_add_request",
"vllm.v1.metrics.loggers.LoggingStatLogger.log",
"vllm.v1.metrics.stats.SchedulerStats",
"vllm.v1.core.sched.scheduler.Scheduler.make_stats",
"vllm.v1.core.sched.scheduler.Scheduler.schedule",
"vllm.v1.core.sched.scheduler.Scheduler.__init__",
"vllm.v1.core.sched.scheduler.Scheduler.add_request",
"vllm.v1.core.sched.scheduler.Scheduler.finish_requests",
"vllm.v1.core.sched.output.NewRequestData",
"vllm.v1.request.Request",
# "vllm.v1.request.Request.from_engine_core_request", # AttributeError: 'Request' object has no attribute 'record_event'
"vllm.v1.sample.rejection_sampler.RejectionSampler.forward",
"vllm.v1.sample.sampler.Sampler.forward",
"vllm.v1.sample.metadata.SamplingMetadata",
"vllm.v1.spec_decode.eagle.EagleProposer.propose",
"vllm.v1.spec_decode.eagle.EagleProposer.prepare_next_token_ids_padded",
"vllm.v1.spec_decode.eagle.EagleProposer.prepare_inputs_padded",
"vllm.v1.spec_decode.eagle.EagleProposer.prepare_inputs",
"vllm.v1.core.kv_cache_utils.get_num_blocks",
"vllm.v1.core.kv_cache_utils.estimate_max_model_len",
"vllm.v1.core.kv_cache_utils.check_enough_kv_cache_memory",
"vllm.v1.worker.gpu_input_batch.InputBatch.__init__",
"vllm.v1.worker.gpu_input_batch.InputBatch.add_request",
"vllm.v1.worker.gpu_input_batch.InputBatch._make_sampling_metadata",
]
)
from vllm_vacc.vllm.v1.core.sched.scheduler import Scheduler
vpm.register_patch(
"vllm.v1.core.sched.scheduler.Scheduler._schedule_running_requests_for_mode",
Scheduler._schedule_running_requests_for_mode,
create_dummy=True,
allow_create=True
)
vpm.register_patch(
"vllm.v1.core.sched.scheduler.Scheduler._estimate_future_kv_tokens",
Scheduler._estimate_future_kv_tokens,
create_dummy=True,
allow_create=True
)
vpm.register_patch(
"vllm.v1.core.sched.scheduler.Scheduler._compute_total_future_kv_tokens",
Scheduler._compute_total_future_kv_tokens,
create_dummy=True,
allow_create=True
)
from vllm_vacc.vllm.engine.arg_utils import _set_default_args
vpm.register_patch(
"vllm.engine.arg_utils.EngineArgs._set_default_args",
_set_default_args,
)
from vllm_vacc.vllm.v1.engine.core_client import EngineCoreClient,SyncMPClient
vpm.register_patch(
"vllm.v1.engine.core_client.EngineCoreClient._send_input",
EngineCoreClient._send_input,
create_dummy=True,
allow_create=True
)
vpm.register_patch(
"vllm.v1.engine.core_client.EngineCoreClient.add_requests",
EngineCoreClient.add_requests,
create_dummy=True,
allow_create=True
)
vpm.register_patch(
"vllm.v1.engine.core_client.SyncMPClient.add_requests",
SyncMPClient.add_requests,
create_dummy=True,
allow_create=True
)
from vllm_vacc.vllm.v1.engine.llm_engine import LLMEngine
vpm.register_patch(
"vllm.v1.engine.llm_engine.LLMEngine.add_requests",
LLMEngine.add_requests,
create_dummy=True,
allow_create=True
)
from vllm_vacc.vllm.v1.engine.core import EngineCore
vpm.register_patch(
"vllm.v1.engine.core.EngineCore._initialize_kv_caches",
EngineCore._initialize_kv_caches,
create_dummy=True,
allow_create=True
)
from vllm_vacc.vllm.v1.executor.abstract import Executor
vpm.register_patch(
"vllm.v1.executor.abstract.Executor.determine_available_memory_block",
Executor.determine_available_memory_block,
create_dummy=True,
allow_create=True
)
from vllm_vacc.vllm.v1.spec_decode.eagle import EagleProposer_init_
vpm.register_patch(
"vllm.v1.spec_decode.eagle.EagleProposer.__init__",
EagleProposer_init_
)