"""Monkey-patch registry wiring vLLM onto the vacc backend.

This module swaps selected vLLM classes/functions for their ``vllm_vacc``
counterparts.  It exposes:

* :func:`patch_block_manager` -- legacy (v0) block-manager replacement.
* :func:`patch_vllm_envs`     -- environment-variable overrides (run at import).
* :class:`VllmPatchManager`   -- PatchManager subclass with its own patch table.
* :func:`patch_torch`         -- neutralize ``torch.compile``.
* :func:`regist_mock_module`  -- stub out ``flash_attn`` so imports succeed.
* :func:`patch_vllm` / :func:`patch_vllm_v1` -- bulk patch registration for the
  v0 and v1 engine code paths respectively.
"""

from vllm_vacc.patch_util import PatchManager


def patch_block_manager():
    """Replace vLLM's (v0) block-management classes with vacc variants.

    Imports are done lazily so that merely importing this module does not
    pull in the heavy vllm.core machinery.
    """
    # block table manager
    from vllm_vacc.vllm.core.block.block_table import BlockTable as VaccBlockTable
    import vllm.core.block.block_table
    setattr(vllm.core.block.block_table, "BlockTable", VaccBlockTable)

    # cpu_gpu_block_allocator
    from vllm_vacc.vllm.core.block.cpu_gpu_block_allocator \
        import CpuGpuBlockAllocator as VaccCpuGpuBlockAllocator
    import vllm.core.block.cpu_gpu_block_allocator
    setattr(vllm.core.block.cpu_gpu_block_allocator, "CpuGpuBlockAllocator",
            VaccCpuGpuBlockAllocator)

    # naive block allocator
    from vllm_vacc.vllm.core.block.naive_block \
        import NaiveBlockAllocator as VaccNaiveBlockAllocator
    import vllm.core.block.naive_block
    setattr(vllm.core.block.naive_block, "NaiveBlockAllocator",
            VaccNaiveBlockAllocator)
    # setattr(vllm.core.block.naive_block.NaiveBlockAllocator, "partition_blocks",
    #         VaccNaiveBlockAllocator.partition_blocks)

    # block manager
    from vllm_vacc.vllm.core.block_manager \
        import SelfAttnBlockSpaceManager as VaccSelfAttnBlockSpaceManager
    import vllm.core.block_manager
    setattr(vllm.core.block_manager, "SelfAttnBlockSpaceManager",
            VaccSelfAttnBlockSpaceManager)

    # Prefix-caching replacements are currently disabled:
    # from vllm_vacc.vllm.core.block.prefix_caching_block \
    #     import PrefixCachingBlockAllocator as VaccPrefixCachingBlockAllocator
    # import vllm.core.block.prefix_caching_block
    # setattr(vllm.core.block.prefix_caching_block, "PrefixCachingBlockAllocator",
    #         VaccPrefixCachingBlockAllocator)
    # from vllm_vacc.vllm.core.block.prefix_caching_block \
    #     import PrefixCachingBlock as VaccPrefixCachingBlock
    # import vllm.core.block.prefix_caching_block
    # setattr(vllm.core.block.prefix_caching_block, "PrefixCachingBlock",
    #         VaccPrefixCachingBlock)


def patch_vllm_envs():
    """Override vLLM environment defaults for the vacc backend.

    Raises ValueError if VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS is set to a
    non-integer value.
    """
    import os
    import vllm.envs as env
    # Default execute-model (prefill) timeout raised to 3600 s; still
    # overridable through the environment variable of the same name.
    env.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS = int(
        os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "3600"))


# patch_block_manager()
patch_vllm_envs()

# When True, the w2 weights of MLP/MoE layers are stored transposed
# (see the conditional __init__ patches in patch_vllm below).
TRANSPOSE_MLP_MOE_W2 = True


class VllmPatchManager(PatchManager):
    """PatchManager specialization that keeps vLLM patches in its own table."""

    # Registry of applied vLLM patches, separate from the parent class table.
    vllm_patch_info: dict = {}

    @classmethod
    def get_patch_info(cls):
        # Override to return this subclass's own patch table instead of
        # the parent's.
        return cls.vllm_patch_info


def patch_torch():
    """Disable ``torch.compile`` by replacing it with a no-op.

    The replacement supports both calling conventions:

    * direct / bare-decorator use: ``torch.compile(fn)`` or ``@torch.compile``
      returns *fn* unchanged;
    * factory use: ``@torch.compile(backend=...)`` returns an identity
      decorator.

    The original implementation always returned a decorator, so a directly
    compiled callable yielded an inner wrapper function instead of the
    model's output.
    """
    import torch

    def _noop_compile(model=None, *args, **kwargs):
        if model is not None:
            # torch.compile(fn) / @torch.compile -- hand the callable back.
            return model

        def decorator(fn):
            # @torch.compile(**options) -- identity decorator.
            return fn
        return decorator

    # Disable torch.compile.
    torch.compile = _noop_compile


def regist_mock_module():
    """Install a stub ``flash_attn`` module so downstream imports succeed.

    Any attribute access on the stub yields another stub; calling a stub
    prints a notice and returns 0.
    """
    import sys
    import types

    class _mock_object(object):
        def __init__(self, name=""):
            self.name = name

        def __getattr__(self, item):
            # Arbitrary attribute chains resolve to fresh mocks.
            return _mock_object(f"{self.name}.{item}")

        def __call__(self, *args, **kwargs):
            print(f"mock module: {self.name}")
            return 0

    sys.modules['flash_attn'] = types.ModuleType('flash_attn')
    # NOTE(review): __spec__ is set to False (not None) -- presumably to
    # defeat "has the module really been imported" checks; confirm before
    # changing.
    sys.modules['flash_attn'].__spec__ = False
    setattr(sys.modules['flash_attn'], "flash_attn_varlen_func",
            _mock_object("flash_attn_varlen_func"))


def patch_vllm(vpm: VllmPatchManager) -> None:
    """Register all v0-engine patches with *vpm*.

    Names passed to ``batch_register_patch`` are resolved to their
    ``vllm_vacc`` counterparts by the patch manager; explicit
    ``register_patch`` calls below supply the replacement object directly.
    """
    vpm.batch_register_patch(
        [
            "vllm.model_executor.custom_op.CustomOp.dispatch_forward",
            "vllm.distributed.parallel_state.GroupCoordinator.all_reduce",
            "vllm.distributed.parallel_state.GroupCoordinator.broadcast_tensor_dict",
            "vllm.distributed.parallel_state.GroupCoordinator.all_gather",
            "vllm.distributed.parallel_state.GroupCoordinator.recv_tensor_dict",
            "vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod.apply",
            "vllm.sequence.SequenceData.append_token_id",
            # "vllm.attention.backends.mla.utils.MLACommonImpl.process_weights_after_loading",
            "vllm.attention.backends.mla.common.MLACommonImpl.__init__",
            "vllm.attention.backends.mla.common.MLACommonImpl.forward",
            "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.determine_num_available_blocks",
            "vllm.spec_decode.top1_proposer.Top1Proposer.get_spec_proposals",
            "vllm.spec_decode.multi_step_worker.MultiStepWorker.sampler_output",
            "vllm.inputs.preprocess.InputPreprocessor._process_embeds",
            "vllm.inputs.data.EmbedsPrompt",
            "vllm.inputs.data.EmbedsInputs",
            "vllm.inputs.data.embeds_inputs",
            "vllm.entrypoints.renderer.BaseRenderer.load_prompt_embeds",
            "vllm.entrypoints.renderer.CompletionRenderer.render_prompt_and_embeds",
            # "vllm.model_executor.layers.sampler.Sampler.forward",
            "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.create_weights",
            "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.apply",
            "vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod.create_weights",
            "vllm.model_executor.layers.linear.MergedColumnParallelLinear.weight_loader_v2",
            "vllm.model_executor.layers.linear.UnquantizedLinearMethod.apply",
            "vllm.model_executor.layers.quantization.gptq.GPTQConfig.get_supported_act_dtypes",
            "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.process_weights_after_loading",
            "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.apply",
            "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.create_weights",
            "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.process_weights_after_loading",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE._load_w13",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE._load_w2",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE.weight_loader",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE._load_model_weight_or_group_weight_scale",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MoE.forward",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MLP.forward",
            # "vllm.model_executor.models.deepseek_v2.DeepseekV2DecoderLayer.forward",
            "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2ForCausalLM.load_weights",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2ForCausalLM.forward",
            "vllm.model_executor.models.deepseek_mtp.DeepSeekMTP.load_weights",
            "vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer.forward",
            "vllm.model_executor.models.deepseek_mtp.SharedHead.forward",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2Model.forward",
            "vllm.core.block.common.BlockList.append_token_ids",
            "vllm.core.block.naive_block.NaiveBlock.append_token_ids",
            "vllm.model_executor.layers.sampler.SamplerOutput",
            # "vllm.core.scheduler.Scheduler._schedule_prefills",
            "vllm.engine.multiprocessing.engine.run_mp_engine",
            "vllm.engine.llm_engine.LLMEngine.from_engine_args",
            "vllm.engine.multiprocessing.engine.MQLLMEngine._handle_process_request",
            "vllm.engine.multiprocessing.engine.MQLLMEngine._handle_abort_request",
            "vllm.engine.metrics.LoggingStatLogger.log",
            "vllm.entrypoints.openai.serving_engine.OpenAIServing._validate_input",
            "vllm.entrypoints.openai.serving_engine.OpenAIServing._log_inputs",
            "vllm.entrypoints.openai.serving_engine.EmbedsPrompt",
            "vllm.reasoning.deepseek_r1_reasoning_parser.DeepSeekR1ReasoningParser",
            "vllm.model_executor.layers.fused_moe.fused_moe.fused_topk",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeDecoderLayer.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeAttention.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeModel.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeForCausalLM.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeForCausalLM.load_weights",
            "vllm.model_executor.models.qwen2.Qwen2DecoderLayer.forward",
            "vllm.model_executor.models.qwen2.Qwen2MLP.forward",
            "vllm.model_executor.models.qwen2.Qwen2Attention.forward",
            "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionAttention.split_qkv",
            "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionPatchEmbed.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionPatchEmbed.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.rot_pos_emb",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.fast_pos_embed_interpolate",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionBlock.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionPatchMerger.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionMLP.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3VLForConditionalGeneration.get_input_embeddings",
            "vllm.model_executor.models.qwen3_vl.Qwen3VLForConditionalGeneration._clear_deepstack_input_embeds",
            # "vllm.model_executor.models.qwen3_vl.Qwen3VLProcessingInfo.get_hf_processor",
            # "vllm.model_executor.models.qwen3_vl.Qwen3VLProcessingInfo.get_image_processor",
            # "vllm.model_executor.models.qwen2_vl.Qwen2VLProcessingInfo.get_hf_processor",
            # "vllm.model_executor.models.qwen2_vl.Qwen2VLProcessingInfo.get_image_processor",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionAttention.__init__",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionAttention.split_qkv",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionMLP.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionPatchMerger.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionPatchEmbed.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionTransformer.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionBlock.forward",
            "vllm.model_executor.layers.rotary_embedding.mrope.MRotaryEmbedding._qwen3vl_get_input_positions_tensor",
            "vllm.spec_decode.metrics.AsyncMetricsCollector._copy_rejsample_metrics_async",
            "vllm.model_executor.layers.quantization.moe_wna16.MoeWNA16Method.create_weights",
            "vllm.model_executor.layers.quantization.moe_wna16.MoeWNA16Method.process_weights_after_loading",
            "vllm.model_executor.layers.pooler.ClassifierPooler.forward",
            "vllm.model_executor.layers.pooler.PoolerNormalize.forward_chunk",
            "vllm.model_executor.models.roberta.RobertaEmbedding.forward",
            "vllm.model_executor.models.bert.BertLayer.forward",
            "vllm.model_executor.models.qwen3.Qwen3DecoderLayer.forward",
            "vllm.model_executor.models.qwen3.Qwen3Attention.forward",
        ]
    )

    # Qwen3-VL-MoE reuses the (already patched) Qwen3MoeModel forward.
    from vllm_vacc.vllm.model_executor.models.qwen3_moe import Qwen3MoeModel
    vpm.register_patch(
        "vllm.model_executor.models.qwen3_vl_moe.Qwen3MoeLLMModel.forward",
        Qwen3MoeModel.forward,
    )

    from vllm_vacc.vllm.attention.backends.mla.utils import MLACommonImpl
    vpm.register_patch(
        # "vllm.v1.attention.backends.mla.common.MLACommonImpl.process_weights_after_loading",
        "vllm.v1.attention.backends.mla.common.MLACommonImpl.process_weights_after_loading",
        MLACommonImpl.process_weights_after_loading,
    )

    # Rotary-embedding patches.  _compute_inv_freq_vacc serves both the
    # Deepseek and YaRN scaling variants, so it is imported once.
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _compute_inv_freq_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.DeepseekScalingRotaryEmbedding._compute_inv_freq",
        _compute_inv_freq_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _deepseek_compute_cos_sin_cache_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.DeepseekScalingRotaryEmbedding._compute_cos_sin_cache",
        _deepseek_compute_cos_sin_cache_vacc,
    )
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.YaRNScalingRotaryEmbedding._compute_inv_freq",
        _compute_inv_freq_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _yarn_compute_cos_sin_cache_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.YaRNScalingRotaryEmbedding._compute_cos_sin_cache",
        _yarn_compute_cos_sin_cache_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _compute_cos_sin_cache_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.RotaryEmbedding._compute_cos_sin_cache",
        _compute_cos_sin_cache_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import RotaryEmbedding_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.RotaryEmbedding.forward",
        RotaryEmbedding_forward_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import RotaryEmbedding_init_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.RotaryEmbedding.__init__",
        RotaryEmbedding_init_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import ScalingRotaryEmbedding_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.DeepseekScalingRotaryEmbedding.forward",
        ScalingRotaryEmbedding_forward_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding_forward
    vpm.register_patch(
        "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.forward",
        VocabParallelEmbedding_forward,
    )
    from vllm_vacc.vllm.model_executor.layers.activation import SiluAndMul_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.activation.SiluAndMul.forward",
        SiluAndMul_forward_vacc,
    )
    from vllm_vacc.vllm.model_executor.layers.layernorm import RMSNorm_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.layernorm.RMSNorm.forward",
        RMSNorm_forward_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.fused_moe.layer import FusedMoE_init_
    vpm.register_patch(
        "vllm.model_executor.layers.fused_moe.layer.FusedMoE.__init__",
        FusedMoE_init_
    )
    from vllm_vacc.vllm.model_executor.layers.quantization.fp8 import Fp8MoEMethod_init_
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod.__init__",
        Fp8MoEMethod_init_
    )
    from vllm_vacc.vllm.model_executor.layers.quantization.fp8 import moe_fp8_apply
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod.apply",
        moe_fp8_apply
    )
    from vllm_vacc.vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod__init
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.__init__",
        Fp8LinearMethod__init
    )

    from vllm_vacc.vllm.config import ModelConfig___verify_quantization
    vpm.register_patch(
        "vllm.config.ModelConfig._verify_quantization",
        ModelConfig___verify_quantization
    )
    from vllm_vacc.vllm.config import _get_head_dtype
    vpm.register_patch(
        "vllm.config.model._get_head_dtype",
        _get_head_dtype
    )
    from vllm_vacc.vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.QUANTIZATION_METHODS",
        QUANTIZATION_METHODS
    )
    from vllm_vacc.vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod__init
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.__init__",
        GPTQLinearMethod__init
    )

    # Linear-layer constructors and weight loaders.
    from vllm_vacc.vllm.model_executor.layers.linear import ReplicatedLinear__init__
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ReplicatedLinear.__init__",
        ReplicatedLinear__init__
    )
    from vllm_vacc.vllm.model_executor.layers.linear import ReplicatedLinear_weight_loader
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ReplicatedLinear.weight_loader",
        ReplicatedLinear_weight_loader
    )
    from vllm_vacc.vllm.model_executor.layers.linear import ColumnParallelLinear__init__
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ColumnParallelLinear.__init__",
        ColumnParallelLinear__init__
    )
    from vllm_vacc.vllm.model_executor.layers.linear import ColumnParallelLinear_weight_loader_v2
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ColumnParallelLinear.weight_loader_v2",
        ColumnParallelLinear_weight_loader_v2
    )
    from vllm_vacc.vllm.model_executor.layers.linear import RowParallelLinear__init__
    vpm.register_patch(
        "vllm.model_executor.layers.linear.RowParallelLinear.__init__",
        RowParallelLinear__init__
    )
    from vllm_vacc.vllm.model_executor.layers.linear import RowParallelLinear_weight_loader_v2_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.linear.RowParallelLinear.weight_loader_v2",
        RowParallelLinear_weight_loader_v2_vacc
    )

    # DeepSeek-V2 fused MLA attention.
    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2MLAAttention_init__
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2MLAAttention.__init__",
        DeepseekV2MLAAttention_init__
    )
    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2MLAAttention_forward
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2MLAAttention.forward",
        DeepseekV2MLAAttention_forward
    )
    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import merge_qkv_weights
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2MLAAttention.merge_qkv_weights",
        merge_qkv_weights,
        create_dummy=True,
        allow_create=True
    )
    # for input_layernorm fuse
    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2DecoderLayer_forward
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2DecoderLayer.forward",
        DeepseekV2DecoderLayer_forward
    )

    if TRANSPOSE_MLP_MOE_W2:
        from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2MLP__init__, DeepseekV2MoE__init__
        vpm.register_patch(
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MLP.__init__",
            DeepseekV2MLP__init__
        )
        vpm.register_patch(
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MoE.__init__",
            DeepseekV2MoE__init__
        )
        # NOTE(review): indentation was ambiguous in the original source;
        # the Qwen2 MLP __init__ patch is grouped under this flag because it
        # belongs to the same MLP-w2 category -- confirm.
        from vllm_vacc.vllm.model_executor.models.qwen2 import Qwen2MLP__init__
        vpm.register_patch(
            "vllm.model_executor.models.qwen2.Qwen2MLP.__init__",
            Qwen2MLP__init__
        )

    from vllm_vacc.vllm._custom_ops import cutlass_scaled_mm_vacc
    vpm.register_patch(
        "vllm._custom_ops.cutlass_scaled_mm",
        cutlass_scaled_mm_vacc
    )
    from vllm_vacc.vllm._custom_ops import concat_and_cache_mla
    vpm.register_patch(
        "vllm._custom_ops.concat_and_cache_mla",
        concat_and_cache_mla
    )
    from vllm_vacc.vllm.model_executor.layers.quantization.utils.fp8_utils import _apply_w8a8_block_fp8_linear
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.utils.fp8_utils.apply_w8a8_block_fp8_linear",
        _apply_w8a8_block_fp8_linear
    )
    from vllm_vacc.vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk_with_itype
    vpm.register_patch(
        "vllm.model_executor.layers.fused_moe.fused_moe.grouped_topk",
        grouped_topk_with_itype
    )

    # Sampling / spec-decode patches currently disabled:
    # from vllm_vacc.vllm.model_executor.sampling_metadata import SamplingMetadata_prepare
    # vpm.register_patch(
    #     "vllm.v1.sample.sampling_metadata.SamplingMetadata.prepare",
    #     SamplingMetadata_prepare
    # )
    # from vllm_vacc.vllm.model_executor.sampling_metadata import SamplingTensors_from_lists
    # vpm.register_patch(
    #     "vllm.model_executor.sampling_metadata.SamplingTensors.from_lists",
    #     SamplingTensors_from_lists
    # )
    # from vllm_vacc.vllm.model_executor.sampling_metadata import SamplingMetadata_from_sampling_metadata
    # vpm.register_patch(
    #     "vllm.model_executor.sampling_metadata.SamplingTensors.from_sampling_metadata",
    #     SamplingMetadata_from_sampling_metadata
    # )
    # from vllm_vacc.vllm.model_executor.layers.sampler import Sampler_forward
    # vpm.register_patch(
    #     "vllm.model_executor.layers.sampler.Sampler.forward",
    #     Sampler_forward
    # )
    # from vllm_vacc.vllm.model_executor.layers.sampler import rejection_forward
    # vpm.register_patch(
    #     "vllm.model_executor.layers.rejection_sampler.RejectionSampler.forward",
    #     rejection_forward
    # )
    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _verify_tokens
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker._verify_tokens",
    #     _verify_tokens
    # )
    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _maybe_log_stage_times
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker._maybe_log_stage_times",
    #     _maybe_log_stage_times
    # )
    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _run_no_spec
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker._run_no_spec",
    #     _run_no_spec
    # )
    # from vllm_vacc.vllm.model_executor.layers.sampler import _apply_top_k_top_p_vacc
    # vpm.register_patch(
    #     "vllm.model_executor.layers.sampler._apply_top_k_top_p",
    #     _apply_top_k_top_p_vacc
    # )

    # Distributed collectives; create_dummy/allow_create because these
    # attributes do not exist in upstream vLLM and are added here.
    from vllm_vacc.vllm.distributed.parallel_state import all_gather_to_rank0
    vpm.register_patch(
        "vllm.distributed.parallel_state.GroupCoordinator.all_gather_to_rank0",
        all_gather_to_rank0,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.distributed.device_communicators.base_device_communicator import all_gather_into_tensor
    vpm.register_patch(
        "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase.all_gather_into_tensor",
        all_gather_into_tensor,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.distributed.parallel_state import generate_group_id
    vpm.register_patch(
        "vllm.distributed.parallel_state.GroupCoordinator.generate_group_id",
        generate_group_id,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.distributed.parallel_state import generate_rank_device_infos
    vpm.register_patch(
        "vllm.distributed.parallel_state.GroupCoordinator.generate_rank_device_infos",
        generate_rank_device_infos,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.distributed.communication_op import tensor_model_parallel_all_reduce_with_odsp
    vpm.register_patch(
        "vllm.distributed.communication_op.tensor_model_parallel_all_reduce",
        tensor_model_parallel_all_reduce_with_odsp
    )

    from vllm_vacc.vllm.model_executor.models.qwen3_moe import Qwen3MoeSparseMoeBlock__init__
    vpm.register_patch(
        "vllm.model_executor.models.qwen3_moe.Qwen3MoeSparseMoeBlock.__init__",
        Qwen3MoeSparseMoeBlock__init__
    )
    from vllm_vacc.vllm.model_executor.models.deepseek_mtp import DeepSeekMultiTokenPredictorLayer__init__
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer.__init__",
        DeepSeekMultiTokenPredictorLayer__init__
    )
    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _prepare_prefill_hidden_states
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.prepare_prefill_hidden_states",
    #     _prepare_prefill_hidden_states
    # )

    from vllm_vacc.vllm.executor.executor_base import execute_model_async
    vpm.register_patch(
        "vllm.executor.mp_distributed_executor.MultiprocessingDistributedExecutor.execute_model_async",
        execute_model_async,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.model_executor.layers.activation import QuickGELU_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.activation.QuickGELU.forward",
        QuickGELU_forward_vacc,
    )


def patch_vllm_v1(vpm: VllmPatchManager) -> None:
    """Register all v1-engine patches with *vpm*."""
    vpm.batch_register_patch(
        [
            "vllm.v1.core.block_pool.BlockPool.__init__",
            "vllm.v1.core.block_pool.BlockPool.get_usage",
            "vllm.v1.worker.block_table.BlockTable.__init__",
            "vllm.v1.worker.gpu_input_batch.CachedRequestState",
            "vllm.v1.core.single_type_kv_cache_manager.SingleTypeKVCacheManager.free",
            "vllm.v1.core.single_type_kv_cache_manager.SingleTypeKVCacheManager.allocate_new_blocks",
            "vllm.entrypoints.llm.LLM._validate_and_add_requests",
            "vllm.entrypoints.openai.serving_completion.OpenAIServingCompletion.__init__",
            "vllm.entrypoints.openai.serving_completion.OpenAIServingCompletion.create_completion",
            "vllm.v1.engine.EngineCoreRequestType",
            "vllm.v1.engine.EngineCoreRequest",
            "vllm.v1.engine.EngineCoreOutputs",
            "vllm.v1.engine.async_llm.AsyncLLM.from_vllm_config",
            "vllm.v1.engine.async_llm.AsyncLLM.from_engine_args",
            "vllm.v1.engine.async_llm.AsyncLLM._add_request",
            "vllm.v1.engine.llm_engine.LLMEngine.from_vllm_config",
            "vllm.v1.engine.llm_engine.LLMEngine.from_engine_args",
            "vllm.v1.engine.processor.Processor.process_inputs",
            "vllm.v1.engine.core.EngineCoreProc.process_input_sockets",
            "vllm.v1.engine.core.preprocess_add_request",
            "vllm.v1.metrics.loggers.LoggingStatLogger.log",
            "vllm.v1.metrics.stats.SchedulerStats",
            "vllm.v1.core.sched.scheduler.Scheduler.make_stats",
            "vllm.v1.core.sched.scheduler.Scheduler.schedule",
            "vllm.v1.core.sched.scheduler.Scheduler.__init__",
            "vllm.v1.core.sched.scheduler.Scheduler.add_request",
            "vllm.v1.core.sched.scheduler.Scheduler.finish_requests",
            "vllm.v1.core.sched.output.NewRequestData",
            "vllm.v1.request.Request",
            # "vllm.v1.request.Request.from_engine_core_request",
            # AttributeError: 'Request' object has no attribute 'record_event'
            "vllm.v1.sample.rejection_sampler.RejectionSampler.forward",
            "vllm.v1.sample.sampler.Sampler.forward",
            "vllm.v1.sample.metadata.SamplingMetadata",
            "vllm.v1.spec_decode.eagle.EagleProposer.propose",
            "vllm.v1.spec_decode.eagle.EagleProposer.prepare_next_token_ids_padded",
            "vllm.v1.spec_decode.eagle.EagleProposer.prepare_inputs_padded",
            "vllm.v1.spec_decode.eagle.EagleProposer.prepare_inputs",
            "vllm.v1.core.kv_cache_utils.get_num_blocks",
            "vllm.v1.core.kv_cache_utils.estimate_max_model_len",
            "vllm.v1.core.kv_cache_utils.check_enough_kv_cache_memory",
            "vllm.v1.worker.gpu_input_batch.InputBatch.__init__",
            "vllm.v1.worker.gpu_input_batch.InputBatch.add_request",
            "vllm.v1.worker.gpu_input_batch.InputBatch._make_sampling_metadata",
        ]
    )

    # Scheduler helpers added by the vacc backend (create_dummy/allow_create:
    # these methods do not exist upstream).
    from vllm_vacc.vllm.v1.core.sched.scheduler import Scheduler
    vpm.register_patch(
        "vllm.v1.core.sched.scheduler.Scheduler._schedule_running_requests_for_mode",
        Scheduler._schedule_running_requests_for_mode,
        create_dummy=True,
        allow_create=True
    )
    vpm.register_patch(
        "vllm.v1.core.sched.scheduler.Scheduler._estimate_future_kv_tokens",
        Scheduler._estimate_future_kv_tokens,
        create_dummy=True,
        allow_create=True
    )
    vpm.register_patch(
        "vllm.v1.core.sched.scheduler.Scheduler._compute_total_future_kv_tokens",
        Scheduler._compute_total_future_kv_tokens,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.engine.arg_utils import _set_default_args
    vpm.register_patch(
        "vllm.engine.arg_utils.EngineArgs._set_default_args",
        _set_default_args,
    )

    from vllm_vacc.vllm.v1.engine.core_client import EngineCoreClient, SyncMPClient
    vpm.register_patch(
        "vllm.v1.engine.core_client.EngineCoreClient._send_input",
        EngineCoreClient._send_input,
        create_dummy=True,
        allow_create=True
    )
    vpm.register_patch(
        "vllm.v1.engine.core_client.EngineCoreClient.add_requests",
        EngineCoreClient.add_requests,
        create_dummy=True,
        allow_create=True
    )
    vpm.register_patch(
        "vllm.v1.engine.core_client.SyncMPClient.add_requests",
        SyncMPClient.add_requests,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.v1.engine.llm_engine import LLMEngine
    vpm.register_patch(
        "vllm.v1.engine.llm_engine.LLMEngine.add_requests",
        LLMEngine.add_requests,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.v1.engine.core import EngineCore
    vpm.register_patch(
        "vllm.v1.engine.core.EngineCore._initialize_kv_caches",
        EngineCore._initialize_kv_caches,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.v1.executor.abstract import Executor
    vpm.register_patch(
        "vllm.v1.executor.abstract.Executor.determine_available_memory_block",
        Executor.determine_available_memory_block,
        create_dummy=True,
        allow_create=True
    )
    from vllm_vacc.vllm.v1.spec_decode.eagle import EagleProposer_init_
    vpm.register_patch(
        "vllm.v1.spec_decode.eagle.EagleProposer.__init__",
        EagleProposer_init_
    )