from vllm_vacc.patch_util import PatchManager


def patch_block_manager():
    # block table manager
    from vllm_vacc.vllm.core.block.block_table import BlockTable as VaccBlockTable
    import vllm.core.block.block_table
    setattr(vllm.core.block.block_table, "BlockTable", VaccBlockTable)

    # cpu_gpu_block_allocator
    from vllm_vacc.vllm.core.block.cpu_gpu_block_allocator \
        import CpuGpuBlockAllocator as VaccCpuGpuBlockAllocator
    import vllm.core.block.cpu_gpu_block_allocator
    setattr(vllm.core.block.cpu_gpu_block_allocator, "CpuGpuBlockAllocator", VaccCpuGpuBlockAllocator)

    # naive block allocator
    from vllm_vacc.vllm.core.block.naive_block \
        import NaiveBlockAllocator as VaccNaiveBlockAllocator
    import vllm.core.block.naive_block
    setattr(vllm.core.block.naive_block, "NaiveBlockAllocator", VaccNaiveBlockAllocator)
    # setattr(vllm.core.block.naive_block.NaiveBlockAllocator, "partition_blocks", VaccNaiveBlockAllocator.partition_blocks)

    # block manager
    from vllm_vacc.vllm.core.block_manager \
        import SelfAttnBlockSpaceManager as VaccSelfAttnBlockSpaceManager
    import vllm.core.block_manager
    setattr(vllm.core.block_manager, "SelfAttnBlockSpaceManager", VaccSelfAttnBlockSpaceManager)

    # from vllm_vacc.vllm.core.block.prefix_caching_block \
    #     import PrefixCachingBlockAllocator as VaccPrefixCachingBlockAllocator
    # import vllm.core.block.prefix_caching_block
    # setattr(vllm.core.block.prefix_caching_block, "PrefixCachingBlockAllocator", VaccPrefixCachingBlockAllocator)

    # from vllm_vacc.vllm.core.block.prefix_caching_block \
    #     import PrefixCachingBlock as VaccPrefixCachingBlock
    # import vllm.core.block.prefix_caching_block
    # setattr(vllm.core.block.prefix_caching_block, "PrefixCachingBlock", VaccPrefixCachingBlock)

def patch_vllm_envs():
    import os
    import vllm.envs as env
    # Raise the execute-model timeout for long prefills; defaults to 3600 s (1 hour),
    # overridable via the environment variable of the same name.
    env.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS = int(os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", 3600))

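# Usage note: the timeout can be raised further without a code change, e.g.
# `export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=7200` before launching the server;
# the 3600 in the call above is only the fallback default.
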
# patch_block_manager()
patch_vllm_envs()

TRANSPOSE_MLP_MOE_W2 = True


class VllmPatchManager(PatchManager):
    """vllm Patch Manager"""

    vllm_patch_info: dict = {}

    @classmethod
    def get_patch_info(cls):
        # Override to return child_patch_info
        return cls.vllm_patch_info

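# Usage sketch (hypothetical, not part of the original code): keeping the patch
# registry on the subclass lets callers inspect what was applied, assuming
# PatchManager records each registration in vllm_patch_info:
#
#     vpm = VllmPatchManager()
#     patch_vllm(vpm)
#     print(VllmPatchManager.get_patch_info())
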
def patch_torch():
    import torch

    def null_decorator(*args, **kwargs):
        # When called directly as torch.compile(fn), hand the callable back untouched.
        if len(args) == 1 and callable(args[0]) and not kwargs:
            return args[0]

        # When used as a decorator factory, e.g. @torch.compile(dynamic=True),
        # return a pass-through decorator.
        def decorator(func):
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)
            return wrapper
        return decorator

    # disable torch.compile
    torch.compile = null_decorator

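# Illustrative only: once patch_torch() has run, torch.compile is a no-op, so
# both decorator and direct-call forms execute eagerly with no graph capture:
#
#     @torch.compile(dynamic=True)
#     def f(x):
#         return x + 1
#
#     g = torch.compile(lambda x: x * 2)   # returns the callable unchanged
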
def regist_mock_module():
    import sys
    import types

    class _mock_object(object):
        def __init__(self, name=""):
            self.name = name

        def __getattr__(self, item):
            return _mock_object(f"{self.name}.{item}")

        def __call__(self, *args, **kwargs):
            print(f"mock module: {self.name}")
            return 0

    sys.modules['flash_attn'] = types.ModuleType('flash_attn')
    sys.modules['flash_attn'].__spec__ = False
    setattr(sys.modules['flash_attn'], "flash_attn_varlen_func", _mock_object("flash_attn_varlen_func"))

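# Illustrative only: after regist_mock_module(), `import flash_attn` succeeds and
# flash_attn.flash_attn_varlen_func is a chainable _mock_object that prints its
# dotted name and returns 0 when called:
#
#     regist_mock_module()
#     import flash_attn
#     flash_attn.flash_attn_varlen_func(None, None, None)   # prints "mock module: flash_attn_varlen_func"
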
def patch_vllm(vpm: VllmPatchManager) -> None:

    vpm.batch_register_patch(
        [
            "vllm.model_executor.custom_op.CustomOp.dispatch_forward",
            "vllm.distributed.parallel_state.GroupCoordinator.all_reduce",
            "vllm.distributed.parallel_state.GroupCoordinator.broadcast_tensor_dict",
            "vllm.distributed.parallel_state.GroupCoordinator.all_gather",
            "vllm.distributed.parallel_state.GroupCoordinator.recv_tensor_dict",
            "vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod.apply",
            "vllm.sequence.SequenceData.append_token_id",
            # "vllm.attention.backends.mla.utils.MLACommonImpl.process_weights_after_loading",
            "vllm.attention.backends.mla.common.MLACommonImpl.__init__",
            "vllm.attention.backends.mla.common.MLACommonImpl.forward",
            "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.determine_num_available_blocks",
            "vllm.spec_decode.top1_proposer.Top1Proposer.get_spec_proposals",
            "vllm.spec_decode.multi_step_worker.MultiStepWorker.sampler_output",
            "vllm.inputs.preprocess.InputPreprocessor._process_embeds",
            "vllm.inputs.data.EmbedsPrompt",
            "vllm.inputs.data.EmbedsInputs",
            "vllm.inputs.data.embeds_inputs",
            "vllm.entrypoints.renderer.BaseRenderer.load_prompt_embeds",
            "vllm.entrypoints.renderer.CompletionRenderer.render_prompt_and_embeds",
            # "vllm.model_executor.layers.sampler.Sampler.forward",
            "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.create_weights",
            "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.apply",
            "vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod.create_weights",
            "vllm.model_executor.layers.linear.MergedColumnParallelLinear.weight_loader_v2",
            "vllm.model_executor.layers.linear.UnquantizedLinearMethod.apply",
            "vllm.model_executor.layers.quantization.gptq.GPTQConfig.get_supported_act_dtypes",
            "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.process_weights_after_loading",
            "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.apply",
            "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.create_weights",
            "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.process_weights_after_loading",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE._load_w13",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE._load_w2",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE.weight_loader",
            "vllm.model_executor.layers.fused_moe.layer.FusedMoE._load_model_weight_or_group_weight_scale",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MoE.forward",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MLP.forward",
            # "vllm.model_executor.models.deepseek_v2.DeepseekV2DecoderLayer.forward",
            "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2ForCausalLM.load_weights",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2ForCausalLM.forward",
            "vllm.model_executor.models.deepseek_mtp.DeepSeekMTP.load_weights",
            "vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer.forward",
            "vllm.model_executor.models.deepseek_mtp.SharedHead.forward",
            "vllm.model_executor.models.deepseek_v2.DeepseekV2Model.forward",
            "vllm.core.block.common.BlockList.append_token_ids",
            "vllm.core.block.naive_block.NaiveBlock.append_token_ids",
            "vllm.model_executor.layers.sampler.SamplerOutput",
            # "vllm.core.scheduler.Scheduler._schedule_prefills",
            "vllm.engine.multiprocessing.engine.run_mp_engine",
            "vllm.engine.llm_engine.LLMEngine.from_engine_args",
            "vllm.engine.multiprocessing.engine.MQLLMEngine._handle_process_request",
            "vllm.engine.multiprocessing.engine.MQLLMEngine._handle_abort_request",
            "vllm.engine.metrics.LoggingStatLogger.log",
            "vllm.entrypoints.openai.serving_engine.OpenAIServing._validate_input",
            "vllm.entrypoints.openai.serving_engine.OpenAIServing._log_inputs",
            "vllm.entrypoints.openai.serving_engine.EmbedsPrompt",
            "vllm.reasoning.deepseek_r1_reasoning_parser.DeepSeekR1ReasoningParser",
            "vllm.model_executor.layers.fused_moe.fused_moe.fused_topk",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeDecoderLayer.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeAttention.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeModel.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeForCausalLM.forward",
            "vllm.model_executor.models.qwen3_moe.Qwen3MoeForCausalLM.load_weights",
            "vllm.model_executor.models.qwen2.Qwen2DecoderLayer.forward",
            "vllm.model_executor.models.qwen2.Qwen2MLP.forward",
            "vllm.model_executor.models.qwen2.Qwen2Attention.forward",
            "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionAttention.split_qkv",
            "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionPatchEmbed.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionPatchEmbed.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.rot_pos_emb",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.fast_pos_embed_interpolate",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionBlock.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionPatchMerger.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3_VisionMLP.forward",
            "vllm.model_executor.models.qwen3_vl.Qwen3VLForConditionalGeneration.get_input_embeddings",
            "vllm.model_executor.models.qwen3_vl.Qwen3VLForConditionalGeneration._clear_deepstack_input_embeds",
            # "vllm.model_executor.models.qwen3_vl.Qwen3VLProcessingInfo.get_hf_processor",
            # "vllm.model_executor.models.qwen3_vl.Qwen3VLProcessingInfo.get_image_processor",
            # "vllm.model_executor.models.qwen2_vl.Qwen2VLProcessingInfo.get_hf_processor",
            # "vllm.model_executor.models.qwen2_vl.Qwen2VLProcessingInfo.get_image_processor",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionAttention.__init__",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionAttention.split_qkv",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionMLP.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionPatchMerger.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionPatchEmbed.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionTransformer.forward",
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionBlock.forward",
            "vllm.model_executor.layers.rotary_embedding.mrope.MRotaryEmbedding._qwen3vl_get_input_positions_tensor",
            "vllm.spec_decode.metrics.AsyncMetricsCollector._copy_rejsample_metrics_async",
            "vllm.model_executor.layers.quantization.moe_wna16.MoeWNA16Method.create_weights",
            "vllm.model_executor.layers.quantization.moe_wna16.MoeWNA16Method.process_weights_after_loading",
            "vllm.model_executor.layers.pooler.ClassifierPooler.forward",
            "vllm.model_executor.layers.pooler.PoolerNormalize.forward_chunk",
            "vllm.model_executor.models.roberta.RobertaEmbedding.forward",
            "vllm.model_executor.models.bert.BertLayer.forward",
            "vllm.model_executor.models.qwen3.Qwen3DecoderLayer.forward",
            "vllm.model_executor.models.qwen3.Qwen3Attention.forward",
        ]
    )

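    # Note (assumption about PatchManager): batch_register_patch is taken to map
    # each dotted path above to its counterpart in the vllm_vacc.* mirror tree,
    # i.e. shorthand for one register_patch(path, replacement) per entry. The
    # explicit register_patch calls below handle replacements whose names differ
    # from the upstream attribute or that must be created on the target first.
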
    from vllm_vacc.vllm.model_executor.models.qwen3_moe import Qwen3MoeModel
    vpm.register_patch(
        "vllm.model_executor.models.qwen3_vl_moe.Qwen3MoeLLMModel.forward",
        Qwen3MoeModel.forward,
    )

    from vllm_vacc.vllm.attention.backends.mla.utils import MLACommonImpl
    vpm.register_patch(
        "vllm.v1.attention.backends.mla.common.MLACommonImpl.process_weights_after_loading",
        MLACommonImpl.process_weights_after_loading,
    )

    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _compute_inv_freq_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.DeepseekScalingRotaryEmbedding._compute_inv_freq",
        _compute_inv_freq_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _deepseek_compute_cos_sin_cache_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.DeepseekScalingRotaryEmbedding._compute_cos_sin_cache",
        _deepseek_compute_cos_sin_cache_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _compute_inv_freq_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.YaRNScalingRotaryEmbedding._compute_inv_freq",
        _compute_inv_freq_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _yarn_compute_cos_sin_cache_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.YaRNScalingRotaryEmbedding._compute_cos_sin_cache",
        _yarn_compute_cos_sin_cache_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import _compute_cos_sin_cache_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.RotaryEmbedding._compute_cos_sin_cache",
        _compute_cos_sin_cache_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import RotaryEmbedding_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.RotaryEmbedding.forward",
        RotaryEmbedding_forward_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import RotaryEmbedding_init_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.RotaryEmbedding.__init__",
        RotaryEmbedding_init_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.rotary_embedding import ScalingRotaryEmbedding_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.rotary_embedding.DeepseekScalingRotaryEmbedding.forward",
        ScalingRotaryEmbedding_forward_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding_forward
    vpm.register_patch(
        "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.forward",
        VocabParallelEmbedding_forward,
    )

    from vllm_vacc.vllm.model_executor.layers.activation import SiluAndMul_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.activation.SiluAndMul.forward",
        SiluAndMul_forward_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.layernorm import RMSNorm_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.layernorm.RMSNorm.forward",
        RMSNorm_forward_vacc,
    )

    from vllm_vacc.vllm.model_executor.layers.fused_moe.layer import FusedMoE_init_
    vpm.register_patch(
        "vllm.model_executor.layers.fused_moe.layer.FusedMoE.__init__",
        FusedMoE_init_
    )

    from vllm_vacc.vllm.model_executor.layers.quantization.fp8 import Fp8MoEMethod_init_
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod.__init__",
        Fp8MoEMethod_init_
    )

    from vllm_vacc.vllm.model_executor.layers.quantization.fp8 import moe_fp8_apply
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod.apply",
        moe_fp8_apply
    )

    from vllm_vacc.vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod__init
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.__init__",
        Fp8LinearMethod__init
    )

    from vllm_vacc.vllm.config import ModelConfig___verify_quantization
    vpm.register_patch(
        "vllm.config.ModelConfig._verify_quantization",
        ModelConfig___verify_quantization
    )

    from vllm_vacc.vllm.config import _get_head_dtype
    vpm.register_patch(
        "vllm.config.model._get_head_dtype",
        _get_head_dtype
    )

    from vllm_vacc.vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.QUANTIZATION_METHODS",
        QUANTIZATION_METHODS
    )

    from vllm_vacc.vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod__init
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.gptq.GPTQLinearMethod.__init__",
        GPTQLinearMethod__init
    )

    from vllm_vacc.vllm.model_executor.layers.linear import ReplicatedLinear__init__
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ReplicatedLinear.__init__",
        ReplicatedLinear__init__
    )

    from vllm_vacc.vllm.model_executor.layers.linear import ReplicatedLinear_weight_loader
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ReplicatedLinear.weight_loader",
        ReplicatedLinear_weight_loader
    )

    from vllm_vacc.vllm.model_executor.layers.linear import ColumnParallelLinear__init__
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ColumnParallelLinear.__init__",
        ColumnParallelLinear__init__
    )

    from vllm_vacc.vllm.model_executor.layers.linear import ColumnParallelLinear_weight_loader_v2
    vpm.register_patch(
        "vllm.model_executor.layers.linear.ColumnParallelLinear.weight_loader_v2",
        ColumnParallelLinear_weight_loader_v2
    )

    from vllm_vacc.vllm.model_executor.layers.linear import RowParallelLinear__init__
    vpm.register_patch(
        "vllm.model_executor.layers.linear.RowParallelLinear.__init__",
        RowParallelLinear__init__
    )

    from vllm_vacc.vllm.model_executor.layers.linear import RowParallelLinear_weight_loader_v2_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.linear.RowParallelLinear.weight_loader_v2",
        RowParallelLinear_weight_loader_v2_vacc
    )

    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2MLAAttention_init__
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2MLAAttention.__init__",
        DeepseekV2MLAAttention_init__
    )

    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2MLAAttention_forward
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2MLAAttention.forward",
        DeepseekV2MLAAttention_forward
    )

    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import merge_qkv_weights
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2MLAAttention.merge_qkv_weights",
        merge_qkv_weights,
        create_dummy=True,
        allow_create=True
    )

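    # Note (assumption): create_dummy=True / allow_create=True are taken to let
    # register_patch attach merge_qkv_weights even though the upstream
    # DeepseekV2MLAAttention class has no such attribute, rather than failing on a
    # missing original; the same flags are used for the other vacc-only hooks below.
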
    # for input_layernorm fuse
    from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2DecoderLayer_forward
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_v2.DeepseekV2DecoderLayer.forward",
        DeepseekV2DecoderLayer_forward
    )

    if TRANSPOSE_MLP_MOE_W2:
        from vllm_vacc.vllm.model_executor.models.deepseek_v2_fused import DeepseekV2MLP__init__, DeepseekV2MoE__init__
        vpm.register_patch(
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MLP.__init__",
            DeepseekV2MLP__init__
        )
        vpm.register_patch(
            "vllm.model_executor.models.deepseek_v2.DeepseekV2MoE.__init__",
            DeepseekV2MoE__init__
        )

    from vllm_vacc.vllm.model_executor.models.qwen2 import Qwen2MLP__init__
    vpm.register_patch(
        "vllm.model_executor.models.qwen2.Qwen2MLP.__init__",
        Qwen2MLP__init__
    )

    from vllm_vacc.vllm._custom_ops import cutlass_scaled_mm_vacc
    vpm.register_patch(
        "vllm._custom_ops.cutlass_scaled_mm",
        cutlass_scaled_mm_vacc
    )

    from vllm_vacc.vllm._custom_ops import concat_and_cache_mla
    vpm.register_patch(
        "vllm._custom_ops.concat_and_cache_mla",
        concat_and_cache_mla
    )

    from vllm_vacc.vllm.model_executor.layers.quantization.utils.fp8_utils import _apply_w8a8_block_fp8_linear
    vpm.register_patch(
        "vllm.model_executor.layers.quantization.utils.fp8_utils.apply_w8a8_block_fp8_linear",
        _apply_w8a8_block_fp8_linear
    )

    from vllm_vacc.vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk_with_itype
    vpm.register_patch(
        "vllm.model_executor.layers.fused_moe.fused_moe.grouped_topk",
        grouped_topk_with_itype
    )

    # from vllm_vacc.vllm.model_executor.sampling_metadata import SamplingMetadata_prepare
    # vpm.register_patch(
    #     "vllm.v1.sample.sampling_metadata.SamplingMetadata.prepare",
    #     SamplingMetadata_prepare
    # )

    # from vllm_vacc.vllm.model_executor.sampling_metadata import SamplingTensors_from_lists
    # vpm.register_patch(
    #     "vllm.model_executor.sampling_metadata.SamplingTensors.from_lists",
    #     SamplingTensors_from_lists
    # )

    # from vllm_vacc.vllm.model_executor.sampling_metadata import SamplingMetadata_from_sampling_metadata
    # vpm.register_patch(
    #     "vllm.model_executor.sampling_metadata.SamplingTensors.from_sampling_metadata",
    #     SamplingMetadata_from_sampling_metadata
    # )

    # from vllm_vacc.vllm.model_executor.layers.sampler import Sampler_forward
    # vpm.register_patch(
    #     "vllm.model_executor.layers.sampler.Sampler.forward",
    #     Sampler_forward
    # )

    # from vllm_vacc.vllm.model_executor.layers.sampler import rejection_forward
    # vpm.register_patch(
    #     "vllm.model_executor.layers.rejection_sampler.RejectionSampler.forward",
    #     rejection_forward
    # )

    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _verify_tokens
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker._verify_tokens",
    #     _verify_tokens
    # )

    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _maybe_log_stage_times
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker._maybe_log_stage_times",
    #     _maybe_log_stage_times
    # )

    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _run_no_spec
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.SpecDecodeWorker._run_no_spec",
    #     _run_no_spec
    # )

    # from vllm_vacc.vllm.model_executor.layers.sampler import _apply_top_k_top_p_vacc
    # vpm.register_patch(
    #     "vllm.model_executor.layers.sampler._apply_top_k_top_p",
    #     _apply_top_k_top_p_vacc
    # )

    from vllm_vacc.vllm.distributed.parallel_state import all_gather_to_rank0
    vpm.register_patch(
        "vllm.distributed.parallel_state.GroupCoordinator.all_gather_to_rank0",
        all_gather_to_rank0,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.distributed.device_communicators.base_device_communicator import all_gather_into_tensor
    vpm.register_patch(
        "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase.all_gather_into_tensor",
        all_gather_into_tensor,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.distributed.parallel_state import generate_group_id
    vpm.register_patch(
        "vllm.distributed.parallel_state.GroupCoordinator.generate_group_id",
        generate_group_id,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.distributed.parallel_state import generate_rank_device_infos
    vpm.register_patch(
        "vllm.distributed.parallel_state.GroupCoordinator.generate_rank_device_infos",
        generate_rank_device_infos,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.distributed.communication_op import tensor_model_parallel_all_reduce_with_odsp
    vpm.register_patch(
        "vllm.distributed.communication_op.tensor_model_parallel_all_reduce",
        tensor_model_parallel_all_reduce_with_odsp
    )

    from vllm_vacc.vllm.model_executor.models.qwen3_moe import Qwen3MoeSparseMoeBlock__init__
    vpm.register_patch(
        "vllm.model_executor.models.qwen3_moe.Qwen3MoeSparseMoeBlock.__init__",
        Qwen3MoeSparseMoeBlock__init__
    )

    from vllm_vacc.vllm.model_executor.models.deepseek_mtp import DeepSeekMultiTokenPredictorLayer__init__
    vpm.register_patch(
        "vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer.__init__",
        DeepSeekMultiTokenPredictorLayer__init__
    )

    # from vllm_vacc.vllm.spec_decode.spec_decode_worker import _prepare_prefill_hidden_states
    # vpm.register_patch(
    #     "vllm.spec_decode.spec_decode_worker.prepare_prefill_hidden_states",
    #     _prepare_prefill_hidden_states
    # )

    from vllm_vacc.vllm.executor.executor_base import execute_model_async
    vpm.register_patch(
        "vllm.executor.mp_distributed_executor.MultiprocessingDistributedExecutor.execute_model_async",
        execute_model_async,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.model_executor.layers.activation import QuickGELU_forward_vacc
    vpm.register_patch(
        "vllm.model_executor.layers.activation.QuickGELU.forward",
        QuickGELU_forward_vacc,
    )


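# patch_vllm_v1 mirrors patch_vllm for the V1 engine code paths (vllm.v1.*),
# using the same batch-then-explicit registration pattern plus a few vacc-only
# hooks registered with create_dummy=True.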
def patch_vllm_v1(vpm: VllmPatchManager) -> None:
    vpm.batch_register_patch(
        [
            "vllm.v1.core.block_pool.BlockPool.__init__",
            "vllm.v1.core.block_pool.BlockPool.get_usage",
            "vllm.v1.worker.block_table.BlockTable.__init__",
            "vllm.v1.worker.gpu_input_batch.CachedRequestState",
            "vllm.v1.core.single_type_kv_cache_manager.SingleTypeKVCacheManager.free",
            "vllm.v1.core.single_type_kv_cache_manager.SingleTypeKVCacheManager.allocate_new_blocks",
            "vllm.entrypoints.llm.LLM._validate_and_add_requests",
            "vllm.entrypoints.openai.serving_completion.OpenAIServingCompletion.__init__",
            "vllm.entrypoints.openai.serving_completion.OpenAIServingCompletion.create_completion",
            "vllm.v1.engine.EngineCoreRequestType",
            "vllm.v1.engine.EngineCoreRequest",
            "vllm.v1.engine.EngineCoreOutputs",
            "vllm.v1.engine.async_llm.AsyncLLM.from_vllm_config",
            "vllm.v1.engine.async_llm.AsyncLLM.from_engine_args",
            "vllm.v1.engine.async_llm.AsyncLLM._add_request",
            "vllm.v1.engine.llm_engine.LLMEngine.from_vllm_config",
            "vllm.v1.engine.llm_engine.LLMEngine.from_engine_args",
            "vllm.v1.engine.processor.Processor.process_inputs",
            "vllm.v1.engine.core.EngineCoreProc.process_input_sockets",
            "vllm.v1.engine.core.preprocess_add_request",

            "vllm.v1.metrics.loggers.LoggingStatLogger.log",
            "vllm.v1.metrics.stats.SchedulerStats",

            "vllm.v1.core.sched.scheduler.Scheduler.make_stats",
            "vllm.v1.core.sched.scheduler.Scheduler.schedule",
            "vllm.v1.core.sched.scheduler.Scheduler.__init__",
            "vllm.v1.core.sched.scheduler.Scheduler.add_request",
            "vllm.v1.core.sched.scheduler.Scheduler.finish_requests",
            "vllm.v1.core.sched.output.NewRequestData",
            "vllm.v1.request.Request",
            # "vllm.v1.request.Request.from_engine_core_request",  # AttributeError: 'Request' object has no attribute 'record_event'
            "vllm.v1.sample.rejection_sampler.RejectionSampler.forward",
            "vllm.v1.sample.sampler.Sampler.forward",
            "vllm.v1.sample.metadata.SamplingMetadata",
            "vllm.v1.spec_decode.eagle.EagleProposer.propose",
            "vllm.v1.spec_decode.eagle.EagleProposer.prepare_next_token_ids_padded",
            "vllm.v1.spec_decode.eagle.EagleProposer.prepare_inputs_padded",
            "vllm.v1.spec_decode.eagle.EagleProposer.prepare_inputs",
            "vllm.v1.core.kv_cache_utils.get_num_blocks",
            "vllm.v1.core.kv_cache_utils.estimate_max_model_len",
            "vllm.v1.core.kv_cache_utils.check_enough_kv_cache_memory",

            "vllm.v1.worker.gpu_input_batch.InputBatch.__init__",
            "vllm.v1.worker.gpu_input_batch.InputBatch.add_request",
            "vllm.v1.worker.gpu_input_batch.InputBatch._make_sampling_metadata",
        ]
    )

    from vllm_vacc.vllm.v1.core.sched.scheduler import Scheduler
    vpm.register_patch(
        "vllm.v1.core.sched.scheduler.Scheduler._schedule_running_requests_for_mode",
        Scheduler._schedule_running_requests_for_mode,
        create_dummy=True,
        allow_create=True
    )
    vpm.register_patch(
        "vllm.v1.core.sched.scheduler.Scheduler._estimate_future_kv_tokens",
        Scheduler._estimate_future_kv_tokens,
        create_dummy=True,
        allow_create=True
    )
    vpm.register_patch(
        "vllm.v1.core.sched.scheduler.Scheduler._compute_total_future_kv_tokens",
        Scheduler._compute_total_future_kv_tokens,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.engine.arg_utils import _set_default_args
    vpm.register_patch(
        "vllm.engine.arg_utils.EngineArgs._set_default_args",
        _set_default_args,
    )

    from vllm_vacc.vllm.v1.engine.core_client import EngineCoreClient, SyncMPClient
    vpm.register_patch(
        "vllm.v1.engine.core_client.EngineCoreClient._send_input",
        EngineCoreClient._send_input,
        create_dummy=True,
        allow_create=True
    )
    vpm.register_patch(
        "vllm.v1.engine.core_client.EngineCoreClient.add_requests",
        EngineCoreClient.add_requests,
        create_dummy=True,
        allow_create=True
    )
    vpm.register_patch(
        "vllm.v1.engine.core_client.SyncMPClient.add_requests",
        SyncMPClient.add_requests,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.v1.engine.llm_engine import LLMEngine
    vpm.register_patch(
        "vllm.v1.engine.llm_engine.LLMEngine.add_requests",
        LLMEngine.add_requests,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.v1.engine.core import EngineCore
    vpm.register_patch(
        "vllm.v1.engine.core.EngineCore._initialize_kv_caches",
        EngineCore._initialize_kv_caches,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.v1.executor.abstract import Executor
    vpm.register_patch(
        "vllm.v1.executor.abstract.Executor.determine_available_memory_block",
        Executor.determine_available_memory_block,
        create_dummy=True,
        allow_create=True
    )

    from vllm_vacc.vllm.v1.spec_decode.eagle import EagleProposer_init_
    vpm.register_patch(
        "vllm.v1.spec_decode.eagle.EagleProposer.__init__",
        EagleProposer_init_
    )