Drop 0.11.0 support (#4377)

There is a lot of hack code for v0.11.0, which makes the codebase hard to
upgrade to newer vLLM versions. Since v0.11.2 will be released soon, let's
drop v0.11.0 support first. Then we'll upgrade to v0.11.2.
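
For reference, the pattern removed throughout this change is the runtime
version gate sketched below. This is a minimal sketch: it assumes the helper
in vllm_ascend/utils.py simply compares against vllm.__version__, which may
differ from the real implementation in detail.

import vllm


def vllm_version_is(target: str) -> bool:
    # True when the installed vLLM matches the given release string.
    # (Assumption: the real helper may also honor an env override.)
    return vllm.__version__ == target


# Typical gate this commit deletes: pick the API by installed version.
if vllm_version_is("0.11.0"):
    from vllm.config import CompilationLevel  # old API name
else:
    from vllm.config import CompilationMode  # renamed API in newer vLLM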


- vLLM version: v0.11.0
- vLLM main: 2918c1b49c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
wangxiyuan (committed by GitHub)
2025-11-24 17:08:20 +08:00
parent 41ddb06554
commit a1f142b7ad
80 changed files with 467 additions and 1755 deletions

File 1 (of 80 changed; diff excerpt):

@@ -5,13 +5,15 @@ import numpy as np
 import torch
 import torch.nn as nn
 from vllm.attention.layer import Attention
-from vllm.config import CUDAGraphMode, VllmConfig, get_layers_from_vllm_config
+from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
+                         get_layers_from_vllm_config)
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models import supports_multimodal
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -22,14 +24,6 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionState,
                                                 AscendMetadata)
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("0.11.0"):
-    from vllm.config import CompilationLevel
-    from vllm.utils import is_pin_memory_available
-else:
-    from vllm.config import CompilationMode
-    from vllm.utils.platform_utils import is_pin_memory_available
 
 PADDING_SLOT_ID = -1
@@ -52,16 +46,9 @@ class EagleProposer(Proposer):
         self.hidden_size = vllm_config.speculative_config.draft_model_config.get_hidden_size(
         )
-        if vllm_version_is("0.11.0"):
-            self.use_cuda_graph = (
-                self.vllm_config.compilation_config.level
-                == CompilationLevel.PIECEWISE
-                and not self.vllm_config.model_config.enforce_eager)
-        else:
-            self.use_cuda_graph = (
-                self.vllm_config.compilation_config.mode
-                == CompilationMode.VLLM_COMPILE
-                and not self.vllm_config.model_config.enforce_eager)
+        self.use_cuda_graph = (self.vllm_config.compilation_config.mode
+                               == CompilationMode.VLLM_COMPILE and
+                               not self.vllm_config.model_config.enforce_eager)
         self.cudagraph_batch_sizes = list(
             reversed(
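
The surviving branch of the graph-capture check can be read standalone as
follows. This sketch is assembled from the hunk above: CompilationMode.VLLM_COMPILE
replaces the old CompilationLevel.PIECEWISE check, and vllm_config is assumed
to be a fully populated VllmConfig.

from vllm.config import CompilationMode, VllmConfig


def use_cuda_graph(vllm_config: VllmConfig) -> bool:
    # Capture graphs only when vLLM compiles the model and eager mode is off.
    return (vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
            and not vllm_config.model_config.enforce_eager)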

File 2 (of 80 changed; diff excerpt):

@@ -15,14 +15,7 @@ from vllm.model_executor.model_loader.utils import \
     process_weights_after_loading
 from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("0.11.0"):
-    from vllm.utils import cdiv
-else:
-    from vllm.utils.math_utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
                                               CommonAttentionMetadata)
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -39,31 +32,21 @@ from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
                                                update_mla_attn_params)
 from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
 from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
-                               prefill_context_parallel_enable,
-                               vllm_version_is)
+                               prefill_context_parallel_enable)
 
 if prefill_context_parallel_enable():
     from vllm.distributed import get_pcp_group
-if vllm_version_is("0.11.0"):
-    from vllm.model_executor.model_loader.utils import set_default_torch_dtype
-    from vllm.utils import is_pin_memory_available
-else:
-    from vllm.utils.platform_utils import is_pin_memory_available
-    from vllm.utils.torch_utils import set_default_torch_dtype
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.utils.torch_utils import set_default_torch_dtype
 
 logger = init_logger(__name__)
 
 PADDING_SLOT_ID = -1
 
-_deepseek_mtp_path = "vllm.model_executor.models.deepseek_mtp"
-_deepseek_mtp_model = "DeepSeekMTP"
-if vllm_version_is("0.11.0"):
-    _deepseek_mtp_path = "vllm_ascend.patch.worker.patch_deepseek_mtp"
-    _deepseek_mtp_model = "AscendDeepSeekMTP"
 _MTP_MODELS = {
-    "DeepseekV3ForCausalLM": (_deepseek_mtp_path, _deepseek_mtp_model),
+    "DeepseekV3ForCausalLM":
+    ("vllm.model_executor.models.deepseek_mtp", "DeepSeekMTP"),
     "Qwen3NextForCausalLM":
     ("vllm_ascend.models.qwen3_next_mtp", "CustomQwen3NextMTP")
 }
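
_MTP_MODELS maps an architecture name to a (module path, class name) pair so
the draft model can be imported lazily. Below is a hedged sketch of how such a
registry is typically resolved; the resolve_mtp_model helper is illustrative,
and the file's actual lookup code is not shown in this diff.

import importlib

_MTP_MODELS = {
    "DeepseekV3ForCausalLM":
    ("vllm.model_executor.models.deepseek_mtp", "DeepSeekMTP"),
    "Qwen3NextForCausalLM":
    ("vllm_ascend.models.qwen3_next_mtp", "CustomQwen3NextMTP"),
}


def resolve_mtp_model(architecture: str):
    # Import the registered module on demand and return the model class.
    # (Hypothetical helper for illustration; not part of this commit.)
    module_path, class_name = _MTP_MODELS[architecture]
    return getattr(importlib.import_module(module_path), class_name)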