Drop 0.11.0 support (#4377)

There is a lot of hack code for v0.11.0, which makes the codebase hard to
upgrade to newer vLLM versions. Since v0.11.2 will be released soon, let's
drop v0.11.0 support first. Then we'll upgrade to v0.11.2 shortly after.


- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-11-24 17:08:20 +08:00
committed by GitHub
parent 41ddb06554
commit a1f142b7ad
80 changed files with 467 additions and 1755 deletions

View File

@@ -412,33 +412,21 @@ def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
Check whether it is vLLM default capture sizes.
"""
if vllm_version_is("0.11.0"):
cuda_graph_sizes = vllm_config.scheduler_config.cuda_graph_sizes
if len(cuda_graph_sizes) == 1:
cudagraph_capture_sizes = [1, 2, 4] + [
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
]
else:
max_cudagraph_capture_size = \
vllm_config.compilation_config.max_cudagraph_capture_size
cudagraph_capture_sizes = [
i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
]
if max_cudagraph_capture_size >= 8:
# Step size 8 for small batch sizes, up to 256(not included)
cudagraph_capture_sizes += list(
range(8, min(max_cudagraph_capture_size + 1, 256), 8))
if max_cudagraph_capture_size >= 256:
# Step size 16 for larger batch sizes
cudagraph_capture_sizes += list(
range(256, max_cudagraph_capture_size + 1, 16))
if vllm_version_is("0.11.0"):
target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes,
reverse=True)
else:
            # in newer version, vLLM uses ascending order of cudagraph_capture_sizes.
target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
max_cudagraph_capture_size = \
vllm_config.compilation_config.max_cudagraph_capture_size
cudagraph_capture_sizes = [
i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
]
if max_cudagraph_capture_size >= 8:
# Step size 8 for small batch sizes, up to 256(not included)
cudagraph_capture_sizes += list(
range(8, min(max_cudagraph_capture_size + 1, 256), 8))
if max_cudagraph_capture_size >= 256:
# Step size 16 for larger batch sizes
cudagraph_capture_sizes += list(
range(256, max_cudagraph_capture_size + 1, 16))
    # in newer version, vLLM uses ascending order of cudagraph_capture_sizes.
target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
if target_cudagraph_capture_sizes == \
vllm_config.compilation_config.cudagraph_capture_sizes:
return True
@@ -465,21 +453,13 @@ def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
and vllm_config.parallel_config.tensor_parallel_size == 1 \
and vllm_config.parallel_config.data_parallel_size > 1 :
if vllm_version_is("0.11.0"):
max_capture_size = vllm_config.scheduler_config.cuda_graph_sizes[0]
else:
max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [
i for i in range(24, max_capture_size + 1, 8)
]
if vllm_version_is("0.11.0"):
vllm_config.compilation_config.cudagraph_capture_sizes = new_cudagraph_capture_sizes
vllm_config.compilation_config.init_with_cudagraph_sizes(
new_cudagraph_capture_sizes)
else:
update_cudagraph_capture_sizes(vllm_config,
new_cudagraph_capture_sizes)
update_cudagraph_capture_sizes(vllm_config,
new_cudagraph_capture_sizes)
def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
@@ -573,10 +553,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
indices[0], indices[-1] = 0, len(original_sizes) - 1
sampled_sizes = [original_sizes[i] for i in indices]
if vllm_version_is("0.11.0"):
compilation_config.init_with_cudagraph_sizes(sampled_sizes)
else:
update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
logger.info(
"Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
@@ -607,10 +584,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
enlarged_sizes = [(num_speculative_tokens + 1) * size
for size in original_sizes]
if vllm_version_is("0.11.0"):
compilation_config.init_with_cudagraph_sizes(enlarged_sizes)
else:
update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
logger.info(
"Adjusted ACL graphs: %s → %s for speculative decoding",
original_sizes, enlarged_sizes)
@@ -719,11 +693,8 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
"GemmaRMSNorm": AscendGemmaRMSNorm,
"FusedMoE": AscendFusedMoE,
"SharedFusedMoE": AscendSharedFusedMoE,
"MultiHeadLatentAttentionWrapper": AscendMultiHeadLatentAttention,
}
mla_to_register = "MultiHeadLatentAttention" if vllm_version_is(
"0.11.0") else "MultiHeadLatentAttentionWrapper"
if vllm_config and vllm_config.model_config and vllm_config.model_config.use_mla:
REGISTERED_ASCEND_OPS[mla_to_register] = AscendMultiHeadLatentAttention
for name, op_cls in REGISTERED_ASCEND_OPS.items():
CustomOp.register_oot(_decorated_op_cls=op_cls, name=name)