Drop 0.11.0 support (#4377)

There is a lot of hack code for v0.11.0, which makes the codebase hard to upgrade to newer vLLM versions. Since v0.11.2 will be released soon, let's drop v0.11.0 support first; we'll then upgrade to v0.11.2.
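
Most of the removed hack code is version gating of the shape sketched below (a minimal illustrative example; the `sha256` import paths are taken from the diffs in this commit, and the gate itself is the pattern being deleted):

```python
# Minimal sketch of the v0.11.0 version gate this commit removes
# (illustrative only; the sha256 paths below appear in the diffs).
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.11.0"):
    # v0.11.0 still exposed helpers from the flat vllm.utils module
    from vllm.utils import sha256
else:
    # newer vLLM moved them into dedicated submodules
    from vllm.utils.hashing import sha256
```

After this change the gate is gone and only the new-location import remains (`from vllm.utils.hashing import sha256`); the same drop applies to the constructor gates (e.g. the `block_size` argument of `Scheduler`/`AscendScheduler`) shown in the diffs below.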


- vLLM version: v0.11.0
- vLLM main: 2918c1b49c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
wangxiyuan
2025-11-24 17:08:20 +08:00
committed by GitHub
parent 41ddb06554
commit a1f142b7ad
80 changed files with 467 additions and 1755 deletions

View File

@@ -9,6 +9,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
MultiModalKwargsItem, PlaceholderRange)
from vllm.sampling_params import SamplingParams
from vllm.utils.hashing import sha256
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
init_none_hash)
from vllm.v1.core.sched.output import SchedulerOutput
@@ -21,12 +22,6 @@ from vllm.v1.structured_output import StructuredOutputManager
from tests.ut.base import TestBase
from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.core.scheduler_dynamic_batch import SchedulerDynamicBatch
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import sha256
else:
from vllm.utils.hashing import sha256
EOS_TOKEN_ID = 50256
MODEL = "Qwen3-0.6B"
@@ -181,23 +176,13 @@ class TestAscendScheduler(TestBase):
)
cache_config.num_gpu_blocks = 10000
if vllm_version_is("0.11.0"):
scheduler = AscendScheduler(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
log_stats=True,
structured_output_manager=MagicMock(
spec=StructuredOutputManager),
)
else:
scheduler = AscendScheduler(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
log_stats=True,
block_size=block_size,
structured_output_manager=MagicMock(
spec=StructuredOutputManager),
)
scheduler = AscendScheduler(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
log_stats=True,
block_size=block_size,
structured_output_manager=MagicMock(spec=StructuredOutputManager),
)
should_advance = MagicMock()
should_advance.return_value = False

View File

@@ -13,13 +13,7 @@ from unittest.mock import MagicMock, patch
import msgspec
import zmq
from vllm.distributed.parallel_state import GroupCoordinator
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import make_zmq_path
else:
from vllm.utils.network_utils import make_zmq_path
from vllm.utils.network_utils import make_zmq_path
fake_engine = types.ModuleType("mooncake.engine")
fake_engine.TransferEngine = MagicMock() # type: ignore[attr-defined]

View File

@@ -10,6 +10,7 @@ import torch
from vllm import SamplingParams
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
ModelConfig, SchedulerConfig, VllmConfig)
from vllm.utils.hashing import sha256
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
init_none_hash)
from vllm.v1.core.sched.scheduler import Scheduler
@@ -19,13 +20,6 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import sha256
else:
from vllm.utils.hashing import sha256
EOS_TOKEN_ID = 50256
@@ -111,21 +105,14 @@ def create_scheduler(
],
)
vllm_config.cache_config.num_gpu_blocks = num_blocks
if vllm_version_is("0.11.0"):
return Scheduler(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
log_stats=True,
structured_output_manager=StructuredOutputManager(vllm_config),
)
else:
return Scheduler(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
log_stats=True,
block_size=block_size,
structured_output_manager=StructuredOutputManager(vllm_config),
)
return Scheduler(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
log_stats=True,
block_size=block_size,
structured_output_manager=StructuredOutputManager(vllm_config),
)
_none_hash_initialized = False

View File

@@ -22,7 +22,6 @@ import torch
from torch import nn
from vllm_ascend.model_loader.netloader.netloader import ModelNetLoaderElastic
from vllm_ascend.utils import vllm_version_is
class DummyDeviceConfig:
@@ -174,11 +173,7 @@ def test_load_model_elastic_success(mock_logger, monkeypatch, tmp_path):
"vllm_ascend.model_loader.netloader.netloader.process_weights_after_loading",
lambda *a, **k: None)
# patch get_ip
if vllm_version_is("0.11.0"):
monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
else:
monkeypatch.setattr("vllm.utils.network_utils.get_ip",
lambda: "127.0.0.1")
monkeypatch.setattr("vllm.utils.network_utils.get_ip", lambda: "127.0.0.1")
# patch find_free_port
monkeypatch.setattr(
"vllm_ascend.model_loader.netloader.netloader.find_free_port",

View File

@@ -9,7 +9,6 @@ from vllm.model_executor.layers.mla import MLAModules
from tests.ut.base import TestBase
from vllm_ascend.models.layers.mla import (AscendMultiHeadLatentAttention,
IndexerWrapper)
from vllm_ascend.utils import vllm_version_is
class TestIndexerWrapper(TestBase):
@@ -85,68 +84,35 @@ class TestAscendMultiHeadLatentAttention(TestBase):
"vllm_ascend.models.layers.mla.get_tensor_model_parallel_world_size")
def test_initialization(self, mock_tp_size, mock_ascend_config,
mock_get_vllm_config):
if vllm_version_is("0.11.0"):
with patch("vllm_ascend.models.layers.mla.Attention",
return_value=True):
mock_tp_size.return_value = 1
mock_ascend_config.return_value.enable_shared_expert_dp = False
mock_vllm_config = MagicMock(spec=VllmConfig)
mock_vllm_config.model_config.hf_config = MagicMock(
num_hidden_layers=32, first_k_dense_replace=False)
mock_get_vllm_config.return_value = mock_vllm_config
mock_vllm_config.compilation_config = CompilationConfig()
attn = AscendMultiHeadLatentAttention(
hidden_size=self.hidden_size,
num_heads=self.num_heads,
scale=self.scale,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
v_head_dim=self.v_head_dim,
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
mla_modules=self.mock_mla_modules,
cache_config=self.mock_cache_config,
quant_config=self.mock_quant_config,
prefix=self.prefix,
)
with patch("vllm_ascend.models.layers.mla.MLAAttention",
return_value=True):
mock_tp_size.return_value = 2
mock_ascend_config.return_value.enable_shared_expert_dp = True
mock_vllm_config = MagicMock(spec=VllmConfig)
mock_vllm_config.model_config.hf_config = MagicMock(
num_hidden_layers=32, first_k_dense_replace=True)
mock_get_vllm_config.return_value = mock_vllm_config
mock_vllm_config.compilation_config = CompilationConfig()
self.assertEqual(attn.hidden_size, self.hidden_size)
self.assertEqual(attn.kv_lora_rank, self.kv_lora_rank)
self.assertEqual(attn.debug_layer_idx, 0)
self.assertIsNotNone(attn.mla_attn)
self.assertIn(
self.prefix,
mock_vllm_config.compilation_config.static_forward_context)
else:
with patch("vllm_ascend.models.layers.mla.MLAAttention",
return_value=True):
mock_tp_size.return_value = 2
mock_ascend_config.return_value.enable_shared_expert_dp = True
mock_vllm_config = MagicMock(spec=VllmConfig)
mock_vllm_config.model_config.hf_config = MagicMock(
num_hidden_layers=32, first_k_dense_replace=True)
mock_get_vllm_config.return_value = mock_vllm_config
mock_vllm_config.compilation_config = CompilationConfig()
attn = AscendMultiHeadLatentAttention(
hidden_size=self.hidden_size,
num_heads=self.num_heads,
scale=self.scale,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
v_head_dim=self.v_head_dim,
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
mla_modules=self.mock_mla_modules,
cache_config=self.mock_cache_config,
quant_config=self.mock_quant_config,
prefix=self.prefix,
)
attn = AscendMultiHeadLatentAttention(
hidden_size=self.hidden_size,
num_heads=self.num_heads,
scale=self.scale,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
v_head_dim=self.v_head_dim,
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
mla_modules=self.mock_mla_modules,
cache_config=self.mock_cache_config,
quant_config=self.mock_quant_config,
prefix=self.prefix,
)
self.assertEqual(attn.tp_size, 2)
self.assertTrue(attn.enable_shared_expert_dp)
self.assertIsNotNone(attn.mla_attn)
self.assertEqual(attn.tp_size, 2)
self.assertTrue(attn.enable_shared_expert_dp)
self.assertIsNotNone(attn.mla_attn)
@patch("vllm_ascend.models.layers.mla.torch.ops.vllm.mla_forward")
@patch("vllm_ascend.models.layers.mla.get_current_vllm_config")
@@ -164,41 +130,22 @@ class TestAscendMultiHeadLatentAttention(TestBase):
num_hidden_layers=32, first_k_dense_replace=False)
mock_get_vllm_config.return_value = mock_vllm_config
mock_vllm_config.compilation_config = CompilationConfig()
if vllm_version_is("0.11.0"):
with patch("vllm_ascend.models.layers.mla.Attention",
return_value=True):
attn = AscendMultiHeadLatentAttention(
hidden_size=self.hidden_size,
num_heads=self.num_heads,
scale=self.scale,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
v_head_dim=self.v_head_dim,
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
mla_modules=self.mock_mla_modules,
cache_config=self.mock_cache_config,
quant_config=self.mock_quant_config,
prefix=self.prefix,
)
else:
with patch("vllm_ascend.models.layers.mla.MLAAttention",
return_value=True):
attn = AscendMultiHeadLatentAttention(
hidden_size=self.hidden_size,
num_heads=self.num_heads,
scale=self.scale,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
v_head_dim=self.v_head_dim,
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
mla_modules=self.mock_mla_modules,
cache_config=self.mock_cache_config,
quant_config=self.mock_quant_config,
prefix=self.prefix,
)
with patch("vllm_ascend.models.layers.mla.MLAAttention",
return_value=True):
attn = AscendMultiHeadLatentAttention(
hidden_size=self.hidden_size,
num_heads=self.num_heads,
scale=self.scale,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
v_head_dim=self.v_head_dim,
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
mla_modules=self.mock_mla_modules,
cache_config=self.mock_cache_config,
quant_config=self.mock_quant_config,
prefix=self.prefix,
)
positions = torch.tensor([0, 1, 2])
hidden_states = torch.randn(3, self.hidden_size)

View File

@@ -3,18 +3,13 @@ from unittest.mock import MagicMock, patch
import pytest
import torch
from vllm.config.compilation import CUDAGraphMode
from vllm.config.compilation import CompilationMode, CUDAGraphMode
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import PlatformEnum
from tests.ut.base import TestBase
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.config.compilation import CompilationLevel
else:
from vllm.config.compilation import CompilationMode
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
class TestNPUPlatform(TestBase):
@@ -313,16 +308,10 @@ class TestNPUPlatform(TestBase):
self.assertTrue("Compilation disabled, using eager mode by default" in
cm.output[0])
if vllm_version_is("0.11.0"):
self.assertEqual(
vllm_config.compilation_config.level,
CompilationLevel.NO_COMPILATION,
)
else:
self.assertEqual(
vllm_config.compilation_config.mode,
CompilationMode.NONE,
)
self.assertEqual(
vllm_config.compilation_config.mode,
CompilationMode.NONE,
)
self.assertEqual(
vllm_config.compilation_config.cudagraph_mode,
@@ -348,10 +337,7 @@ class TestNPUPlatform(TestBase):
mock_init_recompute.return_value = MagicMock()
vllm_config.scheduler_config = MagicMock()
if vllm_version_is("0.11.0"):
vllm_config.compilation_config.level = CompilationLevel.DYNAMO_ONCE
else:
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
with self.assertLogs(logger="vllm", level="WARNING") as cm:
from vllm_ascend import platform
@@ -359,16 +345,11 @@ class TestNPUPlatform(TestBase):
importlib.reload(platform)
self.platform.check_and_update_config(vllm_config)
self.assertTrue("NPU does not support" in cm.output[0])
if vllm_version_is("0.11.0"):
self.assertEqual(
vllm_config.compilation_config.level,
CompilationLevel.NO_COMPILATION,
)
else:
self.assertEqual(
vllm_config.compilation_config.mode,
CompilationMode.NONE,
)
self.assertEqual(
vllm_config.compilation_config.mode,
CompilationMode.NONE,
)
self.assertEqual(
vllm_config.compilation_config.cudagraph_mode,
CUDAGraphMode.NONE,
@@ -396,16 +377,10 @@ class TestNPUPlatform(TestBase):
"cudagraph_mode is not support on NPU. falling back to NONE" in
cm.output[0])
if vllm_version_is("0.11.0"):
self.assertEqual(
vllm_config.compilation_config.level,
CompilationLevel.NO_COMPILATION,
)
else:
self.assertEqual(
vllm_config.compilation_config.mode,
CompilationMode.NONE,
)
self.assertEqual(
vllm_config.compilation_config.mode,
CompilationMode.NONE,
)
self.assertEqual(
vllm_config.compilation_config.cudagraph_mode,
CUDAGraphMode.NONE,
@@ -431,10 +406,7 @@ class TestNPUPlatform(TestBase):
mock_init_recompute.return_value = MagicMock()
vllm_config.scheduler_config = MagicMock()
if vllm_version_is("0.11.0"):
vllm_config.compilation_config.level = CompilationLevel.PIECEWISE
else:
vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
with self.assertLogs(logger="vllm", level="INFO") as cm:
from vllm_ascend import platform
@@ -443,16 +415,10 @@ class TestNPUPlatform(TestBase):
self.platform.check_and_update_config(vllm_config)
self.assertTrue("Torchair compilation enabled" in cm.output[0])
if vllm_version_is("0.11.0"):
self.assertEqual(
vllm_config.compilation_config.level,
CompilationLevel.NO_COMPILATION,
)
else:
self.assertEqual(
vllm_config.compilation_config.mode,
CompilationMode.NONE,
)
self.assertEqual(
vllm_config.compilation_config.mode,
CompilationMode.NONE,
)
self.assertEqual(
vllm_config.compilation_config.cudagraph_mode,
CUDAGraphMode.NONE,
@@ -658,12 +624,9 @@ class TestNPUPlatform(TestBase):
def test_get_punica_wrapper(self):
result = self.platform.get_punica_wrapper()
if vllm_version_is("0.11.0"):
self.assertEqual(
result, "vllm_ascend.lora.punica_npu.PunicaWrapperNPU0110")
else:
self.assertEqual(result,
"vllm_ascend.lora.punica_npu.PunicaWrapperNPU")
self.assertEqual(result,
"vllm_ascend.lora.punica_npu.PunicaWrapperNPU")
@patch("torch.npu.reset_peak_memory_stats")
@patch("torch.npu.max_memory_allocated")
@@ -742,16 +705,11 @@ class TestNPUPlatform(TestBase):
self.assertTrue(
"PIECEWISE compilation enabled on NPU. use_inductor not supported - "
"using only ACL Graph mode" in cm.output[0])
if vllm_version_is("0.11.0"):
self.assertEqual(
VllmConfig.compilation_config.level,
CompilationLevel.PIECEWISE,
)
else:
self.assertEqual(
VllmConfig.compilation_config.mode,
CompilationMode.VLLM_COMPILE,
)
self.assertEqual(
VllmConfig.compilation_config.mode,
CompilationMode.VLLM_COMPILE,
)
self.assertEqual(
VllmConfig.compilation_config.cudagraph_mode,
CUDAGraphMode.PIECEWISE,

View File

@@ -274,46 +274,8 @@ class TestUtils(TestBase):
utils.update_aclgraph_sizes(test_vllm_config)
del os.environ['HCCL_OP_EXPANSION_MODE']
if utils.vllm_version_is("0.11.0"):
self.assertEqual(
137,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes
))
else:
self.assertEqual(
0,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes
))
return
test_vllm_config.speculative_config = mock.MagicMock()
test_vllm_config.speculative_config.num_speculative_tokens = 2
test_vllm_config.speculative_config.draft_model_config = mock.MagicMock(
)
test_vllm_config.speculative_config.draft_model_config.hf_config = mock.MagicMock(
)
test_vllm_config.speculative_config.draft_model_config.hf_config.num_hidden_layers = 2
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
utils.update_aclgraph_sizes(test_vllm_config)
del os.environ['HCCL_OP_EXPANSION_MODE']
self.assertEqual(
111,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
# max_num_batch_sizes >= len(original_sizes)
test_compilation_config = CompilationConfig(
cudagraph_capture_sizes=[1, 2, 3])
test_vllm_config = VllmConfig(
model_config=test_model_config,
compilation_config=test_compilation_config,
parallel_config=test_parallel_config,
)
utils.update_aclgraph_sizes(test_vllm_config)
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
utils.update_aclgraph_sizes(test_vllm_config)
del os.environ['HCCL_OP_EXPANSION_MODE']
self.assertEqual(
3,
0,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
@mock.patch("vllm.model_executor.custom_op.CustomOp")

View File

@@ -7,7 +7,6 @@ from vllm.config import CacheConfig, VllmConfig
from tests.ut.base import PytestBase
from vllm_ascend.torchair.torchair_mtp_proposer import TorchairMtpProposer
from vllm_ascend.utils import vllm_version_is
class TestTorchairMtpProposer(PytestBase):
@@ -40,14 +39,8 @@ class TestTorchairMtpProposer(PytestBase):
mocker.patch(
"vllm_ascend.torchair.torchair_mtp_proposer.MtpProposer.__init__",
return_value=None)
if vllm_version_is("0.11.0"):
mock_set_default_dtype = mocker.patch(
'vllm.model_executor.model_loader.utils.set_default_torch_dtype'
)
else:
mock_set_default_dtype = mocker.patch(
'vllm.utils.torch_utils.set_default_torch_dtype')
mock_set_default_dtype = mocker.patch(
'vllm.utils.torch_utils.set_default_torch_dtype')
mock_set_default_dtype.return_value.__enter__.return_value = None
mock_model_loader = MagicMock()

View File

@@ -4,10 +4,8 @@ import torch
from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
from tests.ut.base import TestBase
from vllm_ascend.utils import vllm_version_is
init_cache_hf_modules_path = "vllm.utils.init_cached_hf_modules" if vllm_version_is(
"0.11.0") else "vllm.utils.import_utils.init_cached_hf_modules"
init_cache_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules"
class TestNPUTorchairWorker(TestBase):

View File

@@ -20,19 +20,14 @@ import numpy as np
import pytest
import torch
from vllm.sampling_params import SamplingParams
from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import LogitsProcessors
from vllm.v1.sample.metadata import SamplingMetadata
from vllm_ascend.utils import vllm_version_is
from vllm_ascend.worker.block_table import BlockTable, MultiGroupBlockTable
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
if vllm_version_is("0.11.0"):
from vllm.utils import make_tensor_with_pad
else:
from vllm.utils.torch_utils import make_tensor_with_pad
VOCAB_SIZE = 1024
NUM_OUTPUT_TOKENS = 20
MAX_PROMPT_SIZE = 100

View File

@@ -6,10 +6,8 @@ import torch
from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
from tests.ut.base import TestBase
from vllm_ascend.utils import vllm_version_is
init_cached_hf_modules_path = "vllm.utils.init_cached_hf_modules" if vllm_version_is(
"0.11.0") else "vllm.utils.import_utils.init_cached_hf_modules"
init_cached_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules"
class TestNPUWorker(TestBase):
@@ -189,26 +187,15 @@ class TestNPUWorker(TestBase):
# Create NPUWorker instance
from vllm_ascend.worker.worker_v1 import NPUWorker
if vllm_version_is("0.11.0"):
with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE",
{"float32": torch.float32}):
worker = NPUWorker(
vllm_config=self.vllm_config_mock,
local_rank=self.local_rank,
rank=self.rank,
distributed_init_method=self.distributed_init_method,
is_driver_worker=self.is_driver_worker,
)
else:
with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE",
{"float32": torch.float32}):
worker = NPUWorker(
vllm_config=self.vllm_config_mock,
local_rank=self.local_rank,
rank=self.rank,
distributed_init_method=self.distributed_init_method,
is_driver_worker=self.is_driver_worker,
)
with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE",
{"float32": torch.float32}):
worker = NPUWorker(
vllm_config=self.vllm_config_mock,
local_rank=self.local_rank,
rank=self.rank,
distributed_init_method=self.distributed_init_method,
is_driver_worker=self.is_driver_worker,
)
# Verify cache_dtype is set to custom value
self.assertEqual(worker.cache_dtype, torch.float32)