upgrade vLLM to main (#4608)
1. fix https://github.com/vllm-project/vllm/pull/28542 The model structures we modified are: - Qwen2.5-VL (some patches still exist) - Qwen2-VL - Qwen2 - DeepSeek series - Qwen-moe series 2. fix https://github.com/vllm-project/vllm/pull/29121 the output token type has changed from np arrays to `list[list[int]]` 3. fix https://github.com/vllm-project/vllm/pull/29262 the `xformers` backend for multimodal has now been deprecated 4. fix https://github.com/vllm-project/vllm/pull/29342 5. fix https://github.com/vllm-project/vllm/pull/28579 6. fix https://github.com/vllm-project/vllm/pull/28718 7. fix https://github.com/vllm-project/vllm/issues/28665 8. fix https://github.com/vllm-project/vllm/pull/26847 vLLM introduced the `optimization-level` option, some default configs have changed, and the param `--enforce-eager` has been deprecated 9. fix https://github.com/vllm-project/vllm/pull/29223 it returns a tuple for the sampler. 10. fix https://github.com/vllm-project/vllm/pull/29471 we'll remove the related patch to avoid this kind of error. Co-authored-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: wangli <wangli858794774@gmail.com> - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -191,7 +191,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_vllm_config.scheduler_config.enable_chunked_prefill = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_dcp.world_size = 1
|
||||
@@ -213,7 +213,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
mock_vllm_config.cache_config.block_size)
|
||||
self.assertEqual(
|
||||
builder.chunked_prefill_enabled,
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
|
||||
mock_vllm_config.scheduler_config.enable_chunked_prefill)
|
||||
|
||||
@patch('vllm.distributed.parallel_state.get_dcp_group')
|
||||
@patch('vllm.distributed.parallel_state._DCP',
|
||||
@@ -230,7 +230,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_vllm_config.scheduler_config.enable_chunked_prefill = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_dcp.world_size = 1
|
||||
@@ -254,7 +254,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
mock_vllm_config.cache_config.block_size)
|
||||
self.assertEqual(
|
||||
builder.chunked_prefill_enabled,
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
|
||||
mock_vllm_config.scheduler_config.enable_chunked_prefill)
|
||||
|
||||
@patch('vllm.distributed.parallel_state.get_dcp_group')
|
||||
@patch('vllm.distributed.parallel_state._DCP',
|
||||
@@ -321,7 +321,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_vllm_config.scheduler_config.enable_chunked_prefill = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_dcp.world_size = 1
|
||||
@@ -440,8 +440,10 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
|
||||
self.mock_vllm_config.model_config = ModelConfig(max_model_len=2048)
|
||||
self.mock_vllm_config.model_config.hf_text_config.qk_rope_head_dim = 32
|
||||
self.mock_vllm_config.cache_config = CacheConfig(block_size=32)
|
||||
self.mock_vllm_config.scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=8, chunked_prefill_enabled=True)
|
||||
mock_scheduler_config = MagicMock(spec=SchedulerConfig)
|
||||
mock_scheduler_config.max_num_seqs = 8
|
||||
mock_scheduler_config.chunked_prefill_enabled = True
|
||||
self.mock_vllm_config.scheduler_config = mock_scheduler_config
|
||||
self.mock_vllm_config.speculative_config = None
|
||||
self.mock_device = torch.device("cpu")
|
||||
|
||||
@@ -454,12 +456,20 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
|
||||
"vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size"
|
||||
)
|
||||
@patch("vllm_ascend.attention.mla_v1.get_ascend_config")
|
||||
def test_build_prefix_no_cache_metadata(self, mock_get_ascend_config,
|
||||
@patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros)
|
||||
@patch("torch.Tensor.npu", new=lambda self: self)
|
||||
@patch("torch.npu.is_available")
|
||||
def test_build_prefix_no_cache_metadata(self, mock_npu_available,
|
||||
mock_zeros, mock_get_ascend_config,
|
||||
mock_dcp_world_size):
|
||||
if not torch.npu.is_available():
|
||||
self.skipTest("NPU not available, skipping NPU-dependent tests")
|
||||
mock_npu_available.return_value = False
|
||||
mock_dcp_world_size.return_value = 1
|
||||
|
||||
def zeros_override(*args, **kwargs):
|
||||
kwargs.pop('pin_memory', None)
|
||||
return mock_zeros._mock_wraps(*args, **kwargs)
|
||||
|
||||
mock_zeros.side_effect = zeros_override
|
||||
common_attn_metadata = AscendCommonAttentionMetadata(
|
||||
query_start_loc=torch.tensor([0, 3, 7]),
|
||||
query_start_loc_cpu=torch.tensor([0, 3, 7]),
|
||||
@@ -506,12 +516,21 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
|
||||
"vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size"
|
||||
)
|
||||
@patch("vllm_ascend.attention.mla_v1.get_ascend_config")
|
||||
def test_build_chunked_prefix_metadata(self, mock_get_ascend_config,
|
||||
@patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros)
|
||||
@patch("torch.Tensor.npu", new=lambda self: self)
|
||||
@patch("torch.npu.is_available")
|
||||
def test_build_chunked_prefix_metadata(self, mock_npu_available,
|
||||
mock_zeros, mock_get_ascend_config,
|
||||
mock_dcp_world_size):
|
||||
if not torch.npu.is_available():
|
||||
self.skipTest("NPU not available, skipping NPU-dependent tests")
|
||||
mock_npu_available.return_value = False
|
||||
mock_dcp_world_size.return_value = 1
|
||||
|
||||
def zeros_override(*args, **kwargs):
|
||||
kwargs.pop('pin_memory', None)
|
||||
return mock_zeros._mock_wraps(*args, **kwargs)
|
||||
|
||||
mock_zeros.side_effect = zeros_override
|
||||
|
||||
common_attn_metadata = AscendCommonAttentionMetadata(
|
||||
query_start_loc=torch.tensor([0, 2, 5, 9]),
|
||||
query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
|
||||
|
||||
@@ -32,7 +32,7 @@ class TestACLGraphEntry(TestBase):
|
||||
"""Test ACLGraphEntry initialization with default values"""
|
||||
batch_descriptor = BatchDescriptor(
|
||||
num_tokens=30,
|
||||
uniform_decode=False,
|
||||
uniform=False,
|
||||
)
|
||||
|
||||
entry = ACLGraphEntry(batch_descriptor=batch_descriptor)
|
||||
@@ -46,7 +46,7 @@ class TestACLGraphEntry(TestBase):
|
||||
"""Test ACLGraphEntry initialization with specified values"""
|
||||
batch_descriptor = BatchDescriptor(
|
||||
num_tokens=30,
|
||||
uniform_decode=False,
|
||||
uniform=False,
|
||||
)
|
||||
|
||||
mock_graph = MagicMock()
|
||||
@@ -89,7 +89,7 @@ class TestACLGraphWrapper(TestBase):
|
||||
# Mock BatchDescriptor
|
||||
self.mock_batch_descriptor = BatchDescriptor(
|
||||
num_tokens=30,
|
||||
uniform_decode=False,
|
||||
uniform=False,
|
||||
)
|
||||
|
||||
# Mock ForwardContext
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||
@@ -81,9 +81,7 @@ def make_output(scheduler):
|
||||
req.request_id: i
|
||||
for i, req in enumerate(scheduler.running)
|
||||
}
|
||||
sampled_token_ids = [
|
||||
np.array([1000], dtype=np.int64) for _ in scheduler.running
|
||||
]
|
||||
sampled_token_ids = [[1000]] * len(scheduler.running)
|
||||
|
||||
logprobs = None
|
||||
|
||||
@@ -98,6 +96,7 @@ def make_output(scheduler):
|
||||
return modelrunner_output
|
||||
|
||||
|
||||
@pytest.mark.skip("Ascend Scheduler has been deprecated")
|
||||
class TestAscendScheduler(TestBase):
|
||||
|
||||
@patch("vllm.config.ModelConfig.__post_init__", MagicMock())
|
||||
@@ -372,8 +371,7 @@ class TestAscendScheduler(TestBase):
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID]),
|
||||
np.array([10, 11])
|
||||
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
|
||||
], # First request hits EOS, second continues
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
@@ -424,9 +422,8 @@ class TestAscendScheduler(TestBase):
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[np.array([10, 42, 12]),
|
||||
np.array([13, 14])
|
||||
], # First request hits stop token
|
||||
sampled_token_ids=[[10, 42, 12],
|
||||
[13, 14]], # First request hits stop token
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -475,9 +472,8 @@ class TestAscendScheduler(TestBase):
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[np.array([10, 11, 12]),
|
||||
np.array([13])
|
||||
], # First request exceeds max_tokens
|
||||
sampled_token_ids=[[10, 11, 12],
|
||||
[13]], # First request exceeds max_tokens
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -516,7 +512,7 @@ class TestAscendScheduler(TestBase):
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
|
||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -573,7 +569,7 @@ class TestAscendScheduler(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[np.array([0], dtype=np.int64)],
|
||||
sampled_token_ids=[[0]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -589,7 +585,7 @@ class TestAscendScheduler(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[1].request_id],
|
||||
req_id_to_index={requests[1].request_id: 0},
|
||||
sampled_token_ids=[np.array([0], dtype=np.int64)],
|
||||
sampled_token_ids=[[0]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -607,12 +603,10 @@ class TestAscendScheduler(TestBase):
|
||||
spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
|
||||
[[1, 2], [3]], [[1]], [[]],
|
||||
[[1, 2, 3], [4, 5, 6]]]
|
||||
output_tokens_list: List[List[List[int]]] = [
|
||||
[np.array([1, 2, 3, 4])], [np.array([1, 5])],
|
||||
[np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
|
||||
[np.array([5])], [np.array([1, 2, 7]),
|
||||
np.array([4, 8])]
|
||||
]
|
||||
output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
|
||||
[[1, 2, 5], [3, 4]],
|
||||
[[1, 2]], [[5]],
|
||||
[[1, 2, 7], [4, 8]]]
|
||||
expected_list: List[Tuple[int, int,
|
||||
int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
|
||||
(1, 3, 1, [1, 0, 0]),
|
||||
@@ -650,9 +644,7 @@ class TestAscendScheduler(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=[
|
||||
np.array([0]) for _ in range(len(requests))
|
||||
],
|
||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -892,11 +884,13 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
torch.float32, False))
|
||||
],
|
||||
)
|
||||
kv_cache_config.hash_block_size = block_size
|
||||
cache_config.num_gpu_blocks = 10000
|
||||
|
||||
scheduler = SchedulerDynamicBatch(
|
||||
vllm_config=vllm_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
block_size=block_size,
|
||||
log_stats=True,
|
||||
structured_output_manager=MagicMock(spec=StructuredOutputManager),
|
||||
)
|
||||
@@ -1064,8 +1058,7 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID]),
|
||||
np.array([10, 11])
|
||||
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
|
||||
], # First request hits EOS, second continues
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
@@ -1116,9 +1109,8 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[np.array([10, 42, 12]),
|
||||
np.array([13, 14])
|
||||
], # First request hits stop token
|
||||
sampled_token_ids=[[10, 42, 12],
|
||||
[13, 14]], # First request hits stop token
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1167,9 +1159,8 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[np.array([10, 11, 12]),
|
||||
np.array([13])
|
||||
], # First request exceeds max_tokens
|
||||
sampled_token_ids=[[10, 11, 12],
|
||||
[13]], # First request exceeds max_tokens
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1208,7 +1199,7 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
|
||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1265,7 +1256,7 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[np.array([0])],
|
||||
sampled_token_ids=[[0]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1281,7 +1272,7 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[1].request_id],
|
||||
req_id_to_index={requests[1].request_id: 0},
|
||||
sampled_token_ids=[np.array([0])],
|
||||
sampled_token_ids=[[0]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1299,12 +1290,10 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
|
||||
[[1, 2], [3]], [[1]], [[]],
|
||||
[[1, 2, 3], [4, 5, 6]]]
|
||||
output_tokens_list: List[List[List[int]]] = [
|
||||
[np.array([1, 2, 3, 4])], [np.array([1, 5])],
|
||||
[np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
|
||||
[np.array([5])], [np.array([1, 2, 7]),
|
||||
np.array([4, 8])]
|
||||
]
|
||||
output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
|
||||
[[1, 2, 5], [3, 4]],
|
||||
[[1, 2]], [[5]],
|
||||
[[1, 2, 7], [4, 8]]]
|
||||
expected_list: List[Tuple[int, int,
|
||||
int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
|
||||
(1, 3, 1, [1, 0, 0]),
|
||||
@@ -1342,9 +1331,7 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=[
|
||||
np.array([0]) for _ in range(len(requests))
|
||||
],
|
||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
import os
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
|
||||
@@ -189,7 +188,7 @@ def create_model_runner_output(
|
||||
|
||||
# Make sampled tokens.
|
||||
sampled_token = EOS_TOKEN_ID if use_eos else 0
|
||||
sampled_token_ids = [np.array([sampled_token]) for _ in req_ids]
|
||||
sampled_token_ids = [[sampled_token] for _ in req_ids]
|
||||
|
||||
# Make output data structure.
|
||||
extra_args = {}
|
||||
|
||||
@@ -224,7 +224,6 @@ class TestEagleProposerGenerateTokenIds(TestBase):
|
||||
|
||||
def test_generate_token_ids_without_metadata(self):
|
||||
valid_sampled = [[20, 30, 40]]
|
||||
valid_sampled = [np.array(sublist) for sublist in valid_sampled]
|
||||
scheduler_output = MagicMock()
|
||||
scheduler_output.num_scheduled_tokens = [2, 1, 3]
|
||||
positions = torch.tensor([0, 1, 2, 3, 4, 5])
|
||||
@@ -251,7 +250,6 @@ class TestEagleProposerGenerateTokenIds(TestBase):
|
||||
|
||||
def test_generate_token_ids_with_metadata(self):
|
||||
valid_sampled = [[5], [6, 7], [8, 9, 10]]
|
||||
valid_sampled = [np.array(sublist) for sublist in valid_sampled]
|
||||
spec_metadata = MagicMock()
|
||||
spec_metadata.num_draft_tokens = [2, 3, 4]
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ import torch
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.config import CacheConfig
|
||||
from vllm.distributed.parallel_state import GroupCoordinator
|
||||
from vllm.transformers_utils.config import patch_rope_parameters
|
||||
|
||||
from vllm_ascend.torchair.models.torchair_deepseek_v2 import (
|
||||
TorchairDeepseekV2DecoderLayer, TorchairDeepseekV2ForCausalLM,
|
||||
@@ -59,6 +60,7 @@ def base_config():
|
||||
topk_group=1,
|
||||
vocab_size=10000,
|
||||
)
|
||||
patch_rope_parameters(config)
|
||||
return config
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torch import nn
|
||||
from vllm.distributed.parallel_state import GroupCoordinator
|
||||
@@ -180,17 +181,19 @@ class TestAscendMLATorchairMetadata(TestBase):
|
||||
class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
|
||||
def test_ascend_mla_metadata_builder_default(self):
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.model_config.get_head_size.return_value = 64
|
||||
mock_vllm_config.model_config.dtype = torch.float16
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
ascend_config = MagicMock()
|
||||
ascend_config.torchair_graph_config = MagicMock()
|
||||
ascend_config.torchair_graph_config.enabled = True
|
||||
@@ -204,22 +207,25 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
mock_vllm_config.cache_config.block_size)
|
||||
self.assertEqual(
|
||||
builder.chunked_prefill_enabled,
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
|
||||
mock_vllm_config.scheduler_config.enable_chunked_prefill)
|
||||
self.assertEqual(builder.torchair_graph_enabled, True)
|
||||
|
||||
@patch("vllm_ascend.torchair.torchair_mla.get_ascend_config")
|
||||
def test_reorder_batch_with_torchair_graph(self, ascend_config):
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
ascend_config.torchair_graph_config = MagicMock()
|
||||
ascend_config.torchair_graph_config.enabled = True
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
@@ -248,15 +254,20 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
ascend_config.torchair_graph_config = MagicMock()
|
||||
ascend_config.torchair_graph_config.enabled = False
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
|
||||
with patch("vllm_ascend.torchair.torchair_mla.get_ascend_config",
|
||||
return_value=ascend_config):
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
@@ -287,14 +298,21 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
ascend_config = MagicMock()
|
||||
mock_ascend_config.return_value = ascend_config
|
||||
ascend_config.torchair_graph_config.enabled = False
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
@@ -305,19 +323,26 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
self.assertEqual(result.shape[1], 64)
|
||||
self.assertTrue(torch.equal(result[:, :10], block_tables))
|
||||
|
||||
@pytest.mark.skip(reason="Skipping this test temporarily.")
|
||||
@patch("vllm_ascend.torchair.torchair_mla.get_ascend_config")
|
||||
def test_get_graph_runner_block_tables_truncated(self, mock_ascend_config):
|
||||
ascend_config = MagicMock()
|
||||
mock_ascend_config.return_value = ascend_config
|
||||
ascend_config.torchair_graph_config.enabled = False
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 64
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
@@ -334,14 +359,21 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
ascend_config = MagicMock()
|
||||
mock_ascend_config.return_value = ascend_config
|
||||
ascend_config.torchair_graph_config.enabled = False
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
@@ -360,16 +392,20 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
mock_ascend_config.return_value = ascend_config
|
||||
ascend_config.torchair_graph_config.enabled = False
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_vllm_config.get_head_size.return_value = 64
|
||||
mock_vllm_config.model_config.dtype = torch.float16
|
||||
mock_device = 'cpu'
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(
|
||||
None,
|
||||
None,
|
||||
@@ -427,18 +463,23 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
mock_ascend_config.return_value = ascend_config
|
||||
ascend_config.torchair_graph_config.enabled = False
|
||||
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_vllm_config.get_head_size.return_value = 64
|
||||
mock_vllm_config.model_config.dtype = torch.float16
|
||||
mock_device = 'cpu'
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
|
||||
model = MagicMock(spec=nn.Module)
|
||||
model.model = MagicMock(spec=nn.Module)
|
||||
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(
|
||||
None,
|
||||
None,
|
||||
|
||||
@@ -176,17 +176,19 @@ class TestAscendSFATorchairMetadata(TestBase):
|
||||
class TestAscendSFATorchairMetadataBuilder(TestBase):
|
||||
|
||||
def test_ascend_sfa_metadata_builder_default(self):
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.model_config.get_head_size.return_value = 64
|
||||
mock_vllm_config.model_config.dtype = torch.float16
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
ascend_config = MagicMock()
|
||||
ascend_config.torchair_graph_config = MagicMock()
|
||||
ascend_config.torchair_graph_config.enabled = True
|
||||
@@ -200,7 +202,7 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
|
||||
mock_vllm_config.cache_config.block_size)
|
||||
self.assertEqual(
|
||||
builder.chunked_prefill_enabled,
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
|
||||
mock_vllm_config.scheduler_config.enable_chunked_prefill)
|
||||
self.assertEqual(builder.torchair_graph_enabled, True)
|
||||
self.assertEqual(builder.max_blocks, (mock_vllm_config.model_config.max_model_len +
|
||||
mock_vllm_config.cache_config.block_size - 1) \
|
||||
@@ -208,17 +210,22 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
|
||||
|
||||
@patch("vllm_ascend.torchair.torchair_sfa.get_ascend_config")
|
||||
def test_reorder_batch_with_torchair_graph(self, ascend_config):
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
ascend_config.torchair_graph_config = MagicMock()
|
||||
ascend_config.torchair_graph_config.enabled = True
|
||||
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
builder = AscendSFATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
@@ -247,13 +254,18 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
|
||||
ascend_config = MagicMock()
|
||||
mock_ascend_config.return_value = ascend_config
|
||||
ascend_config.torchair_graph_config.enabled = False
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
mock_device = torch.device('cpu')
|
||||
|
||||
builder = AscendSFATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
@@ -270,18 +282,25 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
|
||||
ascend_config = MagicMock()
|
||||
mock_ascend_config.return_value = ascend_config
|
||||
ascend_config.torchair_graph_config.enabled = False
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 64
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
|
||||
builder = AscendSFATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
|
||||
builder.max_blocks = 4
|
||||
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
|
||||
|
||||
result = builder._get_graph_runner_block_tables(3, block_tables)
|
||||
@@ -295,14 +314,19 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
|
||||
ascend_config = MagicMock()
|
||||
mock_ascend_config.return_value = ascend_config
|
||||
ascend_config.torchair_graph_config.enabled = False
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
mock_model_config = MagicMock()
|
||||
mock_model_config.max_model_len = 1024
|
||||
mock_model_config.get_head_size.return_value = 64
|
||||
mock_model_config.dtype = torch.float16
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = MagicMock(block_size=16)
|
||||
mock_vllm_config.scheduler_config = MagicMock(
|
||||
max_num_seqs=4, enable_chunked_prefill=False)
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
mock_device = torch.device('cpu')
|
||||
builder = AscendSFATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
|
||||
Reference in New Issue
Block a user