[CI] speed up ut (#4901)
Avoid downloading model weights during unit tests (UT) so that the UT suite runs faster.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -192,3 +192,22 @@ def test_output_between_eager_and_full_decode_only(
|
|||||||
name_0="vllm_eager_outputs",
|
name_0="vllm_eager_outputs",
|
||||||
name_1="vllm_aclgraph_outputs",
|
name_1="vllm_aclgraph_outputs",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_aclgraph_enable():
|
||||||
|
# Strictly speaking, this test does not belong in e2e, but it is a good way to check if
|
||||||
|
# aclgraph is enabled in real environment
|
||||||
|
from vllm.config.compilation import CompilationMode, CUDAGraphMode
|
||||||
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
|
|
||||||
|
from vllm_ascend.platform import NPUPlatform
|
||||||
|
|
||||||
|
# vLLM default mode is piecewise cudagraph
|
||||||
|
config = EngineArgs()
|
||||||
|
VllmConfig = config.create_engine_config()
|
||||||
|
assert VllmConfig.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
|
||||||
|
|
||||||
|
# after check_and_update_config, mode should be VLLM_COMPILE and piecewise cudagraph
|
||||||
|
NPUPlatform.check_and_update_config(VllmConfig)
|
||||||
|
assert VllmConfig.compilation_config.mode == CompilationMode.VLLM_COMPILE
|
||||||
|
assert VllmConfig.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import os
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
@@ -507,8 +508,6 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.mock_vllm_config = MagicMock(spec=VllmConfig)
|
self.mock_vllm_config = MagicMock(spec=VllmConfig)
|
||||||
self.mock_vllm_config.model_config = ModelConfig(max_model_len=2048)
|
|
||||||
self.mock_vllm_config.model_config.hf_text_config.qk_rope_head_dim = 32
|
|
||||||
self.mock_vllm_config.cache_config = CacheConfig(block_size=32)
|
self.mock_vllm_config.cache_config = CacheConfig(block_size=32)
|
||||||
mock_scheduler_config = MagicMock(spec=SchedulerConfig)
|
mock_scheduler_config = MagicMock(spec=SchedulerConfig)
|
||||||
mock_scheduler_config.max_num_seqs = 8
|
mock_scheduler_config.max_num_seqs = 8
|
||||||
@@ -516,7 +515,15 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
|
|||||||
self.mock_vllm_config.scheduler_config = mock_scheduler_config
|
self.mock_vllm_config.scheduler_config = mock_scheduler_config
|
||||||
self.mock_vllm_config.speculative_config = None
|
self.mock_vllm_config.speculative_config = None
|
||||||
self.mock_device = torch.device("cpu")
|
self.mock_device = torch.device("cpu")
|
||||||
|
fake_weight_path = os.path.join(os.path.dirname(__file__), "..",
|
||||||
|
"fake_weight")
|
||||||
|
model_config = ModelConfig(
|
||||||
|
model=fake_weight_path,
|
||||||
|
skip_tokenizer_init=True,
|
||||||
|
)
|
||||||
|
model_config.hf_text_config.head_dim = 128
|
||||||
|
model_config.hf_text_config.qk_rope_head_dim = 32
|
||||||
|
self.mock_vllm_config.model_config = model_config
|
||||||
self.kv_cache_spec = MagicMock()
|
self.kv_cache_spec = MagicMock()
|
||||||
self.kv_cache_spec.num_layers = 32
|
self.kv_cache_spec.num_layers = 32
|
||||||
self.kv_cache_spec.head_size = 128
|
self.kv_cache_spec.head_size = 128
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ from unittest.mock import MagicMock, patch
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from vllm.config.compilation import CompilationMode, CUDAGraphMode
|
from vllm.config.compilation import CompilationMode, CUDAGraphMode
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
|
||||||
from vllm.platforms import PlatformEnum
|
from vllm.platforms import PlatformEnum
|
||||||
|
|
||||||
from tests.ut.base import TestBase
|
from tests.ut.base import TestBase
|
||||||
@@ -579,30 +578,3 @@ class TestNPUPlatform(TestBase):
|
|||||||
self.platform.get_static_graph_wrapper_cls(),
|
self.platform.get_static_graph_wrapper_cls(),
|
||||||
"vllm_ascend.compilation.acl_graph.ACLGraphWrapper",
|
"vllm_ascend.compilation.acl_graph.ACLGraphWrapper",
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_aclgraph_enable(self):
|
|
||||||
config = EngineArgs()
|
|
||||||
VllmConfig = config.create_engine_config()
|
|
||||||
self.assertEqual(VllmConfig.compilation_config.cudagraph_mode,
|
|
||||||
CUDAGraphMode.PIECEWISE)
|
|
||||||
|
|
||||||
with self.assertLogs(logger="vllm", level="INFO") as cm:
|
|
||||||
from vllm_ascend import platform
|
|
||||||
|
|
||||||
importlib.reload(platform)
|
|
||||||
self.platform.check_and_update_config(VllmConfig)
|
|
||||||
target_msg = "PIECEWISE compilation enabled on NPU. use_inductor not supported - using only ACL Graph mode"
|
|
||||||
found = any(target_msg in log for log in cm.output)
|
|
||||||
|
|
||||||
self.assertTrue(
|
|
||||||
found,
|
|
||||||
f"Expected log message not found. Captured logs: {cm.output}")
|
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
VllmConfig.compilation_config.mode,
|
|
||||||
CompilationMode.VLLM_COMPILE,
|
|
||||||
)
|
|
||||||
self.assertEqual(
|
|
||||||
VllmConfig.compilation_config.cudagraph_mode,
|
|
||||||
CUDAGraphMode.PIECEWISE,
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from abc import abstractmethod
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from vllm.logger import logger
|
||||||
|
|
||||||
|
|
||||||
class DynamicConfig:
|
class DynamicConfig:
|
||||||
@@ -69,21 +70,21 @@ class DynamicEplbV2(EplbPolicy):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def safe_divide(a, b):
|
def safe_divide(a, b):
|
||||||
if b == 0:
|
if b == 0:
|
||||||
print("Division by zero is not allowed")
|
logger.info("Division by zero is not allowed")
|
||||||
return 0
|
return 0
|
||||||
return a / b
|
return a / b
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def safe_exact_divide(a, b):
|
def safe_exact_divide(a, b):
|
||||||
if b == 0:
|
if b == 0:
|
||||||
print("Division by zero is not allowed")
|
logger.info("Division by zero is not allowed")
|
||||||
return 0
|
return 0
|
||||||
return a // b
|
return a // b
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def safe_mod(a, b):
|
def safe_mod(a, b):
|
||||||
if b == 0:
|
if b == 0:
|
||||||
print("Division by zero is not allowed")
|
logger.info("Division by zero is not allowed")
|
||||||
return 0
|
return 0
|
||||||
return a % b
|
return a % b
|
||||||
|
|
||||||
|
|||||||
@@ -60,7 +60,7 @@ class ElasticClient:
|
|||||||
ip, port_str = source.split(':')
|
ip, port_str = source.split(':')
|
||||||
port = int(port_str)
|
port = int(port_str)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"IP format error: {source}, detail: {e}")
|
logger.info(f"IP format error: {source}, detail: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self.server_addr = ip
|
self.server_addr = ip
|
||||||
|
|||||||
@@ -80,5 +80,5 @@ def elastic_load(
|
|||||||
time.perf_counter() - t0))
|
time.perf_counter() - t0))
|
||||||
return model_loaded
|
return model_loaded
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"elastic_load error: {e}")
|
logger.info(f"elastic_load error: {e}")
|
||||||
return None
|
return None
|
||||||
|
|||||||
Reference in New Issue
Block a user