[CI] drop ascend scheduler test (#4582)
Let's drop the ascend scheduler test first to ensure all functions work without it. - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
|
||||
name_0="vllm_output",
|
||||
name_1="prefix_cache_output",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_prefix_cache_with_ascend_scheduler(model: str,
                                            max_tokens: int) -> None:
    """Compare greedy outputs of the ascend scheduler with and without
    prefix caching.

    The model is run twice through ``VllmRunner`` with the ascend scheduler
    enabled: once with the plain config and once with
    ``enable_prefix_caching`` set. Both results are handed to
    ``check_outputs_equal``.

    The test is currently skipped (see the ``skip`` marker's reason).

    Args:
        model: Model identifier, parametrized over ``MODELS``.
        max_tokens: Number of tokens passed to ``generate_greedy``.
    """
    # Engine settings shared by both runs; only the scheduler config differs.
    shared_runner_kwargs = dict(
        enforce_eager=False,
        max_model_len=2048,
        tensor_parallel_size=2,
        gpu_memory_utilization=0.7,
    )

    def _greedy_outputs(scheduler_config):
        # The runner is used as a context manager, so it is exited before
        # the outputs are returned and the next runner is created.
        with VllmRunner(
                model,
                additional_config={
                    'ascend_scheduler_config': scheduler_config,
                },
                **shared_runner_kwargs,
        ) as vllm_model:
            return vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

    vllm_output = _greedy_outputs({
        'enabled': True,
    })
    prefix_cache_output = _greedy_outputs({
        'enabled': True,
        'enable_prefix_caching': True,
    })

    # TODO: enabling APC together with chunked prefill on the ascend
    # scheduler leads to an accuracy problem, so that variant is disabled
    # for now. Fix it or drop the ascend scheduler in the future.
    # The disabled variant used the scheduler config
    #   {'enabled': True, 'enable_prefix_caching': True,
    #    "enable_chunked_prefill": True}
    # with enforce_eager=True (other engine settings unchanged) and compared
    # its result ("chunk_prefill_prefix_cache_output") against
    # "prefix_cache_output" via check_outputs_equal.

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=prefix_cache_output,
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )
|
||||
|
||||
Reference in New Issue
Block a user