[CI] drop ascend scheduler test (#4582)
Let's drop the ascend scheduler tests first to ensure that all functionality works without it. - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
|
||||
max_tokens = 5
|
||||
|
||||
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
tensor_parallel_size=2,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": True
|
||||
}},
|
||||
enforce_eager=False) as vllm_model:
|
||||
with VllmRunner(model_name, tensor_parallel_size=2,
|
||||
enforce_eager=False) as vllm_model:
|
||||
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
tensor_parallel_size=2,
|
||||
enable_expert_parallel=True,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": True
|
||||
}},
|
||||
enforce_eager=False) as vllm_model:
|
||||
with VllmRunner(model_name,
|
||||
tensor_parallel_size=2,
|
||||
enable_expert_parallel=True,
|
||||
enforce_eager=False) as vllm_model:
|
||||
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
|
||||
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
|
||||
tensor_parallel_size=2,
|
||||
max_model_len=1024,
|
||||
dtype="auto",
|
||||
enable_expert_parallel=True,
|
||||
additional_config={
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
"chunked_prefill_enabled": False,
|
||||
},
|
||||
}) as vllm_model:
|
||||
enable_expert_parallel=True) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
|
||||
tensor_parallel_size=2,
|
||||
max_model_len=1024,
|
||||
dtype="auto",
|
||||
enable_expert_parallel=True,
|
||||
additional_config={
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
"chunked_prefill_enabled": False,
|
||||
},
|
||||
}) as vllm_model:
|
||||
enable_expert_parallel=True) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
|
||||
"enabled": True,
|
||||
},
|
||||
"enable_multistream_moe": True,
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True,
|
||||
},
|
||||
) as vllm_model:
|
||||
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
|
||||
quantization="ascend",
|
||||
enforce_eager=True,
|
||||
enable_expert_parallel=True,
|
||||
additional_config={
|
||||
"torchair_graph_config": {
|
||||
"enabled": False,
|
||||
},
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
}
|
||||
},
|
||||
additional_config={"torchair_graph_config": {
|
||||
"enabled": False,
|
||||
}},
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(prompts, max_tokens)
|
||||
|
||||
|
||||
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
|
||||
name_0="vllm_output",
|
||||
name_1="prefix_cache_output",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [50])
|
||||
def test_prefix_cache_with_ascend_scheduler(model: str,
|
||||
max_tokens: int) -> None:
|
||||
|
||||
with VllmRunner(model,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
},
|
||||
enforce_eager=False,
|
||||
max_model_len=2048,
|
||||
tensor_parallel_size=2,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
|
||||
|
||||
with VllmRunner(model,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
'enable_prefix_caching': True,
|
||||
},
|
||||
},
|
||||
enforce_eager=False,
|
||||
max_model_len=2048,
|
||||
tensor_parallel_size=2,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
prefix_cache_output = vllm_model.generate_greedy(
|
||||
INPUT_PROMPTS, max_tokens)
|
||||
|
||||
# TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
|
||||
# Disable it now. Fix it or drop the ascend scheduler in the future.
|
||||
# with VllmRunner(model,
|
||||
# additional_config={
|
||||
# 'ascend_scheduler_config': {
|
||||
# 'enabled': True,
|
||||
# 'enable_prefix_caching': True,
|
||||
# "enable_chunked_prefill": True,
|
||||
# },
|
||||
# },
|
||||
# enforce_eager=True,
|
||||
# max_model_len=2048,
|
||||
# tensor_parallel_size=2,
|
||||
# gpu_memory_utilization=0.7) as vllm_model:
|
||||
# chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
|
||||
# INPUT_PROMPTS, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_output,
|
||||
outputs_1_lst=prefix_cache_output,
|
||||
name_0="vllm_output",
|
||||
name_1="prefix_cache_output",
|
||||
)
|
||||
|
||||
# check_outputs_equal(
|
||||
# outputs_0_lst=chunk_prefill_prefix_cache_output,
|
||||
# outputs_1_lst=prefix_cache_output,
|
||||
# name_0="chunk_prefill_prefix_cache_output",
|
||||
# name_1="prefix_cache_output",
|
||||
# )
|
||||
|
||||
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
|
||||
gpu_memory_utilization=0.8,
|
||||
distributed_executor_backend="mp",
|
||||
enforce_eager=True,
|
||||
additional_config={
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
"enable_chunked_prefill": False
|
||||
}
|
||||
},
|
||||
speculative_config={
|
||||
"method": "qwen3_next_mtp",
|
||||
"num_speculative_tokens": 1
|
||||
|
||||
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
|
||||
kwargs = {}
|
||||
if not use_v1_schduler:
|
||||
kwargs = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True,
|
||||
}
|
||||
additional_config.update(**kwargs)
|
||||
@@ -121,9 +118,6 @@ def _pangu_torchair_test_fixture(
|
||||
|
||||
# torchair is only work without chunked-prefill now
|
||||
kwargs = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True,
|
||||
}
|
||||
additional_config.update(**kwargs)
|
||||
@@ -186,9 +180,6 @@ def _qwen_torchair_test_fixture(
|
||||
"torchair_graph_config": {
|
||||
"enabled": False,
|
||||
},
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True,
|
||||
}
|
||||
|
||||
@@ -245,9 +236,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
|
||||
kwargs = {}
|
||||
if not use_v1_schduler:
|
||||
kwargs = {
|
||||
"ascend_scheduler_config": {
|
||||
"enable": True,
|
||||
},
|
||||
"refresh": True,
|
||||
}
|
||||
additional_config.update(**kwargs)
|
||||
|
||||
Reference in New Issue
Block a user