drop ascend scheduler (#4498)
The Ascend scheduler was originally added for the non-chunked-prefill case, because the NPU ops did not work well with chunked prefill. Now that the ops work well with chunked prefill, it is time to remove the Ascend scheduler and use vLLM's default scheduler.

- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
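In practice, the migration for tests and scripts is to drop the `ascend_scheduler_config` block from `additional_config`. The sketch below illustrates the before/after pattern using the repo's `VllmRunner` test helper (as seen in the diff); the model name and prompt are placeholders, not values from this commit:

# Minimal migration sketch, assuming the VllmRunner helper from
# tests/e2e/conftest.py; model name and prompt are placeholders.
from tests.e2e.conftest import VllmRunner

example_prompts = ["Hello, my name is"]  # placeholder prompt
max_tokens = 5

# Before: the Ascend scheduler had to be enabled explicitly via additional_config.
with VllmRunner("Qwen/Qwen3-0.6B",  # placeholder model
                tensor_parallel_size=2,
                additional_config={"ascend_scheduler_config": {
                    "enabled": True
                }},
                enforce_eager=False) as vllm_model:
    output_old = vllm_model.generate_greedy(example_prompts, max_tokens)

# After: no ascend_scheduler_config at all; vLLM's default scheduler
# (with chunked prefill) is used on NPU.
with VllmRunner("Qwen/Qwen3-0.6B",  # placeholder model
                tensor_parallel_size=2,
                enforce_eager=False) as vllm_model:
    output_new = vllm_model.generate_greedy(example_prompts, max_tokens)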
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
     max_tokens = 5

     # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name, tensor_parallel_size=2,
+                    enforce_eager=False) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    enable_expert_parallel=True,
+                    enforce_eager=False) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     check_outputs_equal(
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "chunked_prefill_enabled": False,
-                },
-            }) as vllm_model:
+            enable_expert_parallel=True) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)

@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "chunked_prefill_enabled": False,
-                },
-            }) as vllm_model:
+            enable_expert_parallel=True) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
                     "enabled": True,
                 },
                 "enable_multistream_moe": True,
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                },
                 "refresh": True,
             },
     ) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
             quantization="ascend",
             enforce_eager=True,
             enable_expert_parallel=True,
-            additional_config={
-                "torchair_graph_config": {
-                    "enabled": False,
-                },
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                }
-            },
+            additional_config={"torchair_graph_config": {
+                "enabled": False,
+            }},
     ) as vllm_model:
         vllm_model.generate_greedy(prompts, max_tokens)

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""
+"""Compare the with and without prefix caching on V1 scheduler."""

 import pytest

@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
         name_0="vllm_output",
         name_1="prefix_cache_output",
     )
-
-
-@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [50])
-def test_prefix_cache_with_ascend_scheduler(model: str,
-                                            max_tokens: int) -> None:
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_prefix_caching': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
-
-    # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
-    # Disable it now. Fix it or drop the ascend scheduler in the future.
-    # with VllmRunner(model,
-    #                 additional_config={
-    #                     'ascend_scheduler_config': {
-    #                         'enabled': True,
-    #                         'enable_prefix_caching': True,
-    #                         "enable_chunked_prefill": True,
-    #                     },
-    #                 },
-    #                 enforce_eager=True,
-    #                 max_model_len=2048,
-    #                 tensor_parallel_size=2,
-    #                 gpu_memory_utilization=0.7) as vllm_model:
-    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
-    #         INPUT_PROMPTS, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=prefix_cache_output,
-        name_0="vllm_output",
-        name_1="prefix_cache_output",
-    )
-
-    # check_outputs_equal(
-    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
-    #     outputs_1_lst=prefix_cache_output,
-    #     name_0="chunk_prefill_prefix_cache_output",
-    #     name_1="prefix_cache_output",
-    # )
@@ -24,6 +24,7 @@ Run `pytest tests/e2e/multicard/test_qwen3_next.py`.
 import os
 from unittest.mock import patch

+import pytest
 from modelscope import snapshot_download  # type: ignore

 from tests.e2e.conftest import VllmRunner
@@ -63,6 +64,8 @@ def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
         del vllm_model


+@pytest.mark.skip(
+    reason="Qwen3-Next + MTP doesn't work with chunked prefill. Fix Me")
 def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
     example_prompts = [
         "Hello, my name is",
|
||||
gpu_memory_utilization=0.8,
|
||||
distributed_executor_backend="mp",
|
||||
enforce_eager=True,
|
||||
additional_config={
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
"enable_chunked_prefill": False
|
||||
}
|
||||
},
|
||||
speculative_config={
|
||||
"method": "qwen3_next_mtp",
|
||||
"num_speculative_tokens": 1
|
||||
|
||||
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
-            "ascend_scheduler_config": {
-                "enabled": True,
-            },
             "refresh": True,
         }
         additional_config.update(**kwargs)
@@ -120,9 +117,6 @@ def _pangu_torchair_test_fixture(

     # torchair is only work without chunked-prefill now
     kwargs = {
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
         "refresh": True,
     }
     additional_config.update(**kwargs)
@@ -185,9 +179,6 @@ def _qwen_torchair_test_fixture(
         "torchair_graph_config": {
             "enabled": False,
         },
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
         "refresh": True,
     }

@@ -244,9 +235,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
-            "ascend_scheduler_config": {
-                "enable": True,
-            },
             "refresh": True,
         }
         additional_config.update(**kwargs)