The Ascend scheduler was originally added for the non-chunked-prefill case, because the NPU ops did not work well with chunked prefill. Now that the ops work better with chunked prefill, it is time to remove the Ascend scheduler and use the vLLM default scheduler instead. - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
34 lines
1.1 KiB
Python
34 lines
1.1 KiB
Python
import pytest
|
|
|
|
from tests.e2e.conftest import VllmRunner
|
|
from tests.e2e.model_utils import check_outputs_equal
|
|
|
|
|
|
@pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
def test_e2e_ep_correctness(model_name):
    """Compare greedy generations with and without expert parallelism.

    The same prompts are generated twice with ``tensor_parallel_size=2`` and
    ``enforce_eager=False``: once with the runner's default settings and once
    with ``enable_expert_parallel=True``. Both results are then handed to
    ``check_outputs_equal`` (presumably it fails the test on any mismatch;
    see ``tests.e2e.model_utils`` for the exact comparison).

    Args:
        model_name: Model identifier supplied by the parametrize decorator.
    """
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    num_new_tokens = 5

    # FIXME: Chunked prefill has been observed to produce different results
    # here, which is unexpected and still needs further investigation.

    # Baseline run: tensor parallelism only.
    with VllmRunner(
            model_name,
            tensor_parallel_size=2,
            enforce_eager=False,
    ) as tp_runner:
        tp_output = tp_runner.generate_greedy(prompts, num_new_tokens)

    # Second run: identical settings plus expert parallelism.
    with VllmRunner(
            model_name,
            tensor_parallel_size=2,
            enable_expert_parallel=True,
            enforce_eager=False,
    ) as ep_runner:
        ep_output = ep_runner.generate_greedy(prompts, num_new_tokens)

    check_outputs_equal(
        outputs_0_lst=ep_output,
        outputs_1_lst=tp_output,
        name_0="ep_output",
        name_1="tp_output",
    )
|