enginex-ascend-910-vllm/tests/e2e/multicard/test_expert_parallel.py

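"""End-to-end correctness test: expert parallel (EP) vs. tensor parallel (TP).

Generates greedy completions for the same prompts twice, once with plain TP
and once with EP enabled, and asserts that both configurations produce
identical outputs.
"""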
import pytest

from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal


@pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
def test_e2e_ep_correctness(model_name):
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    max_tokens = 5

    # FIXME: Strangely, chunked prefill can lead to different results here;
    # investigate further.
    # Baseline: greedy outputs with tensor parallelism only.
    with VllmRunner(
            model_name,
            tensor_parallel_size=2,
            additional_config={"ascend_scheduler_config": {
                "enabled": True
            }},
            enforce_eager=True) as vllm_model:
        tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    # Same configuration, but with expert parallelism enabled.
    with VllmRunner(
            model_name,
            tensor_parallel_size=2,
            enable_expert_parallel=True,
            additional_config={"ascend_scheduler_config": {
                "enabled": True
            }},
            enforce_eager=True) as vllm_model:
        ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=ep_output,
        outputs_1_lst=tp_output,
        name_0="ep_output",
        name_1="tp_output",
    )
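
For reference, a minimal sketch of what a helper like check_outputs_equal might do, assuming each output entry is an (output_ids, output_str) tuple as produced by generate_greedy in vLLM-style test utilities. This is an illustration of the comparison being performed, not the actual tests.e2e.model_utils implementation.

from typing import List, Sequence, Tuple

# Hypothetical alias: (token ids, decoded text) per prompt.
TokensText = Tuple[List[int], str]


def check_outputs_equal_sketch(
    *,
    outputs_0_lst: Sequence[TokensText],
    outputs_1_lst: Sequence[TokensText],
    name_0: str,
    name_1: str,
) -> None:
    assert len(outputs_0_lst) == len(outputs_1_lst)
    for idx, (out_0, out_1) in enumerate(zip(outputs_0_lst, outputs_1_lst)):
        ids_0, str_0 = out_0
        ids_1, str_1 = out_1
        fail_msg = (f"Prompt {idx}:"
                    f"\n{name_0}:\t{str_0!r}"
                    f"\n{name_1}:\t{str_1!r}")
        # Greedy decoding is deterministic, so both the token ids and the
        # decoded strings are expected to match exactly across the two runs.
        assert str_0 == str_1, fail_msg
        assert ids_0 == ids_1, fail_msg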