#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
"""Compare the short outputs of HF and vLLM when using greedy sampling.

Run `pytest tests/multicard/test_torchair_graph_mode.py`.
"""
import os
from typing import Dict

from tests.e2e.conftest import VllmRunner
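
# NOTE: PYTORCH_NPU_ALLOC_CONF appears to mirror PYTORCH_CUDA_ALLOC_CONF here;
# capping max_split_size_mb limits block splitting in the caching allocator,
# which helps avoid memory fragmentation during these multi-card runs.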
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"


def _deepseek_torchair_test_fixture(
    additional_config: Dict,
    *,
    tensor_parallel_size=2,
    use_v1_scheduler=False,
):
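    """Run the pruned DeepSeek-V3 checkpoint under the given torchair
    additional_config and check its greedy outputs against the recorded
    golden results.
    """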
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    kwargs = {}
    if not use_v1_scheduler:
        kwargs = {
            "ascend_scheduler_config": {
                "enabled": True,
            },
            "refresh": True,
        }
    additional_config.update(**kwargs)

    with VllmRunner(
            "vllm-ascend/DeepSeek-V3-Pruning",
            dtype="half",
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend="mp",
            enforce_eager=False,
            additional_config=additional_config,
    ) as vllm_model:
        # Use the greedy sampler so that the generated results are deterministic.
        vllm_output = vllm_model.generate_greedy(example_prompts, 5)

    # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random-weight checkpoint of
    # DeepSeek-V3 with only 2 hidden layers, so the golden results look
    # nonsensical. They only need updating if accuracy changes with the
    # official DeepSeek-V3 weights.
    golden_results = [
        'Hello, my name is下载早点向前很有่อง',
        'The president of the United States isSender)## physiological Albany',
        'The capital of France is Rocky转角 hospitalizedinterval sparked',
        'The future of AI is её asegο BIOS一扫',
    ]

    assert len(golden_results) == len(vllm_output)
    for i in range(len(vllm_output)):
        assert golden_results[i] == vllm_output[i][1]
        print(f"Generated text: {vllm_output[i][1]!r}")


def test_e2e_deepseekv3_with_torchair():
    additional_config = {
        "torchair_graph_config": {
            "enabled": True,
        },
    }
    _deepseek_torchair_test_fixture(additional_config)


def test_e2e_deepseekv3_with_torchair_ms_mla():
    additional_config = {
        "torchair_graph_config": {
            "enabled": True,
            "enable_multistream_mla": True,
        },
    }
    _deepseek_torchair_test_fixture(additional_config)


def test_e2e_deepseekv3_with_torchair_v1scheduler():
    additional_config = {
        "torchair_graph_config": {
            "enabled": True,
        },
    }
    _deepseek_torchair_test_fixture(additional_config, use_v1_scheduler=True)


def _pangu_torchair_test_fixture(
    additional_config: Dict,
    *,
    tensor_parallel_size=2,
):
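    """Run the pruned PanguProMoE checkpoint with expert parallelism under the
    given torchair additional_config and check its greedy outputs against the
    recorded golden results.
    """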
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # torchair graph mode currently only works without chunked prefill, so
    # enable the ascend scheduler here.
    kwargs = {
        "ascend_scheduler_config": {
            "enabled": True,
        },
        "refresh": True,
    }
    additional_config.update(**kwargs)

    with VllmRunner(
            "vllm-ascend/pangu-pro-moe-pruing",
            dtype="half",
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend="mp",
            enforce_eager=False,
            additional_config=additional_config,
            enable_expert_parallel=True,
    ) as vllm_model:
        # Use the greedy sampler so that the generated results are deterministic.
        vllm_output = vllm_model.generate_greedy(example_prompts, 5)

    # NOTE: vllm-ascend/pangu-pro-moe-pruing contains only part of PanguProMoE
    # (2 hidden layers), so the golden results look nonsensical. They only need
    # updating if accuracy changes with the official PanguProMoE weights.
    golden_results = [
        'Hello, my name is Remempondeprecatedmiot忱',
        'The president of the United States is Remem下的一个 rever ceremoni Segnali',
        'The capital of France is Rememvoud administrativ Remem投',
        'The future of AI isotope Segnali Zoeken精细化 supus',
    ]

    assert len(golden_results) == len(vllm_output)
    for i in range(len(vllm_output)):
        assert golden_results[i] == vllm_output[i][1]
        print(f"Generated text: {vllm_output[i][1]!r}")


def test_e2e_pangu_with_torchair():
    additional_config = {
        "torchair_graph_config": {
            "enabled": True,
        },
    }
    _pangu_torchair_test_fixture(additional_config)


def _qwen_torchair_test_fixture(
    model,
    tp,
    enable_expert_parallel,
):
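    """Run the given Qwen model in eager mode (torchair graph disabled) with
    the ascend scheduler and print its greedy outputs.
    """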
    # The current access control does not support 16 cards, so the MC2
    # operator in Qwen's graph mode cannot run. Once 16-card support is
    # available, this e2e test can be switched to graph mode.
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    additional_config = {
        "torchair_graph_config": {
            "enabled": False,
        },
        "ascend_scheduler_config": {
            "enabled": True,
        },
        "refresh": True,
    }

    with VllmRunner(
            model,
            dtype="half",
            tensor_parallel_size=tp,
            distributed_executor_backend="mp",
            enforce_eager=True,
            additional_config=additional_config,
            enable_expert_parallel=enable_expert_parallel,
    ) as vllm_model:
        # Use the greedy sampler so that the generated results are deterministic.
        vllm_output = vllm_model.generate_greedy(example_prompts, 5)

    # NOTE: these golden results are identical to the PanguProMoE fixture above
    # and are kept for reference only; the Qwen outputs are printed but not
    # compared against them.
    golden_results = [
        'Hello, my name is Remempondeprecatedmiot忱',
        'The president of the United States is Remem下的一个 rever ceremoni Segnali',
        'The capital of France is Rememvoud administrativ Remem投',
        'The future of AI isotope Segnali Zoeken精细化 supus',
    ]

    assert len(golden_results) == len(vllm_output)
    for i in range(len(vllm_output)):
        print(f"Generated text: {vllm_output[i][1]!r}")


def test_e2e_qwen2_with_torchair():
    _qwen_torchair_test_fixture("Qwen/Qwen2.5-0.5B-Instruct", 2, False)


def test_e2e_qwen3_moe_with_torchair():
    _qwen_torchair_test_fixture("Qwen/Qwen3-30B-A3B", 2, True)