[CORE] Initial support for torchair with non-MLA backend (#1506)
### What this PR does / why we need it? This PR supports torchair graph mode with the non-MLA backend on both 800IA2 and 300I Duo platforms. The main change is the addition of `attention_v1_torchair.py`, which supports the specific attention-related operations required by torchair. ### Does this PR introduce _any_ user-facing change? Before this PR, vLLM-Ascend only allowed DeepSeek to use torchair. Now it can also be used with Pangu. In addition, we add a list of supported models to control which types of models can use torchair. ### How was this patch tested? We have tested it with PanguProMoE on both 800IA2 and 300I Duo platforms, and the model generates answers normally. --------- Signed-off-by: angazenn <zengyanjia@huawei.com> Signed-off-by: tianyitang <tangtianyi4@huawei.com> Co-authored-by: angazenn <zengyanjia@huawei.com> Co-authored-by: tianyitang <tangtianyi4@huawei.com>
This commit is contained in:
@@ -165,3 +165,20 @@ def test_models_distributed_DeepSeek_W8A8():
|
||||
quantization="ascend",
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
def test_models_distributed_pangu():
    """Smoke-test greedy generation with the pruned Pangu Pro MoE checkpoint.

    The model is loaded in eager mode with tensor parallelism of 4 over the
    "mp" executor backend. The generated text is not checked; the test passes
    as long as generation completes without raising.
    """
    prompts = ["Hello, my name is"]
    token_budget = 5

    # Resolve the checkpoint first, then hand the local path to the runner.
    model_path = snapshot_download("vllm-ascend/pangu-pro-moe-pruing")
    runner_options = dict(
        max_model_len=8192,
        enforce_eager=True,
        dtype="auto",
        tensor_parallel_size=4,
        distributed_executor_backend="mp",
    )

    with VllmRunner(model_path, **runner_options) as vllm_model:
        vllm_model.generate_greedy(prompts, token_budget)
|
||||
|
||||
@@ -99,3 +99,63 @@ def test_e2e_deepseekv3_with_torchair_ms_mla():
|
||||
},
|
||||
}
|
||||
_deepseek_torchair_test_fixture(additional_config)
|
||||
|
||||
|
||||
def _pangu_torchair_test_fixture(
    additional_config: Dict,
    *,
    tensor_parallel_size=4,
):
    """Run the pruned PanguProMoE checkpoint in graph mode and check its output.

    Greedy generation is run on a fixed set of prompts and every generated
    text is compared with a recorded golden string.

    Args:
        additional_config: Extra vllm-ascend options forwarded to the runner.
            The caller's dict is left untouched; a merged copy is passed on.
        tensor_parallel_size: Tensor-parallel degree forwarded to the runner.

    Raises:
        AssertionError: If the number of outputs or any generated text
            differs from the golden results.
    """
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # torchair currently only works without chunked-prefill
    overrides = {
        "ascend_scheduler_config": {
            "enabled": True,
        },
        "refresh": True,
    }
    # Merge into a new dict instead of calling `.update()` on the argument so
    # the caller's config object is not mutated as a side effect. The
    # overrides still take precedence over caller-supplied keys.
    merged_config = {**additional_config, **overrides}

    with VllmRunner(
            "vllm-ascend/pangu-pro-moe-pruing",
            dtype="half",
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend="mp",
            enforce_eager=False,
            additional_config=merged_config,
    ) as vllm_model:
        # use the greedy sampler to make sure the generated results are fixed
        vllm_output = vllm_model.generate_greedy(example_prompts, 5)

    # NOTE: vllm-ascend/pangu-pro-moe-pruing is only part of PanguProMoE
    # with 2 hidden layers, thus the golden results seem inaccurate.
    # This will only change if accuracy changes with the official weights
    # of PanguProMoE.
    golden_results = [
        'Hello, my name is Remempondeprecatedmiot忱',
        'The president of the United States is Remem下的一个 rever ceremoni Segnali',
        'The capital of France is Rememvoud administrativ Remem投',
        'The future of AI isotope Segnali Zoeken精细化 supus',
    ]

    assert len(golden_results) == len(vllm_output)
    for expected_text, output in zip(golden_results, vllm_output):
        # Each output is indexed at position 1 for its generated text.
        generated_text = output[1]
        assert expected_text == generated_text
        print(f"Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    os.getenv("VLLM_USE_V1") == "0",
    reason="torchair graph is not supported on v0",
)
def test_e2e_pangu_with_torchair():
    """End-to-end check of Pangu with the torchair graph config enabled.

    Skipped when the VLLM_USE_V1 environment variable is set to "0".
    """
    _pangu_torchair_test_fixture({
        "torchair_graph_config": {
            "enabled": True,
        },
    })
|
||||
|
||||
Reference in New Issue
Block a user