[bugfix] fix test_camem failure with triton-ascend (#5492)

### What this PR does / why we need it?
This fixes a bug that occurred when running `test_camem.py` in the
triton-ascend environment, where the test failed with `NPU function error:
aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)`.

- vLLM version: v0.13.0
- vLLM main: 5326c89803

---------

Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
This commit is contained in:
meihanc
2026-01-05 20:10:54 +08:00
committed by GitHub
parent 58e8d19c35
commit 16b1bee804
5 changed files with 10 additions and 21 deletions

View File

@@ -68,15 +68,6 @@ jobs:
pip install -r requirements-dev.txt pip install -r requirements-dev.txt
pip install -v -e . pip install -v -e .
- name: Run vllm-project/vllm-ascend test (non triton)
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
if: ${{ inputs.type == 'full' }}
run: |
pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_mem.py
pytest -sv --durations=0 tests/e2e/singlecard/test_camem.py
- name: Install Ascend toolkit & triton_ascend - name: Install Ascend toolkit & triton_ascend
shell: bash -l {0} shell: bash -l {0}
run: | run: |
@@ -94,6 +85,8 @@ jobs:
run: | run: |
# pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py # pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py
# pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py # pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py
pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_mem.py
pytest -sv --durations=0 tests/e2e/singlecard/test_camem.py
pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py::test_multimodal_vl pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_qwen_pooling_classify_correctness pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_qwen_pooling_classify_correctness

View File

@@ -1,8 +1,5 @@
import torch import torch
from vllm.triton_utils import HAS_TRITON, tl, triton from vllm.triton_utils import tl, triton
if HAS_TRITON:
import torch_npu._inductor # noqa: F401
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num

View File

@@ -10,10 +10,7 @@
# ruff: noqa: E501 # ruff: noqa: E501
# mypy: ignore-errors # mypy: ignore-errors
import torch import torch
from vllm.triton_utils import HAS_TRITON, tl, triton from vllm.triton_utils import tl, triton
if HAS_TRITON:
import torch_npu._inductor # noqa: F401
@triton.jit @triton.jit

View File

@@ -14,10 +14,7 @@
# limitations under the License. # limitations under the License.
# This file is a part of the vllm-ascend project. # This file is a part of the vllm-ascend project.
# #
from vllm.triton_utils import HAS_TRITON, tl, triton from vllm.triton_utils import tl, triton
if HAS_TRITON:
import torch_npu._inductor # noqa: F401
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num

View File

@@ -88,6 +88,11 @@ class NPUWorker(WorkerBase):
# register patch for vllm # register patch for vllm
from vllm_ascend.utils import adapt_patch from vllm_ascend.utils import adapt_patch
adapt_patch() adapt_patch()
# Import _inductor for graph mode execution with triton
# This lazy import avoids torch_npu re-initialization in patch
from vllm.triton_utils import HAS_TRITON
if HAS_TRITON:
import torch_npu._inductor # noqa: F401
# Register ops when worker init. # Register ops when worker init.
from vllm_ascend import ops from vllm_ascend import ops
ops.register_dummy_fusion_op() ops.register_dummy_fusion_op()