[CI] Add Triton Ascend in CI (#4921)

Add triton-ascend to the unit tests (UT) and e2e tests

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
This commit is contained in:
meihanc
2025-12-23 12:47:35 +08:00
committed by GitHub
parent 2e010e12dd
commit 592cfb6a6f
8 changed files with 85 additions and 36 deletions

View File

@@ -1,3 +1,4 @@
import sys
from unittest.mock import MagicMock
import torch
@@ -5,6 +6,10 @@ from vllm.v1.attention.backends.utils import AttentionCGSupport
from tests.ut.base import TestBase
from vllm_ascend.attention.attention_v1 import AscendAttentionState
if 'torch_npu._inductor' not in sys.modules:
sys.modules['torch_npu._inductor'] = MagicMock()
from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl,
AscendSFAMetadata,
AscendSFAMetadataBuilder)

View File

@@ -15,10 +15,23 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import sys
from unittest.mock import MagicMock
from vllm_ascend.utils import adapt_patch  # noqa: E402
from vllm_ascend.utils import register_ascend_customop
# triton and torch_npu are not available in the environment, so we need to mock them
sys.modules['torch_npu'].npu.current_device = MagicMock(return_value=0)
sys.modules['torch_npu._inductor'] = MagicMock()
triton_runtime = MagicMock()
triton_runtime.driver.active.utils.get_device_properties.return_value = {
'num_aic': 8,
'num_vectorcore': 8,
}
sys.modules['triton.runtime'] = triton_runtime
adapt_patch()
adapt_patch(True)

View File

@@ -422,6 +422,7 @@ class TestUnifiedApplyMLP(TestBase):
self.assertEqual(result.shape, hidden_states.shape)
self.assertEqual(result.dtype, torch.float16)
@patch('vllm_ascend.ops.fused_moe.moe_mlp.HAS_TRITON', False)
@patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context')
@patch('torch_npu.npu_grouped_matmul')
@patch('torch_npu.npu_swiglu')

View File

@@ -127,19 +127,32 @@ class TestAscendRejectionSampler(TestBase):
x = torch.tensor([10, 20, 30])
cu_num_tokens = torch.tensor([2, 5, 7])
num_tokens = 7
# Test PyTorch path
with patch("vllm_ascend.sample.rejection_sampler.HAS_TRITON", False):
with patch("vllm_ascend.sample.rejection_sampler.expand_pytorch"
) as mock_pytorch:
expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
mock_pytorch.assert_called_once()
args = mock_pytorch.call_args[0]
assert (args[1] == x).all()
assert (args[2] == cu_num_tokens).all()
with patch("vllm_ascend.sample.rejection_sampler.expand_pytorch"
) as mock_kernel:
expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
mock_kernel.assert_called_once()
args = mock_kernel.call_args[0]
assert (args[1] == x).all()
assert (args[2] == cu_num_tokens).all()
# Test Triton kernel path
with patch("vllm_ascend.sample.rejection_sampler.HAS_TRITON", True):
with patch("vllm_ascend.sample.rejection_sampler.expand_kernel"
) as mock_triton:
expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
# grid = triton.cdiv(n, BLOCK_SIZE) = triton.cdiv(3, 2) = 2
mock_triton.__getitem__.assert_called_once_with((2, ))
call_args = mock_triton.__getitem__.return_value.call_args[0]
assert (call_args[1] == x).all()
assert (call_args[2] == cu_num_tokens).all()
# Run actual function
result = expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
expected = torch.tensor([10, 10, 20, 20, 20, 30, 30])
assert torch.equal(result, expected)
with patch("vllm_ascend.sample.rejection_sampler.HAS_TRITON", False):
result = expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
expected = torch.tensor([10, 10, 20, 20, 20, 30, 30])
assert torch.equal(result, expected)
def test_sample_recovered_tokens_pytorch_ngram(self):
"""Test recovered token sampling under n-gram mode"""

View File

@@ -239,7 +239,9 @@ class TestNPUWorker(TestBase):
"vllm_ascend.worker.worker.NPUWorker._init_worker_distributed_environment"
)
@patch("vllm_ascend.worker.worker.NPUPlatform")
def test_init_device(self, mock_platform, mock_init_dist_env):
@patch("vllm_ascend.worker.worker.init_device_properties_triton")
def test_init_device(self, mock_init_triton, mock_platform,
mock_init_dist_env):
"""Test _init_device method"""
from vllm_ascend.worker.worker import NPUWorker