[CI] Add Triton Ascend in CI (#4921)
Add triton-ascend to the unit tests (UT) and the e2e tests.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import sys
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import torch
|
||||
@@ -5,6 +6,10 @@ from vllm.v1.attention.backends.utils import AttentionCGSupport
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
|
||||
if 'torch_npu._inductor' not in sys.modules:
|
||||
sys.modules['torch_npu._inductor'] = MagicMock()
|
||||
|
||||
from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl,
|
||||
AscendSFAMetadata,
|
||||
AscendSFAMetadataBuilder)
|
||||
|
||||
@@ -15,10 +15,23 @@
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
import sys
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from vllm_ascend.utils import adapt_patch # noqa E402
|
||||
from vllm_ascend.utils import register_ascend_customop
|
||||
|
||||
# triton and torch_npu are not available in the environment, so we need to mock them
|
||||
sys.modules['torch_npu'].npu.current_device = MagicMock(return_value=0)
|
||||
sys.modules['torch_npu._inductor'] = MagicMock()
|
||||
|
||||
triton_runtime = MagicMock()
|
||||
triton_runtime.driver.active.utils.get_device_properties.return_value = {
|
||||
'num_aic': 8,
|
||||
'num_vectorcore': 8,
|
||||
}
|
||||
sys.modules['triton.runtime'] = triton_runtime
|
||||
|
||||
adapt_patch()
|
||||
adapt_patch(True)
|
||||
|
||||
|
||||
@@ -422,6 +422,7 @@ class TestUnifiedApplyMLP(TestBase):
|
||||
self.assertEqual(result.shape, hidden_states.shape)
|
||||
self.assertEqual(result.dtype, torch.float16)
|
||||
|
||||
@patch('vllm_ascend.ops.fused_moe.moe_mlp.HAS_TRITON', False)
|
||||
@patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context')
|
||||
@patch('torch_npu.npu_grouped_matmul')
|
||||
@patch('torch_npu.npu_swiglu')
|
||||
|
||||
@@ -127,19 +127,32 @@ class TestAscendRejectionSampler(TestBase):
|
||||
x = torch.tensor([10, 20, 30])
|
||||
cu_num_tokens = torch.tensor([2, 5, 7])
|
||||
num_tokens = 7
|
||||
# Test PyTorch path
|
||||
with patch("vllm_ascend.sample.rejection_sampler.HAS_TRITON", False):
|
||||
with patch("vllm_ascend.sample.rejection_sampler.expand_pytorch"
|
||||
) as mock_pytorch:
|
||||
expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
|
||||
mock_pytorch.assert_called_once()
|
||||
args = mock_pytorch.call_args[0]
|
||||
assert (args[1] == x).all()
|
||||
assert (args[2] == cu_num_tokens).all()
|
||||
|
||||
with patch("vllm_ascend.sample.rejection_sampler.expand_pytorch"
|
||||
) as mock_kernel:
|
||||
expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
|
||||
mock_kernel.assert_called_once()
|
||||
args = mock_kernel.call_args[0]
|
||||
assert (args[1] == x).all()
|
||||
assert (args[2] == cu_num_tokens).all()
|
||||
# Test Triton kernel path
|
||||
with patch("vllm_ascend.sample.rejection_sampler.HAS_TRITON", True):
|
||||
with patch("vllm_ascend.sample.rejection_sampler.expand_kernel"
|
||||
) as mock_triton:
|
||||
expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
|
||||
# grid = triton.cdiv(n, BLOCK_SIZE) = triton.cdiv(3, 2) = 2
|
||||
mock_triton.__getitem__.assert_called_once_with((2, ))
|
||||
call_args = mock_triton.__getitem__.return_value.call_args[0]
|
||||
assert (call_args[1] == x).all()
|
||||
assert (call_args[2] == cu_num_tokens).all()
|
||||
|
||||
# Run actual function
|
||||
result = expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
|
||||
expected = torch.tensor([10, 10, 20, 20, 20, 30, 30])
|
||||
assert torch.equal(result, expected)
|
||||
with patch("vllm_ascend.sample.rejection_sampler.HAS_TRITON", False):
|
||||
result = expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
|
||||
expected = torch.tensor([10, 10, 20, 20, 20, 30, 30])
|
||||
assert torch.equal(result, expected)
|
||||
|
||||
def test_sample_recovered_tokens_pytorch_ngram(self):
|
||||
"""Test recovered token sampling under n-gram mode"""
|
||||
|
||||
@@ -239,7 +239,9 @@ class TestNPUWorker(TestBase):
|
||||
"vllm_ascend.worker.worker.NPUWorker._init_worker_distributed_environment"
|
||||
)
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform")
|
||||
def test_init_device(self, mock_platform, mock_init_dist_env):
|
||||
@patch("vllm_ascend.worker.worker.init_device_properties_triton")
|
||||
def test_init_device(self, mock_init_triton, mock_platform,
|
||||
mock_init_dist_env):
|
||||
"""Test _init_device method"""
|
||||
from vllm_ascend.worker.worker import NPUWorker
|
||||
|
||||
|
||||
Reference in New Issue
Block a user