[CI] Repair CI for custom ops (#7461)
### What this PR does / why we need it?
NPU resources are not released immediately after the custom-operator test
cases run, which causes errors when subsequent operator test cases are
executed.
- vLLM version: v0.17.0
- vLLM main:
8a680463fa
Signed-off-by: ZT-AIA <1028681969@qq.com>
Signed-off-by: ZT-AIA <63220130+ZT-AIA@users.noreply.github.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -147,3 +148,6 @@ def test_quant_fpx_linear(row: int, col: int, dtype, atol, rtol, kernelType):
|
|||||||
torch.testing.assert_close(rstd * b1, rstd1 * b1, rtol=rtol, atol=100)
|
torch.testing.assert_close(rstd * b1, rstd1 * b1, rtol=rtol, atol=100)
|
||||||
torch.testing.assert_close(x * c, x1 * c, atol=atol, rtol=100)
|
torch.testing.assert_close(x * c, x1 * c, atol=atol, rtol=100)
|
||||||
torch.testing.assert_close(x * c1, x1 * c1, rtol=rtol, atol=100)
|
torch.testing.assert_close(x * c1, x1 * c1, rtol=rtol, atol=100)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@@ -103,6 +104,9 @@ def test_npu_apply_top_k_top_p(vocab_size, batch_size, p_val, k_val):
|
|||||||
out_npu = ascendc_op_exec(logits, p, k)
|
out_npu = ascendc_op_exec(logits, p, k)
|
||||||
|
|
||||||
assert_output_close(out_cpu, out_npu)
|
assert_output_close(out_cpu, out_npu)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('vocab_size', [15206, 152064])
|
@pytest.mark.parametrize('vocab_size', [15206, 152064])
|
||||||
@@ -120,6 +124,9 @@ def test_npu_apply_top_k(vocab_size, batch_size, k_val):
|
|||||||
out_npu = ascendc_op_exec(logits, p, k)
|
out_npu = ascendc_op_exec(logits, p, k)
|
||||||
|
|
||||||
assert_output_close(out_cpu, out_npu)
|
assert_output_close(out_cpu, out_npu)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('vocab_size', [15206, 152064])
|
@pytest.mark.parametrize('vocab_size', [15206, 152064])
|
||||||
@@ -137,3 +144,6 @@ def test_npu_apply_top_p(vocab_size, batch_size, p_val):
|
|||||||
out_npu = ascendc_op_exec(logits, p, k)
|
out_npu = ascendc_op_exec(logits, p, k)
|
||||||
|
|
||||||
assert_output_close(out_cpu, out_npu)
|
assert_output_close(out_cpu, out_npu)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import random
|
import random
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
@@ -56,7 +57,9 @@ class TestMatrixMultiplication(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertLessEqual(max_diff, atol,
|
self.assertLessEqual(max_diff, atol,
|
||||||
f"Absolute error too large: {max_diff} > {atol}")
|
f"Absolute error too large: {max_diff} > {atol}")
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
def test_boundary_conditions(self):
|
def test_boundary_conditions(self):
|
||||||
"""Test boundary conditions"""
|
"""Test boundary conditions"""
|
||||||
test_cases = [
|
test_cases = [
|
||||||
@@ -90,6 +93,9 @@ class TestMatrixMultiplication(unittest.TestCase):
|
|||||||
|
|
||||||
self.assert_tensors_almost_equal(res1.view(-1, m, n), res2,
|
self.assert_tensors_almost_equal(res1.view(-1, m, n), res2,
|
||||||
dtype)
|
dtype)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|
||||||
def test_random_shapes(self):
|
def test_random_shapes(self):
|
||||||
"""Test randomly generated shapes"""
|
"""Test randomly generated shapes"""
|
||||||
@@ -116,7 +122,9 @@ class TestMatrixMultiplication(unittest.TestCase):
|
|||||||
a, b_tensor, res2)
|
a, b_tensor, res2)
|
||||||
self.assert_tensors_almost_equal(res1.view(-1, m, n), res2,
|
self.assert_tensors_almost_equal(res1.view(-1, m, n), res2,
|
||||||
dtype)
|
dtype)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
def test_zero_values(self):
|
def test_zero_values(self):
|
||||||
"""Test zero input values"""
|
"""Test zero input values"""
|
||||||
dtypes = [torch.float16, torch.bfloat16]
|
dtypes = [torch.float16, torch.bfloat16]
|
||||||
@@ -135,7 +143,9 @@ class TestMatrixMultiplication(unittest.TestCase):
|
|||||||
self.assert_tensors_almost_equal(res1.view(-1, m, n), res2,
|
self.assert_tensors_almost_equal(res1.view(-1, m, n), res2,
|
||||||
dtype)
|
dtype)
|
||||||
self.assertTrue(torch.all(res2 == 0))
|
self.assertTrue(torch.all(res2 == 0))
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main(verbosity=2)
|
unittest.main(verbosity=2)
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
import torch_npu
|
import torch_npu
|
||||||
@@ -35,3 +36,6 @@ def test_quant_fpx_linear(B: int, D: int, top_k: int, dtype, atol, rtol):
|
|||||||
topk_ids = topk_ids.to(torch.int32)
|
topk_ids = topk_ids.to(torch.int32)
|
||||||
torch.allclose(y, topk_weights, atol=atol, rtol=rtol)
|
torch.allclose(y, topk_weights, atol=atol, rtol=rtol)
|
||||||
torch.allclose(expert_idx, topk_ids, atol=atol, rtol=rtol)
|
torch.allclose(expert_idx, topk_ids, atol=atol, rtol=rtol)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import itertools
|
import itertools
|
||||||
import random
|
import random
|
||||||
|
|
||||||
@@ -347,3 +348,6 @@ def test_moe_init_routing_custom():
|
|||||||
failed_test_cnt += 1
|
failed_test_cnt += 1
|
||||||
|
|
||||||
assert (failed_test_cnt == 0)
|
assert (failed_test_cnt == 0)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
@@ -203,6 +204,9 @@ def test_npu_moe_gating_topk_compare(group_select_mode: int,
|
|||||||
expert_idx_npu.cpu().numpy(),
|
expert_idx_npu.cpu().numpy(),
|
||||||
rtol=RTOL_TOLERANCE,
|
rtol=RTOL_TOLERANCE,
|
||||||
atol=ATOL_TOLERANCE)
|
atol=ATOL_TOLERANCE)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import random
|
import random
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
@@ -96,6 +97,10 @@ class TestTransposeKvCacheByBlock(unittest.TestCase):
|
|||||||
for i in range (layers):
|
for i in range (layers):
|
||||||
self.assert_tensors_almost_equal(k_caches[i], cloned_k_caches[i], dtype)
|
self.assert_tensors_almost_equal(k_caches[i], cloned_k_caches[i], dtype)
|
||||||
self.assert_tensors_almost_equal(v_caches[i], cloned_v_caches[i], dtype)
|
self.assert_tensors_almost_equal(v_caches[i], cloned_v_caches[i], dtype)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|
||||||
|
|
||||||
def assert_tensors_almost_equal(self, actual, expected, dtype):
|
def assert_tensors_almost_equal(self, actual, expected, dtype):
|
||||||
"""Check if two tensors are approximately equal (considering floating point errors)"""
|
"""Check if two tensors are approximately equal (considering floating point errors)"""
|
||||||
@@ -135,3 +140,7 @@ class TestTransposeKvCacheByBlock(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertLessEqual(max_diff, atol,
|
self.assertLessEqual(max_diff, atol,
|
||||||
f"Absolute error too large: {max_diff} > {atol}")
|
f"Absolute error too large: {max_diff} > {atol}")
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
import gc
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
@@ -310,6 +311,9 @@ def test_causal_conv1d(dim, width, extra_state_len, seq_len, has_bias,
|
|||||||
|
|
||||||
validate_cmp(out, out_ref, itype)
|
validate_cmp(out, out_ref, itype)
|
||||||
validate_cmp(conv_states, conv_states_ref, itype)
|
validate_cmp(conv_states, conv_states_ref, itype)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|
||||||
|
|
||||||
def causal_conv1d_update_ref(x,
|
def causal_conv1d_update_ref(x,
|
||||||
@@ -443,3 +447,6 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
|
|||||||
assert torch.equal(conv_state[unused_states_bool],
|
assert torch.equal(conv_state[unused_states_bool],
|
||||||
conv_state_for_padding_test[unused_states_bool])
|
conv_state_for_padding_test[unused_states_bool])
|
||||||
assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)
|
assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.ut.base import PytestBase
|
from tests.ut.base import PytestBase
|
||||||
@@ -31,3 +32,6 @@ class TestChunkGatedDeltaRule(PytestBase):
|
|||||||
|
|
||||||
assert core_attn_out_non_spec.shape == (1, 17, 8, 128)
|
assert core_attn_out_non_spec.shape == (1, 17, 8, 128)
|
||||||
assert last_recurrent_state.shape == (3, 8, 128, 128)
|
assert last_recurrent_state.shape == (3, 8, 128, 128)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
@@ -98,3 +99,6 @@ def test_fused_qkvzba_split_reshape_cat(
|
|||||||
validate_cmp(z, z_ref, dtype)
|
validate_cmp(z, z_ref, dtype)
|
||||||
validate_cmp(b, b_ref, dtype)
|
validate_cmp(b, b_ref, dtype)
|
||||||
validate_cmp(a, a_ref, dtype)
|
validate_cmp(a, a_ref, dtype)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import torch
|
import torch
|
||||||
from vllm.model_executor.layers.fla.ops import fused_recurrent_gated_delta_rule
|
from vllm.model_executor.layers.fla.ops import fused_recurrent_gated_delta_rule
|
||||||
from vllm.model_executor.models.qwen3_next import fused_gdn_gating
|
from vllm.model_executor.models.qwen3_next import fused_gdn_gating
|
||||||
@@ -64,3 +65,6 @@ def test_triton_fusion_ops():
|
|||||||
rtol=1e-02,
|
rtol=1e-02,
|
||||||
atol=1e-02,
|
atol=1e-02,
|
||||||
equal_nan=True)
|
equal_nan=True)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
@@ -32,3 +33,6 @@ def test_l2norm(B: int, T: int, H: int, D: int, dtype: torch.dtype):
|
|||||||
tri = l2norm_fwd(x)
|
tri = l2norm_fwd(x)
|
||||||
|
|
||||||
assert torch.allclose(tri, ref, rtol=rtol, atol=atol)
|
assert torch.allclose(tri, ref, rtol=rtol, atol=atol)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -254,4 +255,7 @@ def test_apply_penalties(
|
|||||||
if dtype == torch.bfloat16:
|
if dtype == torch.bfloat16:
|
||||||
atol = 1e-02
|
atol = 1e-02
|
||||||
rtol = 1e-02
|
rtol = 1e-02
|
||||||
assert torch.allclose(logits_triton, logits_pytorch_result, atol=atol, rtol=rtol)
|
assert torch.allclose(logits_triton, logits_pytorch_result, atol=atol, rtol=rtol)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from vllm.triton_utils import triton
|
from vllm.triton_utils import triton
|
||||||
@@ -78,3 +79,6 @@ def test_prepare_inputs_padded(num_reqs):
|
|||||||
)
|
)
|
||||||
|
|
||||||
torch.testing.assert_close(out_tri, out_ref)
|
torch.testing.assert_close(out_tri, out_ref)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import gc
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from vllm.v1.sample.rejection_sampler import \
|
from vllm.v1.sample.rejection_sampler import \
|
||||||
@@ -96,7 +97,9 @@ def test_rejection_random_sample(max_spec_len, vocab_size, batch_size):
|
|||||||
BLOCK_SIZE=block_size)
|
BLOCK_SIZE=block_size)
|
||||||
torch.npu.synchronize()
|
torch.npu.synchronize()
|
||||||
assert torch.equal(original_output_token_ids, output_token_ids)
|
assert torch.equal(original_output_token_ids, output_token_ids)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
|
|
||||||
DEVICE = "npu"
|
DEVICE = "npu"
|
||||||
BATCH_SIZE = 7
|
BATCH_SIZE = 7
|
||||||
@@ -227,3 +230,6 @@ def test_rejection_sampler_block_verify_triton_kernel(
|
|||||||
BLOCK_SIZE=block_size)
|
BLOCK_SIZE=block_size)
|
||||||
torch.npu.synchronize()
|
torch.npu.synchronize()
|
||||||
assert torch.equal(output_token_ids_ref, output_token_ids_triton)
|
assert torch.equal(output_token_ids_ref, output_token_ids_triton)
|
||||||
|
gc.collect()
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
torch.npu.reset_peak_memory_stats()
|
||||||
Reference in New Issue
Block a user