From 05afc7f8c3822997f643535ff007096f122ac0cb Mon Sep 17 00:00:00 2001 From: ZT-AIA <63220130+ZT-AIA@users.noreply.github.com> Date: Thu, 19 Mar 2026 17:13:12 +0800 Subject: [PATCH] [CI]repair for ci custom ops (#7461) ### What this PR does / why we need it? NPU resources are not released immediately when custom operator test cases are executed, causing an error when other operator test cases are executed. - vLLM version: v0.17.0 - vLLM main: https://github.com/vllm-project/vllm/commit/8a680463fab3bc9e6760417cd5c0a6aa58283065 Signed-off-by: ZT-AIA <1028681969@qq.com> Signed-off-by: ZT-AIA <63220130+ZT-AIA@users.noreply.github.com> --- .../ops/singlecard_ops/test_add_rms_norm_bias.py | 4 ++++ .../test_apply_top_k_top_p_custom.py | 10 ++++++++++ .../test_batch_matmul_transpose.py | 16 +++++++++++++--- .../singlecard_ops/test_gating_top_k_softmax.py | 4 ++++ .../test_moe_init_routing_custom.py | 4 ++++ .../singlecard_ops/test_npu_moe_gating_top_k.py | 4 ++++ .../test_transpose_kv_cache_by_block.py | 9 +++++++++ .../singlecard_ops/triton/test_causal_conv1d.py | 7 +++++++ .../triton/test_chunk_gated_delta_rule.py | 4 ++++ .../test_fused_qkvzba_split_reshape_cat.py | 4 ++++ .../test_fused_sigmoid_gating_delta_rule.py | 4 ++++ .../ops/singlecard_ops/triton/test_l2norm.py | 4 ++++ .../ops/singlecard_ops/triton/test_penality.py | 6 +++++- .../triton/test_prepare_inputs_padded.py | 4 ++++ .../triton/test_rejection_sample.py | 8 +++++++- 15 files changed, 87 insertions(+), 5 deletions(-) diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_add_rms_norm_bias.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_add_rms_norm_bias.py index e106b5e9..ec769412 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_add_rms_norm_bias.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_add_rms_norm_bias.py @@ -1,3 +1,4 @@ +import gc import random import numpy as np @@ -147,3 +148,6 @@ def test_quant_fpx_linear(row: int, col: int, dtype, atol, rtol, kernelType): torch.testing.assert_close(rstd * b1, rstd1 * b1, rtol=rtol, atol=100) torch.testing.assert_close(x * c, x1 * c, atol=atol, rtol=100) torch.testing.assert_close(x * c1, x1 * c1, rtol=rtol, atol=100) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_apply_top_k_top_p_custom.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_apply_top_k_top_p_custom.py index 3a153618..437d7900 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_apply_top_k_top_p_custom.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_apply_top_k_top_p_custom.py @@ -1,3 +1,4 @@ +import gc import numpy as np import pytest import torch @@ -103,6 +104,9 @@ def test_npu_apply_top_k_top_p(vocab_size, batch_size, p_val, k_val): out_npu = ascendc_op_exec(logits, p, k) assert_output_close(out_cpu, out_npu) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() @pytest.mark.parametrize('vocab_size', [15206, 152064]) @@ -120,6 +124,9 @@ def test_npu_apply_top_k(vocab_size, batch_size, k_val): out_npu = ascendc_op_exec(logits, p, k) assert_output_close(out_cpu, out_npu) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() @pytest.mark.parametrize('vocab_size', [15206, 152064]) @@ -137,3 +144,6 @@ def test_npu_apply_top_p(vocab_size, batch_size, p_val): out_npu = ascendc_op_exec(logits, p, k) assert_output_close(out_cpu, out_npu) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_batch_matmul_transpose.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_batch_matmul_transpose.py index 6c81b9eb..6ab40daf 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_batch_matmul_transpose.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_batch_matmul_transpose.py @@ -1,3 +1,4 @@ +import gc import random import unittest @@ -56,7 +57,9 @@ class TestMatrixMultiplication(unittest.TestCase): self.assertLessEqual(max_diff, atol, f"Absolute error too large: {max_diff} > {atol}") - + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() def test_boundary_conditions(self): """Test boundary conditions""" test_cases = [ @@ -90,6 +93,9 @@ class TestMatrixMultiplication(unittest.TestCase): self.assert_tensors_almost_equal(res1.view(-1, m, n), res2, dtype) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() def test_random_shapes(self): """Test randomly generated shapes""" @@ -116,7 +122,9 @@ class TestMatrixMultiplication(unittest.TestCase): a, b_tensor, res2) self.assert_tensors_almost_equal(res1.view(-1, m, n), res2, dtype) - + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() def test_zero_values(self): """Test zero input values""" dtypes = [torch.float16, torch.bfloat16] @@ -135,7 +143,9 @@ class TestMatrixMultiplication(unittest.TestCase): self.assert_tensors_almost_equal(res1.view(-1, m, n), res2, dtype) self.assertTrue(torch.all(res2 == 0)) - + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_gating_top_k_softmax.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_gating_top_k_softmax.py index 4edcdfde..b78e1821 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_gating_top_k_softmax.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_gating_top_k_softmax.py @@ -1,3 +1,4 @@ +import gc import pytest import torch import torch_npu @@ -35,3 +36,6 @@ def test_quant_fpx_linear(B: int, D: int, top_k: int, dtype, atol, rtol): topk_ids = topk_ids.to(torch.int32) torch.allclose(y, topk_weights, atol=atol, rtol=rtol) torch.allclose(expert_idx, topk_ids, atol=atol, rtol=rtol) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_moe_init_routing_custom.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_moe_init_routing_custom.py index 05fd5112..d01596ec 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_moe_init_routing_custom.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_moe_init_routing_custom.py @@ -1,3 +1,4 @@ +import gc import itertools import random @@ -347,3 +348,6 @@ def test_moe_init_routing_custom(): failed_test_cnt += 1 assert (failed_test_cnt == 0) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_npu_moe_gating_top_k.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_npu_moe_gating_top_k.py index 7928d539..4e25c36a 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_npu_moe_gating_top_k.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_npu_moe_gating_top_k.py @@ -1,3 +1,4 @@ +import gc import random import numpy @@ -203,6 +204,9 @@ def test_npu_moe_gating_topk_compare(group_select_mode: int, expert_idx_npu.cpu().numpy(), rtol=RTOL_TOLERANCE, atol=ATOL_TOLERANCE) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() if __name__ == "__main__": diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_transpose_kv_cache_by_block.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_transpose_kv_cache_by_block.py index 7527f1fb..6878f6cf 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/test_transpose_kv_cache_by_block.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/test_transpose_kv_cache_by_block.py @@ -1,3 +1,4 @@ +import gc import random import unittest @@ -96,6 +97,10 @@ class TestTransposeKvCacheByBlock(unittest.TestCase): for i in range (layers): self.assert_tensors_almost_equal(k_caches[i], cloned_k_caches[i], dtype) self.assert_tensors_almost_equal(v_caches[i], cloned_v_caches[i], dtype) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() + def assert_tensors_almost_equal(self, actual, expected, dtype): """Check if two tensors are approximately equal (considering floating point errors)""" @@ -135,3 +140,7 @@ class TestTransposeKvCacheByBlock(unittest.TestCase): self.assertLessEqual(max_diff, atol, f"Absolute error too large: {max_diff} > {atol}") + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() + \ No newline at end of file diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_causal_conv1d.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_causal_conv1d.py index 34db8f00..89b437d8 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_causal_conv1d.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_causal_conv1d.py @@ -1,5 +1,6 @@ from typing import Optional +import gc import pytest import torch import torch.nn.functional as F @@ -310,6 +311,9 @@ def test_causal_conv1d(dim, width, extra_state_len, seq_len, has_bias, validate_cmp(out, out_ref, itype) validate_cmp(conv_states, conv_states_ref, itype) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() def causal_conv1d_update_ref(x, @@ -443,3 +447,6 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim, assert torch.equal(conv_state[unused_states_bool], conv_state_for_padding_test[unused_states_bool]) assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_chunk_gated_delta_rule.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_chunk_gated_delta_rule.py index a0e4b6ef..4659a461 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_chunk_gated_delta_rule.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_chunk_gated_delta_rule.py @@ -1,3 +1,4 @@ +import gc import torch from tests.ut.base import PytestBase @@ -31,3 +32,6 @@ class TestChunkGatedDeltaRule(PytestBase): assert core_attn_out_non_spec.shape == (1, 17, 8, 128) assert last_recurrent_state.shape == (3, 8, 128, 128) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_fused_qkvzba_split_reshape_cat.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_fused_qkvzba_split_reshape_cat.py index 2099645a..dc142427 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_fused_qkvzba_split_reshape_cat.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_fused_qkvzba_split_reshape_cat.py @@ -1,3 +1,4 @@ +import gc import pytest import torch from einops import rearrange @@ -98,3 +99,6 @@ def test_fused_qkvzba_split_reshape_cat( validate_cmp(z, z_ref, dtype) validate_cmp(b, b_ref, dtype) validate_cmp(a, a_ref, dtype) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_fused_sigmoid_gating_delta_rule.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_fused_sigmoid_gating_delta_rule.py index abfbcc20..92ee62ab 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_fused_sigmoid_gating_delta_rule.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_fused_sigmoid_gating_delta_rule.py @@ -1,3 +1,4 @@ +import gc import torch from vllm.model_executor.layers.fla.ops import fused_recurrent_gated_delta_rule from vllm.model_executor.models.qwen3_next import fused_gdn_gating @@ -64,3 +65,6 @@ def test_triton_fusion_ops(): rtol=1e-02, atol=1e-02, equal_nan=True) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_l2norm.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_l2norm.py index 0b891468..f94e2a37 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_l2norm.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_l2norm.py @@ -1,3 +1,4 @@ +import gc import pytest import torch import torch.nn.functional as F @@ -32,3 +33,6 @@ def test_l2norm(B: int, T: int, H: int, D: int, dtype: torch.dtype): tri = l2norm_fwd(x) assert torch.allclose(tri, ref, rtol=rtol, atol=atol) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_penality.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_penality.py index 1d108bf4..fbad25f6 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_penality.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_penality.py @@ -1,3 +1,4 @@ +import gc import pytest import torch @@ -254,4 +255,7 @@ def test_apply_penalties( if dtype == torch.bfloat16: atol = 1e-02 rtol = 1e-02 - assert torch.allclose(logits_triton, logits_pytorch_result, atol=atol, rtol=rtol) \ No newline at end of file + assert torch.allclose(logits_triton, logits_pytorch_result, atol=atol, rtol=rtol) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_prepare_inputs_padded.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_prepare_inputs_padded.py index 2a84efbd..2425d476 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_prepare_inputs_padded.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_prepare_inputs_padded.py @@ -1,3 +1,4 @@ +import gc import pytest import torch from vllm.triton_utils import triton @@ -78,3 +79,6 @@ def test_prepare_inputs_padded(num_reqs): ) torch.testing.assert_close(out_tri, out_ref) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_rejection_sample.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_rejection_sample.py index 95c1157a..353dc479 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_rejection_sample.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_rejection_sample.py @@ -1,3 +1,4 @@ +import gc import pytest import torch from vllm.v1.sample.rejection_sampler import \ @@ -96,7 +97,9 @@ def test_rejection_random_sample(max_spec_len, vocab_size, batch_size): BLOCK_SIZE=block_size) torch.npu.synchronize() assert torch.equal(original_output_token_ids, output_token_ids) - + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() DEVICE = "npu" BATCH_SIZE = 7 @@ -227,3 +230,6 @@ def test_rejection_sampler_block_verify_triton_kernel( BLOCK_SIZE=block_size) torch.npu.synchronize() assert torch.equal(output_token_ids_ref, output_token_ids_triton) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() \ No newline at end of file