[CI] Repair CI for custom ops (#7461)
### What this PR does / why we need it?
NPU resources are not released immediately after the custom operator test
cases run, which causes errors when subsequent operator test cases
execute.
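
The fix appends explicit cleanup (`gc.collect()`, `torch.npu.empty_cache()`, `torch.npu.reset_peak_memory_stats()`) to the end of each affected test. For illustration only, the same cleanup could instead be centralized in an autouse pytest fixture; the sketch below is hypothetical (it assumes `torch_npu` is installed, and the fixture name is invented), not what this PR does — the PR inlines the calls in each test:

```python
import gc

import pytest
import torch
import torch_npu  # noqa: F401  # registers the torch.npu device backend


@pytest.fixture(autouse=True)
def release_npu_memory():
    """Run the test, then force NPU memory to be reclaimed immediately."""
    yield
    gc.collect()                         # drop lingering Python references to NPU tensors
    torch.npu.empty_cache()              # return cached allocator blocks to the device
    torch.npu.reset_peak_memory_stats()  # reset stats so later tests start from a clean baseline
```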
- vLLM version: v0.17.0
- vLLM main: 8a680463fa
Signed-off-by: ZT-AIA <1028681969@qq.com>
Signed-off-by: ZT-AIA <63220130+ZT-AIA@users.noreply.github.com>
@@ -1,5 +1,6 @@
 from typing import Optional
 
+import gc
 import pytest
 import torch
 import torch.nn.functional as F
@@ -310,6 +311,9 @@ def test_causal_conv1d(dim, width, extra_state_len, seq_len, has_bias,
 
     validate_cmp(out, out_ref, itype)
     validate_cmp(conv_states, conv_states_ref, itype)
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()
 
 
 def causal_conv1d_update_ref(x,
@@ -443,3 +447,6 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
     assert torch.equal(conv_state[unused_states_bool],
                        conv_state_for_padding_test[unused_states_bool])
     assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()

@@ -1,3 +1,4 @@
+import gc
 import torch
 
 from tests.ut.base import PytestBase
@@ -31,3 +32,6 @@ class TestChunkGatedDeltaRule(PytestBase):
 
         assert core_attn_out_non_spec.shape == (1, 17, 8, 128)
         assert last_recurrent_state.shape == (3, 8, 128, 128)
+        gc.collect()
+        torch.npu.empty_cache()
+        torch.npu.reset_peak_memory_stats()

@@ -1,3 +1,4 @@
+import gc
 import pytest
 import torch
 from einops import rearrange
@@ -98,3 +99,6 @@ def test_fused_qkvzba_split_reshape_cat(
     validate_cmp(z, z_ref, dtype)
     validate_cmp(b, b_ref, dtype)
     validate_cmp(a, a_ref, dtype)
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()

@@ -1,3 +1,4 @@
+import gc
 import torch
 from vllm.model_executor.layers.fla.ops import fused_recurrent_gated_delta_rule
 from vllm.model_executor.models.qwen3_next import fused_gdn_gating
@@ -64,3 +65,6 @@ def test_triton_fusion_ops():
                           rtol=1e-02,
                           atol=1e-02,
                           equal_nan=True)
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()

@@ -1,3 +1,4 @@
+import gc
 import pytest
 import torch
 import torch.nn.functional as F
@@ -32,3 +33,6 @@ def test_l2norm(B: int, T: int, H: int, D: int, dtype: torch.dtype):
     tri = l2norm_fwd(x)
 
     assert torch.allclose(tri, ref, rtol=rtol, atol=atol)
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()

@@ -1,3 +1,4 @@
+import gc
 import pytest
 import torch
 
@@ -254,4 +255,7 @@ def test_apply_penalties(
     if dtype == torch.bfloat16:
         atol = 1e-02
         rtol = 1e-02
     assert torch.allclose(logits_triton, logits_pytorch_result, atol=atol, rtol=rtol)
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()

@@ -1,3 +1,4 @@
+import gc
 import pytest
 import torch
 from vllm.triton_utils import triton
@@ -78,3 +79,6 @@ def test_prepare_inputs_padded(num_reqs):
     )
 
     torch.testing.assert_close(out_tri, out_ref)
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()

@@ -1,3 +1,4 @@
+import gc
 import pytest
 import torch
 from vllm.v1.sample.rejection_sampler import \
@@ -96,7 +97,9 @@ def test_rejection_random_sample(max_spec_len, vocab_size, batch_size):
         BLOCK_SIZE=block_size)
     torch.npu.synchronize()
     assert torch.equal(original_output_token_ids, output_token_ids)
-
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()
 
 DEVICE = "npu"
 BATCH_SIZE = 7
@@ -227,3 +230,6 @@ def test_rejection_sampler_block_verify_triton_kernel(
         BLOCK_SIZE=block_size)
     torch.npu.synchronize()
     assert torch.equal(output_token_ids_ref, output_token_ids_triton)
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()