Fix potential flakiness in test_lora_qwen3 (#10250)

2025-09-10 01:04:39 -07:00
parent 27760fc1b6
commit e903f695c8
3 changed files with 21 additions and 14 deletions
--- a/test/srt/lora/test_lora_qwen3.py
+++ b/test/srt/lora/test_lora_qwen3.py
@@ -18,7 +18,7 @@ import random
 import unittest
 from typing import List

-from utils import TORCH_DTYPES, LoRAAdaptor, LoRAModelCase
+from utils import TORCH_DTYPES, LoRAAdaptor, LoRAModelCase, ensure_reproducibility

 from sglang.test.runners import HFRunner, SRTRunner
 from sglang.test.test_utils import CustomTestCase, calculate_rouge_l, is_in_ci
@@ -59,19 +59,18 @@ TEST_MULTIPLE_BATCH_PROMPTS = [
    The Transformers are large language models,
    They're used to make predictions on text.
    """,
-    # "AI is a field of computer science focused on", TODO: Add it back after fixing its bug
+    "AI is a field of computer science focused on",
    "Computer science is the study of",
    "Write a short story.",
    "What are the main components of a computer?",
 ]


-class TestLoRA(CustomTestCase):
-
+class TestLoRAQwen3(CustomTestCase):
    def _run_lora_multiple_batch_on_model_cases(self, model_cases: List[LoRAModelCase]):
        for model_case in model_cases:
            for torch_dtype in TORCH_DTYPES:
-                max_new_tokens = 10
+                max_new_tokens = 32
                backend = "triton"
                base_path = model_case.base
                lora_adapter_paths = [a.name for a in model_case.adaptors]
@@ -133,6 +132,7 @@ class TestLoRA(CustomTestCase):
                )

                # Initialize runners
+                ensure_reproducibility()
                srt_runner = SRTRunner(
                    base_path,
                    torch_dtype=torch_dtype,
@@ -140,7 +140,11 @@ class TestLoRA(CustomTestCase):
                    lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
                    max_loras_per_batch=len(lora_adapter_paths) + 1,
                    lora_backend=backend,
+                    sleep_on_idle=True,  # Eliminate non-determinism by forcing all requests to be processed in one batch.
+                    attention_backend="torch_native",
                )
+
+                ensure_reproducibility()
                hf_runner = HFRunner(
                    base_path,
                    torch_dtype=torch_dtype,