From aa957102a9ac1f7c528a46f254f44fb37e872b88 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Mon, 10 Mar 2025 01:24:22 -0700
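Subject: [PATCH] Simplify tests & Fix trtllm custom allreduce registration
 (#4252)

Runtime:
- Read USE_VLLM_CUSTOM_ALLREDUCE through get_bool_env_var. The previous
  os.environ.get(..., default=True) returned the raw string whenever the
  variable was set, so a value such as "0" was still truthy.
- Read SGLANG_USE_MODELSCOPE through get_bool_env_var instead of
  comparing against the literal string "True".

sgl-kernel:
- Register get_graph_buffer_ipc_meta and register_graph_buffers with
  m.def(name, &fn) instead of an explicit schema string plus a
  torch::kCUDA m.impl.
- Move the `handle` and `cuda_data_type` locals of cublas_grouped_gemm
  inside the `CUDA_VERSION >= 12050` branch.
- Drop the `math` and `torch.nn` imports from test_rotary_embedding.py.

Tests / CI:
- Delete test_eval_accuracy_mini.py,
  test_eval_accuracy_large_chunked_prefill.py and
  test_eval_accuracy_large_mixed_chunked_prefill.py.
- In run_suite.py, remove test_custom_allreduce.py,
  test_double_sparsity.py and test_eval_accuracy_mini.py, add
  test_ebnf_constrained.py, and regroup the remaining entries.
- Comment out the USE_VLLM_CUSTOM_ALLREDUCE=0 run of test_moe_tp2_bs1 in
  pr-test.yml.
- Include the use_vllm_custom_allreduce value in the test_moe_tp2_bs1
  step summary.
- Pass temperature=0.001 in the test_gptqmodel_dynamic.py request.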
---
 .github/workflows/pr-test.yml                 |  2 +-
 python/sglang/srt/_custom_ops.py              |  6 +-
 .../layers/attention/flashinfer_backend.py    |  2 +-
 python/sglang/srt/model_loader/loader.py      |  3 +-
 sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu   |  5 +-
 sgl-kernel/csrc/torch_extension.cc            |  7 +-
 sgl-kernel/tests/test_rotary_embedding.py     |  2 -
 test/srt/run_suite.py                         | 22 +++---
 test/srt/test_bench_one_batch.py              |  7 +-
 ...est_eval_accuracy_large_chunked_prefill.py | 68 -----------------
 ...al_accuracy_large_mixed_chunked_prefill.py | 74 -------------------
 test/srt/test_eval_accuracy_mini.py           | 42 -----------
 test/srt/test_gptqmodel_dynamic.py            |  1 +
 13 files changed, 30 insertions(+), 211 deletions(-)
 delete mode 100644 test/srt/test_eval_accuracy_large_chunked_prefill.py
 delete mode 100644 test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py
 delete mode 100644 test/srt/test_eval_accuracy_mini.py

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 3b33b319d..265e1374b 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -266,7 +266,7 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
 
-          USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          # USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
 
       - name: Benchmark single latency + torch.compile (TP=2)
         timeout-minutes: 10
diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py
index d06765c3a..0584dc80f 100644
--- a/python/sglang/srt/_custom_ops.py
+++ b/python/sglang/srt/_custom_ops.py
@@ -6,10 +6,12 @@ from typing import List, Tuple
 import torch
 import torch.library
 
-from sglang.srt.utils import is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 
 logger = logging.getLogger(__name__)
-use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
+use_vllm_custom_allreduce = get_bool_env_var(
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+)
 
 if not is_hpu():
     # ROCm does not use vllm custom allreduce
diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py
index af47f2ce2..791cbeec0 100644
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -22,7 +22,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py
index eff4aa5f3..c241fd9d6 100644
--- a/python/sglang/srt/model_loader/loader.py
+++ b/python/sglang/srt/model_loader/loader.py
@@ -48,6 +48,7 @@ from sglang.srt.model_loader.weight_utils import (
     safetensors_weights_iterator,
 )
 from sglang.srt.utils import (
+    get_bool_env_var,
     get_device_capability,
     is_pin_memory_available,
     set_weight_attrs,
@@ -197,7 +198,7 @@ class DefaultModelLoader(BaseModelLoader):
 
         Returns the path to the downloaded model, or None if the model is not
         downloaded from ModelScope."""
-        if os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True":
+        if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
             # download model from ModelScope hub,
             # lazy import so that modelscope is not required for normal use.
             # pylint: disable=C.
diff --git a/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu b/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu
index d0a80c7bf..2add0826f 100644
--- a/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu
+++ b/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu
@@ -100,7 +100,6 @@ void cublas_grouped_gemm(
   check_device_dtype(out_dtype, inputs);
   check_device_dtype(out_dtype, weights);
   check_device_dtype(out_dtype, outputs);
-  cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF);
 
   // Weights should be transposed to (n, k) of column major
   std::vector<cublasOperation_t> transa_array(group_count, CUBLAS_OP_T);
@@ -132,7 +131,6 @@ void cublas_grouped_gemm(
   std::vector<void*> b_array = get_tensor_ptrs(inputs);
   std::vector<void*> c_array = get_tensor_ptrs(outputs);
 
-  auto handle = reinterpret_cast<cublasHandle_t>(cublas_handle);
   auto stream = reinterpret_cast<cudaStream_t>(cuda_stream);
 
   // Should allocate tensors for storage of pointers
@@ -141,6 +139,9 @@ void cublas_grouped_gemm(
   torch::Tensor d_c = create_ptr_pointer(c_array, stream);
 
 #if defined CUDA_VERSION && CUDA_VERSION >= 12050
+  auto handle = reinterpret_cast<cublasHandle_t>(cublas_handle);
+  cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF);
+
   auto status = cublasGemmGroupedBatchedEx(
       handle,
       transa_array.data(),
diff --git a/sgl-kernel/csrc/torch_extension.cc b/sgl-kernel/csrc/torch_extension.cc
index 1304915bf..d8bd89917 100644
--- a/sgl-kernel/csrc/torch_extension.cc
+++ b/sgl-kernel/csrc/torch_extension.cc
@@ -32,11 +32,8 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
   m.def("all_reduce(int fa, Tensor inp, Tensor! out) -> ()");
   m.impl("all_reduce", torch::kCUDA, &all_reduce);
 
-  m.def("get_graph_buffer_ipc_meta(int fa) -> (int[], int[])");
-  m.impl("get_graph_buffer_ipc_meta", torch::kCUDA, &get_graph_buffer_ipc_meta);
-
-  m.def("register_graph_buffers(int fa, int[][] handles, int[][] offsets) -> ()");
-  m.impl("register_graph_buffers", torch::kCUDA, &register_graph_buffers);
+  m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
+  m.def("register_graph_buffers", &register_graph_buffers);
 
   /*
    * From csrc/attention
diff --git a/sgl-kernel/tests/test_rotary_embedding.py b/sgl-kernel/tests/test_rotary_embedding.py
index b7a141404..fa937a604 100644
--- a/sgl-kernel/tests/test_rotary_embedding.py
+++ b/sgl-kernel/tests/test_rotary_embedding.py
@@ -1,9 +1,7 @@
-import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import pytest
 import torch
-import torch.nn as nn
 from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
 
 
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index b9e36e232..590c46136 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -20,30 +20,33 @@ suites = {
         TestFile("models/test_generation_models.py", 103),
         TestFile("models/test_qwen_models.py", 82),
         TestFile("models/test_reward_models.py", 83),
-        TestFile("test_gptqmodel_dynamic.py", 72),
         TestFile("models/test_gme_qwen_models.py", 45),
         TestFile("test_abort.py", 51),
+        TestFile("test_block_int8.py", 22),
         TestFile("test_chunked_prefill.py", 336),
-        TestFile("test_custom_allreduce.py", 1),
-        TestFile("test_double_sparsity.py", 50),
         TestFile("test_eagle_infer.py", 447),
+        TestFile("test_ebnf_constrained.py"),
+        TestFile("test_fp8_kernel.py", 2),
         TestFile("test_embedding_openai_server.py", 36),
-        TestFile("test_eval_accuracy_mini.py", 63),
         TestFile("test_gguf.py", 78),
+        TestFile("test_gptqmodel_dynamic.py", 72),
+        TestFile("test_hidden_states.py", 55),
+        TestFile("test_int8_kernel.py", 1),
         TestFile("test_input_embeddings.py", 38),
+        TestFile("test_json_constrained.py", 98),
+        TestFile("test_large_max_new_tokens.py", 41),
+        TestFile("test_metrics.py", 32),
         TestFile("test_mla.py", 92),
         TestFile("test_mla_deepseek_v3.py", 221),
         TestFile("test_mla_flashinfer.py", 395),
         TestFile("test_mla_fp8.py", 93),
-        TestFile("test_json_constrained.py", 98),
-        TestFile("test_large_max_new_tokens.py", 41),
-        TestFile("test_metrics.py", 32),
         TestFile("test_no_chunked_prefill.py", 126),
         TestFile("test_no_overlap_scheduler.py", 262),
         TestFile("test_openai_server.py", 124),
         TestFile("test_penalty.py", 41),
         TestFile("test_pytorch_sampling_backend.py", 66),
         TestFile("test_radix_attention.py", 167),
+        TestFile("test_reasoning_content.py", 89),
         TestFile("test_regex_constrained.py", 64),
         TestFile("test_release_memory_occupation.py", 44),
         TestFile("test_request_length_validation.py", 31),
@@ -58,7 +61,6 @@ suites = {
         TestFile("test_torchao.py", 70),
         TestFile("test_triton_attention_kernels.py", 4),
         TestFile("test_triton_attention_backend.py", 134),
-        TestFile("test_hidden_states.py", 55),
         TestFile("test_update_weights_from_disk.py", 114),
         TestFile("test_update_weights_from_tensor.py", 48),
         TestFile("test_vertex_endpoint.py", 31),
@@ -66,10 +68,6 @@ suites = {
         TestFile("test_vision_llm.py", 18.4),
         TestFile("test_vision_openai_server.py", 344),
         TestFile("test_w8a8_quantization.py", 46),
-        TestFile("test_fp8_kernel.py", 2),
-        TestFile("test_block_int8.py", 22),
-        TestFile("test_int8_kernel.py", 1),
-        TestFile("test_reasoning_content.py", 89),
     ],
     "nightly": [
         TestFile("test_nightly_gsm8k_eval.py"),
diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py
index f4140b89f..e015da6a1 100644
--- a/test/srt/test_bench_one_batch.py
+++ b/test/srt/test_bench_one_batch.py
@@ -3,6 +3,7 @@ import unittest
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    get_bool_env_var,
     is_in_ci,
     run_bench_one_batch,
     write_github_step_summary,
@@ -27,9 +28,13 @@ class TestBenchOneBatch(unittest.TestCase):
             DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"]
         )
 
+        use_vllm_custom_allreduce = get_bool_env_var(
+            "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+        )
+
         if is_in_ci():
             write_github_step_summary(
-                f"### test_moe_tp2_bs1\n"
+                f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})\n"
                 f"output_throughput : {output_throughput:.2f} token/s\n"
             )
             self.assertGreater(output_throughput, 124)
diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py
deleted file mode 100644
index c8ce5cff2..000000000
--- a/test/srt/test_eval_accuracy_large_chunked_prefill.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import unittest
-from types import SimpleNamespace
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
-
-
-class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_mmlu(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mmlu",
-            num_examples=3000,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.705, f"{metrics}"
-
-    def test_human_eval(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="humaneval",
-            num_examples=None,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.64, f"{metrics}"
-
-    def test_mgsm_en(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mgsm_en",
-            num_examples=None,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.84, f"{metrics}"
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py
deleted file mode 100644
index 3bc115874..000000000
--- a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import unittest
-from types import SimpleNamespace
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
-
-
-class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--log-level-http",
-                "warning",
-                "--chunked-prefill-size",
-                "256",
-                "--enable-mixed-chunk",
-            ],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_mmlu(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mmlu",
-            num_examples=3000,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.705, f"{metrics}"
-
-    def test_human_eval(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="humaneval",
-            num_examples=None,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.64, f"{metrics}"
-
-    def test_mgsm_en(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mgsm_en",
-            num_examples=None,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.84, f"{metrics}"
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py
deleted file mode 100644
index a008c3869..000000000
--- a/test/srt/test_eval_accuracy_mini.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import unittest
-from types import SimpleNamespace
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
-
-
-class TestEvalAccuracyMini(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_mmlu(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mmlu",
-            num_examples=64,
-            num_threads=32,
-            temperature=0.1,
-        )
-
-        metrics = run_eval(args)
-        self.assertGreaterEqual(metrics["score"], 0.65)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/srt/test_gptqmodel_dynamic.py b/test/srt/test_gptqmodel_dynamic.py
index 92e17a8e4..c9145fe6f 100644
--- a/test/srt/test_gptqmodel_dynamic.py
+++ b/test/srt/test_gptqmodel_dynamic.py
@@ -129,6 +129,7 @@ class TestGPTQModelDynamic(unittest.TestCase):
                 "text": "The capital of France is",
                 "sampling_params": {
                     "max_new_tokens": max_new_tokens,
+                    "temperature": 0.001,
                 },
             },
         )