Move DeepGEMM-related environment variables to sglang.srt.environ (#11547)

2025-10-14 00:34:35 +08:00
parent bfadb5ea5f
commit acc2327bbd
20 changed files with 187 additions and 189 deletions
--- a/test/srt/ep/test_eplb.py
+++ b/test/srt/ep/test_eplb.py
@@ -5,6 +5,7 @@ from pathlib import Path
 from types import SimpleNamespace

 import sglang as sgl
+from sglang.srt.environ import envs
 from sglang.srt.utils import kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
@@ -23,44 +24,43 @@ class _BaseTestDynamicEPLB(CustomTestCase):
    def setUpClass(cls):
        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--tp",
-                "2",
-                "--dp",
-                "2",
-                "--enable-dp-attention",
-                "--moe-a2a-backend",
-                "deepep",
-                "--deepep-mode",
-                "normal",
-                "--disable-cuda-graph",
-                "--enable-eplb",
-                "--ep-num-redundant-experts",
-                "4",
-                "--eplb-rebalance-num-iterations",
-                "50",
-                "--expert-distribution-recorder-buffer-size",
-                "50",
-                # TODO pr-chain: enable later
-                # "--enable-expert-distribution-metrics",
-                # TODO auto determine these flags
-                "--expert-distribution-recorder-mode",
-                "stat",
-                "--ep-dispatch-algorithm",
-                "static",
-                *cls.extra_args,
-            ],
-            env={
-                "SGL_ENABLE_JIT_DEEPGEMM": "0",
-                "SGLANG_EXPERT_LOCATION_UPDATER_CANARY": "1",
-                **os.environ,
-            },
-        )
+        with (
+            envs.SGLANG_ENABLE_JIT_DEEPGEMM.override(False),
+            envs.SGLANG_EXPERT_LOCATION_UPDATER_CANARY.override(True),
+        ):
+            cls.process = popen_launch_server(
+                cls.model,
+                cls.base_url,
+                timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                other_args=[
+                    "--trust-remote-code",
+                    "--tp",
+                    "2",
+                    "--dp",
+                    "2",
+                    "--enable-dp-attention",
+                    "--moe-a2a-backend",
+                    "deepep",
+                    "--deepep-mode",
+                    "normal",
+                    "--disable-cuda-graph",
+                    "--enable-eplb",
+                    "--ep-num-redundant-experts",
+                    "4",
+                    "--eplb-rebalance-num-iterations",
+                    "50",
+                    "--expert-distribution-recorder-buffer-size",
+                    "50",
+                    # TODO pr-chain: enable later
+                    # "--enable-expert-distribution-metrics",
+                    # TODO auto determine these flags
+                    "--expert-distribution-recorder-mode",
+                    "stat",
+                    "--ep-dispatch-algorithm",
+                    "static",
+                    *cls.extra_args,
+                ],
+            )

    @classmethod
    def tearDownClass(cls):
@@ -89,7 +89,7 @@ class TestDynamicEPLBMultiChunk(_BaseTestDynamicEPLB):

 class TestStaticEPLB(CustomTestCase):
    def test_save_expert_distribution_and_init_expert_location(self):
-        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "0"
+        envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)

        with tempfile.TemporaryDirectory() as tmp_dir:
            engine_kwargs = dict(
@@ -108,7 +108,7 @@ class TestStaticEPLB(CustomTestCase):
            )

            print(f"Action: start engine")
-            os.environ["SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR"] = tmp_dir
+            envs.SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR.set(tmp_dir)
            engine = sgl.Engine(
                **engine_kwargs,
                disable_overlap_schedule=True,
--- a/test/srt/ep/test_moe_deepep.py
+++ b/test/srt/ep/test_moe_deepep.py
@@ -3,6 +3,7 @@ import os
 import unittest
 from types import SimpleNamespace

+from sglang.srt.environ import envs
 from sglang.srt.utils import kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
@@ -55,48 +56,45 @@ class TestDPAttn(unittest.TestCase):
    def setUpClass(cls):
        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--tp",
-                "2",
-                "--dp",
-                "2",
-                "--enable-dp-attention",
-                "--moe-a2a-backend",
-                "deepep",
-                "--deepep-mode",
-                "normal",
-                "--disable-cuda-graph",
-                # Test custom config
-                "--deepep-config",
-                json.dumps(
-                    {
-                        "normal_dispatch": {
-                            "num_sms": 20,
-                            "num_max_nvl_chunked_send_tokens": 16,
-                            "num_max_nvl_chunked_recv_tokens": 256,
-                            "num_max_rdma_chunked_send_tokens": 6,
-                            "num_max_rdma_chunked_recv_tokens": 128,
-                        },
-                        "normal_combine": {
-                            "num_sms": 20,
-                            "num_max_nvl_chunked_send_tokens": 6,
-                            "num_max_nvl_chunked_recv_tokens": 256,
-                            "num_max_rdma_chunked_send_tokens": 6,
-                            "num_max_rdma_chunked_recv_tokens": 128,
-                        },
-                    }
-                ),
-            ],
-            env={
-                "SGL_ENABLE_JIT_DEEPGEMM": "0",
-                **os.environ,
-            },
-        )
+        with envs.SGLANG_ENABLE_JIT_DEEPGEMM.override(False):
+            cls.process = popen_launch_server(
+                cls.model,
+                cls.base_url,
+                timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                other_args=[
+                    "--trust-remote-code",
+                    "--tp",
+                    "2",
+                    "--dp",
+                    "2",
+                    "--enable-dp-attention",
+                    "--moe-a2a-backend",
+                    "deepep",
+                    "--deepep-mode",
+                    "normal",
+                    "--disable-cuda-graph",
+                    # Test custom config
+                    "--deepep-config",
+                    json.dumps(
+                        {
+                            "normal_dispatch": {
+                                "num_sms": 20,
+                                "num_max_nvl_chunked_send_tokens": 16,
+                                "num_max_nvl_chunked_recv_tokens": 256,
+                                "num_max_rdma_chunked_send_tokens": 6,
+                                "num_max_rdma_chunked_recv_tokens": 128,
+                            },
+                            "normal_combine": {
+                                "num_sms": 20,
+                                "num_max_nvl_chunked_send_tokens": 6,
+                                "num_max_nvl_chunked_recv_tokens": 256,
+                                "num_max_rdma_chunked_send_tokens": 6,
+                                "num_max_rdma_chunked_recv_tokens": 128,
+                            },
+                        }
+                    ),
+                ],
+            )

    @classmethod
    def tearDownClass(cls):
--- a/test/srt/test_disaggregation_different_tp.py
+++ b/test/srt/test_disaggregation_different_tp.py
@@ -1,8 +1,7 @@
-import os
-import time
 import unittest
 from types import SimpleNamespace

+from sglang.srt.environ import envs
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_disaggregation_utils import TestDisaggregationBase
 from sglang.test.test_utils import (
@@ -18,8 +17,7 @@ class TestDisaggregationMooncakePrefillLargerTP(TestDisaggregationBase):
    def setUpClass(cls):
        super().setUpClass()
        # Temporarily disable JIT DeepGEMM
-        cls.original_jit_deepgemm = os.environ.get("SGL_ENABLE_JIT_DEEPGEMM")
-        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+        envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)

        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA

@@ -90,8 +88,7 @@ class TestDisaggregationMooncakeDecodeLargerTP(TestDisaggregationBase):
    def setUpClass(cls):
        super().setUpClass()
        # Temporarily disable JIT DeepGEMM
-        cls.original_jit_deepgemm = os.environ.get("SGL_ENABLE_JIT_DEEPGEMM")
-        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+        envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)

        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA

@@ -162,8 +159,7 @@ class TestDisaggregationMooncakeMHAPrefillLargerTP(TestDisaggregationBase):
    def setUpClass(cls):
        super().setUpClass()
        # Temporarily disable JIT DeepGEMM
-        cls.original_jit_deepgemm = os.environ.get("SGL_ENABLE_JIT_DEEPGEMM")
-        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+        envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)

        cls.model = DEFAULT_MODEL_NAME_FOR_TEST

@@ -234,8 +230,7 @@ class TestDisaggregationMooncakeMHADecodeLargerTP(TestDisaggregationBase):
    def setUpClass(cls):
        super().setUpClass()
        # Temporarily disable JIT DeepGEMM
-        cls.original_jit_deepgemm = os.environ.get("SGL_ENABLE_JIT_DEEPGEMM")
-        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+        envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)

        cls.model = DEFAULT_MODEL_NAME_FOR_TEST

--- a/test/srt/test_disaggregation_dp_attention.py
+++ b/test/srt/test_disaggregation_dp_attention.py
@@ -2,6 +2,7 @@ import os
 import unittest
 from types import SimpleNamespace

+from sglang.srt.environ import envs
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_disaggregation_utils import TestDisaggregationBase
 from sglang.test.test_utils import (
@@ -16,8 +17,7 @@ class TestDisaggregationDPAttention(TestDisaggregationBase):
    def setUpClass(cls):
        super().setUpClass()
        # Temporarily disable JIT DeepGEMM
-        cls.original_jit_deepgemm = os.environ.get("SGL_ENABLE_JIT_DEEPGEMM")
-        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+        envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)

        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA

--- a/test/srt/test_expert_distribution.py
+++ b/test/srt/test_expert_distribution.py
@@ -6,9 +6,9 @@ from pathlib import Path
 import requests
 import torch

+from sglang.srt.environ import envs
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
-    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
@@ -32,7 +32,7 @@ class TestExpertDistribution(CustomTestCase):
    def _execute_core(self, model_path: str, mode: str = "stat", tp_size: int = 1):
        """Test expert distribution record endpoints"""
        with tempfile.TemporaryDirectory() as tmp_dir:
-            os.environ["SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR"] = tmp_dir
+            envs.SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR.set(tmp_dir)

            process = popen_launch_server(
                model_path,
--- a/test/srt/test_fa3.py
+++ b/test/srt/test_fa3.py
@@ -1,9 +1,9 @@
-import os
 import unittest
 from types import SimpleNamespace

 import requests

+from sglang.srt.environ import envs
 from sglang.srt.utils import get_device_sm, kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_utils import (
@@ -77,14 +77,16 @@ class BaseFlashAttentionTest(CustomTestCase):
    def setUpClass(cls):
        # disable deep gemm precompile to make launch server faster
        # please don't do this if you want to make your inference workload faster
-        os.environ["SGL_JIT_DEEPGEMM_PRECOMPILE"] = "false"
-        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=cls.get_server_args(),
-        )
+        with (
+            envs.SGLANG_JIT_DEEPGEMM_PRECOMPILE.override(False),
+            envs.SGLANG_ENABLE_JIT_DEEPGEMM.override(False),
+        ):
+            cls.process = popen_launch_server(
+                cls.model,
+                cls.base_url,
+                timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                other_args=cls.get_server_args(),
+            )

    @classmethod
    def tearDownClass(cls):
--- a/test/srt/test_hybrid_attn_backend.py
+++ b/test/srt/test_hybrid_attn_backend.py
@@ -4,6 +4,7 @@ from types import SimpleNamespace

 import requests

+from sglang.srt.environ import envs
 from sglang.srt.utils import get_device_sm, kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_utils import (
@@ -49,18 +50,20 @@ class TestHybridAttnBackendBase(CustomTestCase):
    def setUpClass(cls):
        # disable deep gemm precompile to make launch server faster
        # please don't do this if you want to make your inference workload faster
-        os.environ["SGL_JIT_DEEPGEMM_PRECOMPILE"] = "false"
-        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
-        if cls.speculative_decode:
-            model = DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST
-        else:
-            model = cls.model
-        cls.process = popen_launch_server(
-            model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=cls.get_server_args(),
-        )
+        with (
+            envs.SGLANG_JIT_DEEPGEMM_PRECOMPILE.override(False),
+            envs.SGLANG_ENABLE_JIT_DEEPGEMM.override(False),
+        ):
+            if cls.speculative_decode:
+                model = DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST
+            else:
+                model = cls.model
+            cls.process = popen_launch_server(
+                model,
+                cls.base_url,
+                timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                other_args=cls.get_server_args(),
+            )

    @classmethod
    def tearDownClass(cls):
--- a/test/srt/test_ngram_speculative_decoding.py
+++ b/test/srt/test_ngram_speculative_decoding.py
@@ -1,9 +1,9 @@
-import os
 import unittest
 from types import SimpleNamespace

 import requests

+from sglang.srt.environ import envs
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_utils import (
@@ -47,8 +47,8 @@ class TestNgramSpeculativeDecodingBase(CustomTestCase):
    def setUpClass(cls):
        # disable deep gemm precompile to make launch server faster
        # please don't do this if you want to make your inference workload faster
-        os.environ["SGL_JIT_DEEPGEMM_PRECOMPILE"] = "false"
-        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+        envs.SGLANG_JIT_DEEPGEMM_PRECOMPILE.set(False)
+        envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
        model = cls.model
        cls.process = popen_launch_server(
            model,
--- a/test/srt/test_standalone_speculative_decoding.py
+++ b/test/srt/test_standalone_speculative_decoding.py
@@ -1,9 +1,9 @@
-import os
 import unittest
 from types import SimpleNamespace

 import requests

+from sglang.srt.environ import envs
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_utils import (
@@ -55,8 +55,8 @@ class TestStandaloneSpeculativeDecodingBase(CustomTestCase):
    def setUpClass(cls):
        # disable deep gemm precompile to make launch server faster
        # please don't do this if you want to make your inference workload faster
-        os.environ["SGL_JIT_DEEPGEMM_PRECOMPILE"] = "false"
-        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+        envs.SGLANG_JIT_DEEPGEMM_PRECOMPILE.set(False)
+        envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
        model = cls.model
        cls.process = popen_launch_server(
            model,
--- a/test/srt/test_two_batch_overlap.py
+++ b/test/srt/test_two_batch_overlap.py
@@ -1,9 +1,9 @@
-import os
 import unittest
 from types import SimpleNamespace

 import requests

+from sglang.srt.environ import envs
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.two_batch_overlap import (
    compute_split_seq_index,
@@ -25,26 +25,26 @@ class TestTwoBatchOverlap(unittest.TestCase):
    def setUpClass(cls):
        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--tp",
-                "2",
-                "--dp",
-                "2",
-                "--enable-dp-attention",
-                "--moe-a2a-backend",
-                "deepep",
-                "--deepep-mode",
-                "normal",
-                "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph
-                "--enable-two-batch-overlap",
-            ],
-            env={"SGL_ENABLE_JIT_DEEPGEMM": "0", **os.environ},
-        )
+        with envs.SGLANG_ENABLE_JIT_DEEPGEMM.override(False):
+            cls.process = popen_launch_server(
+                cls.model,
+                cls.base_url,
+                timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                other_args=[
+                    "--trust-remote-code",
+                    "--tp",
+                    "2",
+                    "--dp",
+                    "2",
+                    "--enable-dp-attention",
+                    "--moe-a2a-backend",
+                    "deepep",
+                    "--deepep-mode",
+                    "normal",
+                    "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph
+                    "--enable-two-batch-overlap",
+                ],
+            )

    @classmethod
    def tearDownClass(cls):
@@ -126,26 +126,26 @@ class TestQwen3TwoBatchOverlap(TestTwoBatchOverlap):
        cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-1234"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--tp",
-                "2",
-                "--dp",
-                "2",
-                "--enable-dp-attention",
-                "--moe-a2a-backend",
-                "deepep",
-                "--deepep-mode",
-                "normal",
-                "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph
-                "--enable-two-batch-overlap",
-            ],
-            env={"SGL_ENABLE_JIT_DEEPGEMM": "0", **os.environ},
-        )
+        with envs.SGLANG_ENABLE_JIT_DEEPGEMM.override(False):
+            cls.process = popen_launch_server(
+                cls.model,
+                cls.base_url,
+                timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                other_args=[
+                    "--trust-remote-code",
+                    "--tp",
+                    "2",
+                    "--dp",
+                    "2",
+                    "--enable-dp-attention",
+                    "--moe-a2a-backend",
+                    "deepep",
+                    "--deepep-mode",
+                    "normal",
+                    "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph
+                    "--enable-two-batch-overlap",
+                ],
+            )


 if __name__ == "__main__":