diff --git a/.github/workflows/nightly-eval.yml b/.github/workflows/nightly-test.yml
similarity index 62%
rename from .github/workflows/nightly-eval.yml
rename to .github/workflows/nightly-test.yml
index 7b77c63a5..04a109f23 100644
--- a/.github/workflows/nightly-eval.yml
+++ b/.github/workflows/nightly-test.yml
@@ -1,4 +1,4 @@
-name: Nightly Evaluation
+name: Nightly Test
 
 on:
   schedule:
@@ -11,11 +11,11 @@ on:
   workflow_dispatch:
 
 concurrency:
-  group: nightly-eval-${{ github.ref }}
+  group: nightly-test-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
-  nightly-eval-2-gpu:
+  nightly-test:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 2-gpu-runner
     steps:
@@ -27,14 +27,8 @@ jobs:
           bash scripts/ci_install_dependency.sh
           pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
 
-      - name: Test gsm8k
-        timeout-minutes: 120
+      - name: Run test
+        timeout-minutes: 10
         run: |
-          cd test/srt
-          python3 test_nightly_gsm8k_eval.py
-
-      - name: Test human eval
-        timeout-minutes: 120
-        run: |
-          cd test/srt
-          python3 test_nightly_human_eval.py
+          cd test/lang
+          python3 run_suite.py --suite nightly --timeout-per-file 2400
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index dd872c768..36b53baa3 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -45,7 +45,7 @@ jobs:
         timeout-minutes: 10
         run: |
           cd test/lang
-          python3 run_suite.py --suite minimal
+          python3 run_suite.py --suite per-commit
 
   unit-test-backend-1-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -70,7 +70,7 @@ jobs:
           RANGE=${{ matrix.range }}
           range_begin=${RANGE%-*}
           range_end=${RANGE#*-}
-          python3 run_suite.py --suite minimal --range-begin ${range_begin} --range-end ${range_end}
+          python3 run_suite.py --suite per-commit --range-begin ${range_begin} --range-end ${range_end}
 
   unit-test-backend-2-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
diff --git a/docs/references/benchmark_and_profiling.md b/docs/references/benchmark_and_profiling.md
index 329dad336..87ac51774 100644
--- a/docs/references/benchmark_and_profiling.md
+++ b/docs/references/benchmark_and_profiling.md
@@ -56,6 +56,8 @@ with nvtx.annotate("description", color="color"):
 ## Other tips
 
 1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder.
+2. You can benchmark a model with modified configs (e.g., fewer layers) by using `--json-model-override-args`. For example, you can benchmark a model with only 1 layer and 1 kv head using `python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch 32 --input-len 256 --output-len 32 --load-format dummy --json-model-override-args '{"num_hidden_layers": 1, "num_key_value_heads": 1}'`
+
 
 ## Profile with PyTorch Profiler
 - To profile a server
diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index ac2474549..6067a7444 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -897,6 +897,7 @@ async def benchmark(
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    # Limit concurrency
     # From https://github.com/vllm-project/vllm/pull/9390
     semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
 
@@ -906,6 +907,7 @@ async def benchmark(
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
+    # Warmup
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     test_input = RequestFuncInput(
@@ -924,11 +926,15 @@ async def benchmark(
             f"are correctly specified. Error: {test_output.error}"
         )
     else:
-        requests.post(base_url + "/flush_cache")
         print("Initial test run completed. Starting main benchmark run...")
 
-    time.sleep(1.5)
+    # Flush cache
+    if "sglang" in backend:
+        requests.post(base_url + "/flush_cache")
+
+    time.sleep(1.0)
 
+    # Start profiler
     if profile:
         print("Starting profiler...")
         profile_output = await async_request_profile(
@@ -939,6 +945,7 @@ async def benchmark(
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
+    # Run all requests
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
@@ -959,6 +966,7 @@ async def benchmark(
         )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
+    # Stop profiler
     if profile:
         print("Stopping profiler...")
         profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
@@ -968,8 +976,8 @@ async def benchmark(
     if pbar is not None:
         pbar.close()
 
+    # Compute metrics and print results
     benchmark_duration = time.perf_counter() - benchmark_start_time
-
     metrics, output_lens = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/README b/python/sglang/srt/layers/moe/fused_moe_triton/configs/README
index 45d40cbfb..4aa527f27 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/README
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/README
@@ -8,3 +8,5 @@ the JSON file contains a mapping from M (batch size) to the chosen configuration
 The example configurations provided are for the Mixtral model for TP2 on H100
 and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
 N = 7168 and for TP4 we have N = 3584.
+
+See `benchmark/kernels/fused_moe_triton/README.md` for instructions on how to generate these config files.
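Note: the `# Limit concurrency` comment added to `bench_serving.py` above describes the semaphore-gating pattern already present in that function. The following is a minimal, self-contained sketch of that pattern; `fake_request`, `limited_request`, and `run_benchmark` are hypothetical stand-ins for `request_func` and `benchmark`, not part of sglang.

```python
import asyncio
import random
from typing import Optional


async def fake_request(i: int) -> int:
    # Hypothetical stand-in for request_func(request_func_input=..., pbar=...).
    await asyncio.sleep(random.uniform(0.01, 0.05))
    return i


async def run_benchmark(n_requests: int = 100, max_concurrency: Optional[int] = 8) -> None:
    # Same shape as bench_serving.py: the semaphore is optional, and each request
    # acquires it before running, so at most max_concurrency requests are in
    # flight at any time.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited_request(i: int) -> int:
        if semaphore is None:
            return await fake_request(i)
        async with semaphore:
            return await fake_request(i)

    results = await asyncio.gather(*(limited_request(i) for i in range(n_requests)))
    print(f"Completed {len(results)} requests")


if __name__ == "__main__":
    asyncio.run(run_benchmark())
```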
diff --git a/sgl-kernel/tests/.gitkeep b/sgl-kernel/tests/.gitkeep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/test/lang/run_suite.py b/test/lang/run_suite.py
index 379427afa..ebc26e608 100644
--- a/test/lang/run_suite.py
+++ b/test/lang/run_suite.py
@@ -4,7 +4,7 @@ import glob
 from sglang.test.test_utils import run_unittest_files
 
 suites = {
-    "minimal": ["test_srt_backend.py", "test_openai_backend.py"],
+    "per-commit": ["test_srt_backend.py", "test_openai_backend.py"],
 }
 
 
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 137507656..a0ca5fabb 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -4,7 +4,7 @@ import glob
 from sglang.test.test_utils import run_unittest_files
 
 suites = {
-    "minimal": [
+    "per-commit": [
         "models/test_embedding_models.py",
         "models/test_generation_models.py",
         "models/test_lora.py",
diff --git a/test/srt/test_triton_attention_backend.py b/test/srt/test_triton_attention_backend.py
index 905590965..88904c55f 100644
--- a/test/srt/test_triton_attention_backend.py
+++ b/test/srt/test_triton_attention_backend.py
@@ -30,7 +30,7 @@ class TestTritonAttnBackend(unittest.TestCase):
         )
 
         if is_in_ci():
-            assert output_throughput > 153, f"{output_throughput=}"
+            self.assertGreater(output_throughput, 153)
 
     def test_mmlu(self):
         model = DEFAULT_MODEL_NAME_FOR_TEST
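For reference, the switch from a bare `assert` to `self.assertGreater` in `test_triton_attention_backend.py` follows the standard `unittest` idiom: `assertGreater` reports both operands on failure without a hand-written message, and unlike a bare `assert` it is not stripped when Python runs with `-O`. A minimal sketch of the idiom follows; the class name and the hard-coded throughput value are illustrative, not taken from the sglang test.

```python
import unittest


class ThroughputCheck(unittest.TestCase):
    def test_throughput(self):
        # Illustrative measured value; the real test obtains this from
        # run_bench_one_batch(...).
        output_throughput = 160.0
        # On failure this reports e.g. "AssertionError: 150.0 not greater
        # than 153", so no f-string message is needed.
        self.assertGreater(output_throughput, 153)


if __name__ == "__main__":
    unittest.main()
```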