diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml
index f3e05dd84..7298d80ec 100644
--- a/.github/workflows/execute-notebook.yml
+++ b/.github/workflows/execute-notebook.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
           pip install -r docs/requirements.txt
           apt-get update && apt-get install -y pandoc parallel retry
           ln -sf "$(which python3)" /usr/bin/python
diff --git a/.github/workflows/experiment-runner.yml b/.github/workflows/experiment-runner.yml
index f3382320b..487ed9ba3 100644
--- a/.github/workflows/experiment-runner.yml
+++ b/.github/workflows/experiment-runner.yml
@@ -21,7 +21,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
 
       - name: Test experiment runner
         timeout-minutes: 120
diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml
index 2cdb55ef1..096e876de 100644
--- a/.github/workflows/nightly-test-amd.yml
+++ b/.github/workflows/nightly-test-amd.yml
@@ -28,14 +28,14 @@ jobs:
       - name: Setup docker
         run: |
           touch github_summary.md
-          bash scripts/amd_ci_start_container.sh
+          bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
-        run: bash scripts/amd_ci_install_dependency.sh
+        run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Nightly Test
         run: |
-          bash scripts/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
+          bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
           echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/nightly-test.yml b/.github/workflows/nightly-test.yml
index 5d6cf34c3..a32c1dbea 100644
--- a/.github/workflows/nightly-test.yml
+++ b/.github/workflows/nightly-test.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 120
diff --git a/.github/workflows/pr-benchmark-rust.yml b/.github/workflows/pr-benchmark-rust.yml
index b2f16bf4d..e34454c19 100644
--- a/.github/workflows/pr-benchmark-rust.yml
+++ b/.github/workflows/pr-benchmark-rust.yml
@@ -31,7 +31,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_rust.sh
+          bash scripts/ci/ci_install_rust.sh
 
       - name: Cache Rust dependencies
         uses: actions/cache@v4
@@ -78,7 +78,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_rust.sh
+          bash scripts/ci/ci_install_rust.sh
 
       - name: Cache Rust dependencies
         uses: actions/cache@v4
diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index aba17ccb9..9756356bb 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -36,19 +36,19 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Start CI container
-        run: bash scripts/amd_ci_start_container.sh
+        run: bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
-        run: bash scripts/amd_ci_install_dependency.sh
+        run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Evaluate Accuracy
         timeout-minutes: 30
         run: |
-          bash scripts/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
-          bash scripts/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
-          bash scripts/amd_ci_exec.sh python3 models/test_qwen_models.py
+          bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
+          bash scripts/ci/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
+          bash scripts/ci/amd_ci_exec.sh python3 models/test_qwen_models.py
 
   accuracy-test-2-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -62,17 +62,17 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Start CI container
-        run: bash scripts/amd_ci_start_container.sh
+        run: bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
-        run: bash scripts/amd_ci_install_dependency.sh
+        run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Evaluate accuracy (TP=2)
         timeout-minutes: 30
         run: |
-          bash scripts/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
+          bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
 
   mla-test-1-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -86,17 +86,17 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Start CI container
-        run: bash scripts/amd_ci_start_container.sh
+        run: bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
-        run: bash scripts/amd_ci_install_dependency.sh
+        run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: MLA TEST
         timeout-minutes: 30
         run: |
-          bash scripts/amd_ci_exec.sh python3 test_mla.py
+          bash scripts/ci/amd_ci_exec.sh python3 test_mla.py
 
   performance-test-1-gpu-part-1-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -110,33 +110,33 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Start CI container
-        run: bash scripts/amd_ci_start_container.sh
+        run: bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
-        run: bash scripts/amd_ci_install_dependency.sh
+        run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Benchmark single latency
         timeout-minutes: 20
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
 
       - name: Benchmark online latency
         timeout-minutes: 15
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
 
       - name: Benchmark offline throughput
         timeout-minutes: 15
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
 
       - name: Benchmark offline throughput (Non-streaming, small batch size)
         timeout-minutes: 15
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
 
   performance-test-1-gpu-part-2-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -150,27 +150,27 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Start CI container
-        run: bash scripts/amd_ci_start_container.sh
+        run: bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
-        run: bash scripts/amd_ci_install_dependency.sh
+        run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Benchmark offline throughput (w/o RadixAttention)
         timeout-minutes: 15
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
 
       - name: Benchmark offline throughput (w/ Triton)
         timeout-minutes: 15
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
 
       - name: Benchmark offline throughput (w/ FP8)
         timeout-minutes: 15
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
 
   bench-test-2-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -184,37 +184,37 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Start CI container
-        run: bash scripts/amd_ci_start_container.sh
+        run: bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
-        run: bash scripts/amd_ci_install_dependency.sh
+        run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Benchmark dummy grok (TP=2)
         timeout-minutes: 30
         run: |
-          bash scripts/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
+          bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
 
       - name: Benchmark single latency (TP=2)
         timeout-minutes: 25
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
 
       - name: Benchmark single latency + torch.compile (TP=2)
         timeout-minutes: 25
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
 
       - name: Benchmark offline throughput (TP=2)
         timeout-minutes: 25
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
 
       - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
         timeout-minutes: 25
         run: |
-          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
 
   unit-test-backend-1-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -230,17 +230,17 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Start CI container
-        run: bash scripts/amd_ci_start_container.sh
+        run: bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
-        run: bash scripts/amd_ci_install_dependency.sh
+        run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 50
         run: |
-          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
+          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
 
   unit-test-backend-2-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -254,17 +254,17 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Start CI container
-        run: bash scripts/amd_ci_start_container.sh
+        run: bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
-        run: bash scripts/amd_ci_install_dependency.sh
+        run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 40
         run: |
-          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
+          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
 
   unit-test-backend-8-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -278,22 +278,22 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Start CI container
-        run: bash scripts/amd_ci_start_container.sh
+        run: bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
-        run: bash scripts/amd_ci_install_dependency.sh
+        run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 60
         run: |
-          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
+          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
 
       - name: Run CustomAllReduce test
         timeout-minutes: 20
         run: |
-          bash scripts/amd_ci_exec.sh -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m unittest test_custom_allreduce.TestCustomAllReduce
+          bash scripts/ci/amd_ci_exec.sh -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m unittest test_custom_allreduce.TestCustomAllReduce
 
   unit-test-sgl-kernel-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -308,13 +308,13 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Start CI container
-        run: bash scripts/amd_ci_start_container.sh
+        run: bash scripts/ci/amd_ci_start_container.sh
         env:
           GITHUB_WORKSPACE: ${{ github.workspace }}
 
       - name: Install dependencies
         run: |
-          bash scripts/amd_ci_install_dependency.sh
+          bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 10
diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml
index 777d75435..fe03a0db1 100644
--- a/.github/workflows/pr-test-npu.yml
+++ b/.github/workflows/pr-test-npu.yml
@@ -34,7 +34,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/npu_ci_install_dependency.sh
+          bash scripts/ci/npu_ci_install_dependency.sh
           # copy required file from our daily cache
           cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
           # copy download through proxy
@@ -63,7 +63,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/npu_ci_install_dependency.sh
+          bash scripts/ci/npu_ci_install_dependency.sh
           # copy required file from our daily cache
           cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
           # copy download through proxy
@@ -92,7 +92,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/npu_ci_install_dependency.sh
+          bash scripts/ci/npu_ci_install_dependency.sh
           # copy required file from our daily cache
           cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
           # copy download through proxy
diff --git a/.github/workflows/pr-test-pd-router.yml b/.github/workflows/pr-test-pd-router.yml
index acae4c610..caca5c94e 100644
--- a/.github/workflows/pr-test-pd-router.yml
+++ b/.github/workflows/pr-test-pd-router.yml
@@ -5,13 +5,13 @@ on:
     branches: [ main ]
     paths:
       - 'python/sglang/srt/disaggregation/**'
-      - 'scripts/ci_start_disaggregation_servers.sh'
+      - 'scripts/ci/ci_start_disaggregation_servers.sh'
       - 'sgl-router/**'
   pull_request:
     branches: [ main ]
     paths:
       - 'python/sglang/srt/disaggregation/**'
-      - 'scripts/ci_start_disaggregation_servers.sh'
+      - 'scripts/ci/ci_start_disaggregation_servers.sh'
       - 'sgl-router/**'
   workflow_dispatch:
 
@@ -44,7 +44,7 @@ jobs:
 
     - name: Setup Rust
       run: |
-        bash scripts/ci_install_rust.sh
+        bash scripts/ci/ci_install_rust.sh
 
     - name: Cache Rust dependencies
       uses: actions/cache@v4
@@ -132,7 +132,7 @@ jobs:
       id: start_servers
       run: |
         echo "Starting disaggregation servers..."
-        bash scripts/ci_start_disaggregation_servers.sh &
+        bash scripts/ci/ci_start_disaggregation_servers.sh &
         SERVER_PID=$!
         echo "server_pid=$SERVER_PID" >> $GITHUB_OUTPUT
 
diff --git a/.github/workflows/pr-test-rust.yml b/.github/workflows/pr-test-rust.yml
index b5c3cd01d..cc44192cb 100644
--- a/.github/workflows/pr-test-rust.yml
+++ b/.github/workflows/pr-test-rust.yml
@@ -25,7 +25,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_rust.sh
+          bash scripts/ci/ci_install_rust.sh
 
       - name: Run fmt
         run: |
@@ -64,7 +64,7 @@ jobs:
 
       - name: Install rust dependencies
         run: |
-          bash scripts/ci_install_rust.sh
+          bash scripts/ci/ci_install_rust.sh
 
       - name: Build python binding
         run: |
diff --git a/.github/workflows/pr-test-sgl-kernel.yml b/.github/workflows/pr-test-sgl-kernel.yml
index a3c84a539..624d9ed32 100644
--- a/.github/workflows/pr-test-sgl-kernel.yml
+++ b/.github/workflows/pr-test-sgl-kernel.yml
@@ -84,7 +84,7 @@ jobs:
 
       - name: Install
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
           pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126 && pip3 install pytest
           pip3 uninstall sgl-kernel -y || true
           pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
@@ -116,7 +116,7 @@ jobs:
 
       - name: Install
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
           pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
           pip3 uninstall sgl-kernel -y || true
           pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 65cf23bfa..7f76b02bf 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -52,7 +52,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 10
@@ -76,7 +76,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 30
@@ -96,7 +96,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 30
@@ -120,7 +120,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 20
@@ -144,7 +144,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 20
@@ -164,7 +164,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
 
       - name: Benchmark single latency
         timeout-minutes: 10
@@ -216,7 +216,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
 
       - name: Benchmark offline throughput (w/o RadixAttention)
         timeout-minutes: 10
@@ -260,7 +260,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
 
       - name: Benchmark single latency (TP=2)
         timeout-minutes: 10
@@ -310,7 +310,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
           git clone https://github.com/merrymercy/human-eval.git
           cd human-eval
           pip install -e .
@@ -333,7 +333,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
           git clone https://github.com/merrymercy/human-eval.git
           cd human-eval
           pip install -e .
@@ -356,7 +356,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_deepep.sh
+          bash scripts/ci/ci_install_deepep.sh
 
       - name: Run test
         timeout-minutes: 20
@@ -376,7 +376,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_deepep.sh
+          bash scripts/ci/ci_install_deepep.sh
 
       - name: Run test
         timeout-minutes: 20
@@ -398,7 +398,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          IS_BLACKWELL=1 bash scripts/ci_install_dependency.sh
+          IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 20
diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml
index 591cd5bdf..f4ae7ae3e 100644
--- a/.github/workflows/release-docs.yml
+++ b/.github/workflows/release-docs.yml
@@ -24,29 +24,31 @@ jobs:
 
       - name: Install dependencies
         run: |
-          find /public_sglang_ci/runner-a-gpu-1/_work/_tool/Python/3.10.13/x64/lib/python3.10/site-packages -name "sgl-kernel*" -exec rm -rf {} + || true
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
           pip install -r docs/requirements.txt
-          apt-get update
-          apt-get install -y pandoc
-          apt-get update && apt-get install -y parallel retry
-
+          apt-get update && apt-get install -y pandoc parallel retry
           ln -sf "$(which python3)" /usr/bin/python
 
       - name: Setup Jupyter Kernel
         run: |
           python -m ipykernel install --user --name python3 --display-name "Python 3"
 
-      - name: Execute notebooks and push to documents
-        env:
-          GITHUB_TOKEN: ${{ secrets.DOCUMENTATION_PAT_TOKEN }}
+      - name: Execute notebooks
+        timeout-minutes: 40
         run: |
           cd docs
           make clean
           make compile
 
+      - name: Push HTML to sgl-project.github.io
+        # The git clone below embeds $GITHUB_TOKEN in its URL, so the token must be exported to this step.
+        env:
+          GITHUB_TOKEN: ${{ secrets.DOCUMENTATION_PAT_TOKEN }}
+        run: |
+          cd docs
           make html
           python3 wrap_run_llm.py
+
           cd _build/html
 
           git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1
diff --git a/.github/workflows/vllm-dependency-test.yml b/.github/workflows/vllm-dependency-test.yml
index 7dc6a8ba6..5bb1392e1 100644
--- a/.github/workflows/vllm-dependency-test.yml
+++ b/.github/workflows/vllm-dependency-test.yml
@@ -29,7 +29,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          bash scripts/ci_install_dependency.sh
+          bash scripts/ci/ci_install_dependency.sh
           pip install "vllm==0.10.0"
           pip install "openai==1.99.1"
           pip install "bitsandbytes>=0.44.0"
diff --git a/README.md b/README.md
index 3b3a226b9..63a8952c6 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
-| [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042)
+| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
diff --git a/scripts/amd_ci_exec.sh b/scripts/ci/amd_ci_exec.sh
similarity index 100%
rename from scripts/amd_ci_exec.sh
rename to scripts/ci/amd_ci_exec.sh
diff --git a/scripts/amd_ci_install_dependency.sh b/scripts/ci/amd_ci_install_dependency.sh
similarity index 100%
rename from scripts/amd_ci_install_dependency.sh
rename to scripts/ci/amd_ci_install_dependency.sh
diff --git a/scripts/amd_ci_start_container.sh b/scripts/ci/amd_ci_start_container.sh
similarity index 98%
rename from scripts/amd_ci_start_container.sh
rename to scripts/ci/amd_ci_start_container.sh
index ebb41debf..5d1e6cfe1 100755
--- a/scripts/amd_ci_start_container.sh
+++ b/scripts/ci/amd_ci_start_container.sh
@@ -3,7 +3,7 @@ set -euo pipefail
 
 # Get version from SGLang version.py file
 FALLBACK_SGLANG_VERSION="v0.4.10.post2"
-SGLANG_VERSION_FILE="$(dirname "$0")/../python/sglang/version.py"
+SGLANG_VERSION_FILE="$(dirname "$0")/../../python/sglang/version.py"
 
 if [ -f "$SGLANG_VERSION_FILE" ]; then
   SGLANG_VERSION=$(python3 -c '
diff --git a/scripts/ci_install_deepep.sh b/scripts/ci/ci_install_deepep.sh
similarity index 98%
rename from scripts/ci_install_deepep.sh
rename to scripts/ci/ci_install_deepep.sh
index e743bddaf..d82dca935 100755
--- a/scripts/ci_install_deepep.sh
+++ b/scripts/ci/ci_install_deepep.sh
@@ -2,7 +2,7 @@
 # Install the dependency in CI.
 set -euxo pipefail
 
-bash scripts/ci_install_dependency.sh
+bash scripts/ci/ci_install_dependency.sh
 
 export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
 export NVSHMEM_DIR=/opt/nvshmem/install
diff --git a/scripts/ci_install_dependency.sh b/scripts/ci/ci_install_dependency.sh
similarity index 98%
rename from scripts/ci_install_dependency.sh
rename to scripts/ci/ci_install_dependency.sh
index 0ad51c7a3..83108a0e1 100755
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci/ci_install_dependency.sh
@@ -12,7 +12,7 @@ fi
 
 # Kill existing processes
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-bash "${SCRIPT_DIR}/killall_sglang.sh"
+bash "${SCRIPT_DIR}/../killall_sglang.sh"
 
 # Install apt packages
 apt install -y git libnuma-dev
diff --git a/scripts/ci_install_rust.sh b/scripts/ci/ci_install_rust.sh
similarity index 100%
rename from scripts/ci_install_rust.sh
rename to scripts/ci/ci_install_rust.sh
diff --git a/scripts/ci_start_disaggregation_servers.sh b/scripts/ci/ci_start_disaggregation_servers.sh
similarity index 100%
rename from scripts/ci_start_disaggregation_servers.sh
rename to scripts/ci/ci_start_disaggregation_servers.sh
diff --git a/scripts/npu_ci_install_dependency.sh b/scripts/ci/npu_ci_install_dependency.sh
similarity index 100%
rename from scripts/npu_ci_install_dependency.sh
rename to scripts/ci/npu_ci_install_dependency.sh
diff --git a/scripts/ci_cache_models.sh b/scripts/ci_cache_models.sh
deleted file mode 100755
index 0ebe6c055..000000000
--- a/scripts/ci_cache_models.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-set -euxo pipefail
-
-mapfile -t models < <(python3 -c "from sglang.test.test_utils import _get_default_models; print(_get_default_models())" | jq -r '.[]')
-
-if [ ${#models[@]} -eq 0 ]; then
-    echo "Failed to get default models."
-    exit 1
-fi
-
-cache_dir="${DEFAULT_MODEL_CACHE_DIR:-}"
-
-if [ -z "$cache_dir" ]; then
-    echo "DEFAULT_MODEL_CACHE_DIR environment variable is not set."
-    exit 1
-fi
-
-failed_models=()
-for model in "${models[@]}"; do
-    local_model_dir="$cache_dir/$model"
-    echo "Caching model: $model to $local_model_dir"
-    mkdir -p "$local_model_dir"
-
-    if ! huggingface-cli download "$model" \
-        --local-dir "$local_model_dir" \
-        --local-dir-use-symlinks False 2>/dev/null; then
-        echo "WARNING: Failed to cache model: $model"
-        rm -rf "$local_model_dir"
-        failed_models+=("$model")
-        continue
-    fi
-    echo "Successfully cached model: $model"
-done
-
-if [ ${#failed_models[@]} -gt 0 ]; then
-    echo -e "\n[Summary] Failed to cache following models:"
-    printf ' - %s\n' "${failed_models[@]}"
-else
-    echo -e "\n[Summary] All models cached successfully"
-fi
diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt
index aeefd3371..4fa98e436 100644
--- a/sgl-kernel/CMakeLists.txt
+++ b/sgl-kernel/CMakeLists.txt
@@ -87,6 +87,7 @@ FetchContent_Declare(
     GIT_SHALLOW    OFF
 )
 FetchContent_Populate(repo-flashinfer)
+
 # flash-attention
 FetchContent_Declare(
     repo-flash-attention
@@ -95,6 +96,7 @@ FetchContent_Declare(
     GIT_SHALLOW    OFF
 )
 FetchContent_Populate(repo-flash-attention)
+
 # mscclpp
 FetchContent_Declare(
     repo-mscclpp
@@ -232,6 +234,7 @@ set(SOURCES
     "csrc/elementwise/activation.cu"
     "csrc/elementwise/fused_add_rms_norm_kernel.cu"
     "csrc/elementwise/rope.cu"
+    "csrc/common_extension.cc"
     "csrc/gemm/awq_kernel.cu"
     "csrc/gemm/bmm_fp8.cu"
     "csrc/gemm/dsv3_fused_a_gemm.cu"
@@ -251,24 +254,10 @@ set(SOURCES
     "csrc/gemm/per_token_quant_fp8.cu"
     "csrc/gemm/qserve_w4a8_per_chn_gemm.cu"
     "csrc/gemm/qserve_w4a8_per_group_gemm.cu"
-    "csrc/moe/moe_align_kernel.cu"
-    "csrc/moe/moe_fused_gate.cu"
-    "csrc/moe/moe_topk_softmax_kernels.cu"
-    "csrc/moe/nvfp4_blockwise_moe.cu"
-    "csrc/moe/fp8_blockwise_moe_kernel.cu"
-    "csrc/moe/prepare_moe_input.cu"
-    "csrc/moe/ep_moe_reorder_kernel.cu"
-    "csrc/moe/ep_moe_silu_and_mul_kernel.cu"
-    "csrc/speculative/eagle_utils.cu"
-    "csrc/speculative/packbit.cu"
-    "csrc/spatial/greenctx_stream.cu"
-    "csrc/speculative/speculative_sampling.cu"
     "csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
-    "csrc/kvcacheio/transfer.cu"
     "csrc/moe/cutlass_moe/w4a8/scaled_mm_entry.cu"
     "csrc/moe/cutlass_moe/w4a8/w4a8_moe_data.cu"
     "csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cu"
-    "csrc/common_extension.cc"
     "csrc/moe/marlin_moe_wna16/ops.cu"
     "csrc/moe/marlin_moe_wna16/gptq_marlin_repack.cu"
     "csrc/moe/marlin_moe_wna16/awq_marlin_repack.cu"
@@ -278,6 +267,19 @@ set(SOURCES
     "csrc/moe/marlin_moe_wna16/kernel_fp16_ku4.cu"
     "csrc/moe/marlin_moe_wna16/kernel_fp16_ku4b8.cu"
     "csrc/moe/marlin_moe_wna16/kernel_fp16_ku8b128.cu"
+    "csrc/moe/moe_align_kernel.cu"
+    "csrc/moe/moe_fused_gate.cu"
+    "csrc/moe/moe_topk_softmax_kernels.cu"
+    "csrc/moe/nvfp4_blockwise_moe.cu"
+    "csrc/moe/fp8_blockwise_moe_kernel.cu"
+    "csrc/moe/prepare_moe_input.cu"
+    "csrc/moe/ep_moe_reorder_kernel.cu"
+    "csrc/moe/ep_moe_silu_and_mul_kernel.cu"
+    "csrc/kvcacheio/transfer.cu"
+    "csrc/speculative/eagle_utils.cu"
+    "csrc/speculative/packbit.cu"
+    "csrc/spatial/greenctx_stream.cu"
+    "csrc/speculative/speculative_sampling.cu"
     "${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
     "${repo-flashinfer_SOURCE_DIR}/csrc/renorm.cu"
     "${repo-flashinfer_SOURCE_DIR}/csrc/sampling.cu"
@@ -312,12 +314,15 @@ else()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
 endif()
+
+# mscclpp
 set(MSCCLPP_USE_CUDA ON)
 set(MSCCLPP_BYPASS_GPU_CHECK ON)
 set(MSCCLPP_BUILD_TESTS OFF)
 add_subdirectory(${repo-mscclpp_SOURCE_DIR})
 target_link_libraries(common_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda cublas cublasLt mscclpp_static)
 
+# flash-attention
 target_compile_definitions(common_ops PRIVATE
     FLASHATTENTION_DISABLE_BACKWARD
     FLASHATTENTION_DISABLE_DROPOUT
diff --git a/sgl-kernel/README.md b/sgl-kernel/README.md
index 72491433a..c81a2af0b 100644
--- a/sgl-kernel/README.md
+++ b/sgl-kernel/README.md
@@ -5,6 +5,11 @@
 [![PyPI](https://img.shields.io/pypi/v/sgl-kernel)](https://pypi.org/project/sgl-kernel)
 
 ## Installation
+For CUDA 12.1 and above:
+
+```bash
+pip3 install sgl-kernel
+```
 
 For CUDA 11.8:
 
@@ -12,11 +17,6 @@ For CUDA 11.8:
 pip3 install sgl-kernel -i https://docs.sglang.ai/whl/cu118
 ```
 
-For CUDA 12.1 or CUDA 12.4:
-
-```bash
-pip3 install sgl-kernel
-```
 ## Build from source
 
 Development build:
diff --git a/test/srt/test_ascend_mla_w8a8int8.py b/test/srt/ascend/test_ascend_mla_w8a8int8.py
similarity index 100%
rename from test/srt/test_ascend_mla_w8a8int8.py
rename to test/srt/ascend/test_ascend_mla_w8a8int8.py
diff --git a/test/srt/test_ascend_tp1_bf16.py b/test/srt/ascend/test_ascend_tp1_bf16.py
similarity index 100%
rename from test/srt/test_ascend_tp1_bf16.py
rename to test/srt/ascend/test_ascend_tp1_bf16.py
diff --git a/test/srt/test_ascend_tp2_bf16.py b/test/srt/ascend/test_ascend_tp2_bf16.py
similarity index 100%
rename from test/srt/test_ascend_tp2_bf16.py
rename to test/srt/ascend/test_ascend_tp2_bf16.py
diff --git a/test/srt/test_deepep_internode.py b/test/srt/ep/test_deepep_internode.py
similarity index 100%
rename from test/srt/test_deepep_internode.py
rename to test/srt/ep/test_deepep_internode.py
diff --git a/test/srt/test_deepep_intranode.py b/test/srt/ep/test_deepep_intranode.py
similarity index 100%
rename from test/srt/test_deepep_intranode.py
rename to test/srt/ep/test_deepep_intranode.py
diff --git a/test/srt/test_deepep_large.py b/test/srt/ep/test_deepep_large.py
similarity index 100%
rename from test/srt/test_deepep_large.py
rename to test/srt/ep/test_deepep_large.py
diff --git a/test/srt/test_deepep_low_latency.py b/test/srt/ep/test_deepep_low_latency.py
similarity index 100%
rename from test/srt/test_deepep_low_latency.py
rename to test/srt/ep/test_deepep_low_latency.py
diff --git a/test/srt/test_deepep_small.py b/test/srt/ep/test_deepep_small.py
similarity index 100%
rename from test/srt/test_deepep_small.py
rename to test/srt/ep/test_deepep_small.py
diff --git a/test/srt/test_eplb.py b/test/srt/ep/test_eplb.py
similarity index 100%
rename from test/srt/test_eplb.py
rename to test/srt/ep/test_eplb.py
diff --git a/test/srt/test_hybrid_dp_ep_tp_mtp.py b/test/srt/ep/test_hybrid_dp_ep_tp_mtp.py
similarity index 100%
rename from test/srt/test_hybrid_dp_ep_tp_mtp.py
rename to test/srt/ep/test_hybrid_dp_ep_tp_mtp.py
diff --git a/test/srt/test_moe_deepep.py b/test/srt/ep/test_moe_deepep.py
similarity index 100%
rename from test/srt/test_moe_deepep.py
rename to test/srt/ep/test_moe_deepep.py
diff --git a/test/srt/test_moe_deepep_eval_accuracy_large.py b/test/srt/ep/test_moe_deepep_eval_accuracy_large.py
similarity index 100%
rename from test/srt/test_moe_deepep_eval_accuracy_large.py
rename to test/srt/ep/test_moe_deepep_eval_accuracy_large.py
diff --git a/test/srt/test_moe_ep.py b/test/srt/ep/test_moe_ep.py
similarity index 100%
rename from test/srt/test_moe_ep.py
rename to test/srt/ep/test_moe_ep.py
diff --git a/test/srt/test_hicache.py b/test/srt/hicache/test_hicache.py
similarity index 100%
rename from test/srt/test_hicache.py
rename to test/srt/hicache/test_hicache.py
diff --git a/test/srt/test_hicache_mla.py b/test/srt/hicache/test_hicache_mla.py
similarity index 100%
rename from test/srt/test_hicache_mla.py
rename to test/srt/hicache/test_hicache_mla.py
diff --git a/test/srt/test_hicache_page.py b/test/srt/hicache/test_hicache_page.py
similarity index 100%
rename from test/srt/test_hicache_page.py
rename to test/srt/hicache/test_hicache_page.py
diff --git a/test/srt/test_hicache_storage.py b/test/srt/hicache/test_hicache_storage.py
similarity index 100%
rename from test/srt/test_hicache_storage.py
rename to test/srt/hicache/test_hicache_storage.py
diff --git a/test/srt/models/lora/test_lora.py b/test/srt/lora/test_lora.py
similarity index 100%
rename from test/srt/models/lora/test_lora.py
rename to test/srt/lora/test_lora.py
diff --git a/test/srt/models/lora/test_lora_backend.py b/test/srt/lora/test_lora_backend.py
similarity index 100%
rename from test/srt/models/lora/test_lora_backend.py
rename to test/srt/lora/test_lora_backend.py
diff --git a/test/srt/models/lora/test_lora_cuda_graph.py b/test/srt/lora/test_lora_cuda_graph.py
similarity index 100%
rename from test/srt/models/lora/test_lora_cuda_graph.py
rename to test/srt/lora/test_lora_cuda_graph.py
diff --git a/test/srt/models/lora/test_lora_eviction.py b/test/srt/lora/test_lora_eviction.py
similarity index 100%
rename from test/srt/models/lora/test_lora_eviction.py
rename to test/srt/lora/test_lora_eviction.py
diff --git a/test/srt/models/lora/test_lora_qwen3.py b/test/srt/lora/test_lora_qwen3.py
similarity index 100%
rename from test/srt/models/lora/test_lora_qwen3.py
rename to test/srt/lora/test_lora_qwen3.py
diff --git a/test/srt/models/lora/test_lora_tp.py b/test/srt/lora/test_lora_tp.py
similarity index 100%
rename from test/srt/models/lora/test_lora_tp.py
rename to test/srt/lora/test_lora_tp.py
diff --git a/test/srt/models/lora/test_lora_update.py b/test/srt/lora/test_lora_update.py
similarity index 100%
rename from test/srt/models/lora/test_lora_update.py
rename to test/srt/lora/test_lora_update.py
diff --git a/test/srt/models/lora/test_multi_lora_backend.py b/test/srt/lora/test_multi_lora_backend.py
similarity index 100%
rename from test/srt/models/lora/test_multi_lora_backend.py
rename to test/srt/lora/test_multi_lora_backend.py
diff --git a/test/srt/models/lora/utils.py b/test/srt/lora/utils.py
similarity index 100%
rename from test/srt/models/lora/utils.py
rename to test/srt/lora/utils.py
diff --git a/test/srt/test_awq.py b/test/srt/quant/test_awq.py
similarity index 100%
rename from test/srt/test_awq.py
rename to test/srt/quant/test_awq.py
diff --git a/test/srt/test_awq_dequant.py b/test/srt/quant/test_awq_dequant.py
similarity index 100%
rename from test/srt/test_awq_dequant.py
rename to test/srt/quant/test_awq_dequant.py
diff --git a/test/srt/test_block_int8.py b/test/srt/quant/test_block_int8.py
similarity index 100%
rename from test/srt/test_block_int8.py
rename to test/srt/quant/test_block_int8.py
diff --git a/test/srt/test_fp8_kernel.py b/test/srt/quant/test_fp8_kernel.py
similarity index 100%
rename from test/srt/test_fp8_kernel.py
rename to test/srt/quant/test_fp8_kernel.py
diff --git a/test/srt/test_fp8_kvcache.py b/test/srt/quant/test_fp8_kvcache.py
similarity index 100%
rename from test/srt/test_fp8_kvcache.py
rename to test/srt/quant/test_fp8_kvcache.py
diff --git a/test/srt/test_int8_kernel.py b/test/srt/quant/test_int8_kernel.py
similarity index 100%
rename from test/srt/test_int8_kernel.py
rename to test/srt/quant/test_int8_kernel.py
diff --git a/test/srt/test_w8a8_quantization.py b/test/srt/quant/test_w8a8_quantization.py
similarity index 97%
rename from test/srt/test_w8a8_quantization.py
rename to test/srt/quant/test_w8a8_quantization.py
index 3d4ce1afa..acb7f5c7d 100644
--- a/test/srt/test_w8a8_quantization.py
+++ b/test/srt/quant/test_w8a8_quantization.py
@@ -43,7 +43,7 @@ class TestW8A8(CustomTestCase):
         metrics = run_eval(args)
         print(metrics)
 
-        self.assertGreater(metrics["accuracy"], 0.7)
+        self.assertGreater(metrics["accuracy"], 0.69)
 
     def run_decode(self, max_new_tokens):
         response = requests.post(
diff --git a/test/srt/test_update_weights_from_disk.py b/test/srt/rl/test_update_weights_from_disk.py
similarity index 100%
rename from test/srt/test_update_weights_from_disk.py
rename to test/srt/rl/test_update_weights_from_disk.py
diff --git a/test/srt/test_update_weights_from_distributed.py b/test/srt/rl/test_update_weights_from_distributed.py
similarity index 100%
rename from test/srt/test_update_weights_from_distributed.py
rename to test/srt/rl/test_update_weights_from_distributed.py
diff --git a/test/srt/test_update_weights_from_tensor.py b/test/srt/rl/test_update_weights_from_tensor.py
similarity index 100%
rename from test/srt/test_update_weights_from_tensor.py
rename to test/srt/rl/test_update_weights_from_tensor.py
diff --git a/test/srt/test_verl_engine_2_gpu.py b/test/srt/rl/test_verl_engine_2_gpu.py
similarity index 100%
rename from test/srt/test_verl_engine_2_gpu.py
rename to test/srt/rl/test_verl_engine_2_gpu.py
diff --git a/test/srt/test_verl_engine_4_gpu.py b/test/srt/rl/test_verl_engine_4_gpu.py
similarity index 100%
rename from test/srt/test_verl_engine_4_gpu.py
rename to test/srt/rl/test_verl_engine_4_gpu.py
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index fa265e698..0eab9537e 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -13,13 +13,16 @@ class TestFile:
 
 suites = {
     "per-commit": [
-        TestFile("models/lora/test_lora.py", 200),
-        TestFile("models/lora/test_lora_eviction.py", 200),
-        TestFile("models/lora/test_lora_backend.py", 99),
-        TestFile("models/lora/test_multi_lora_backend.py", 60),
-        TestFile("models/lora/test_lora_cuda_graph.py", 250),
-        TestFile("models/lora/test_lora_update.py", 800),
-        TestFile("models/lora/test_lora_qwen3.py", 97),
+        TestFile("hicache/test_hicache.py", 116),
+        TestFile("hicache/test_hicache_mla.py", 127),
+        TestFile("hicache/test_hicache_storage.py", 127),
+        TestFile("lora/test_lora.py", 200),
+        TestFile("lora/test_lora_eviction.py", 200),
+        TestFile("lora/test_lora_backend.py", 99),
+        TestFile("lora/test_multi_lora_backend.py", 60),
+        TestFile("lora/test_lora_cuda_graph.py", 250),
+        TestFile("lora/test_lora_update.py", 800),
+        TestFile("lora/test_lora_qwen3.py", 97),
         TestFile("models/test_embedding_models.py", 73),
         # TestFile("models/test_clip_models.py", 52),
         TestFile("models/test_encoder_embedding_models.py", 100),
@@ -50,8 +53,13 @@ suites = {
         TestFile("openai_server/validation/test_matched_stop.py", 60),
         TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
         TestFile("openai_server/validation/test_request_length_validation.py", 31),
+        TestFile("quant/test_block_int8.py", 22),
+        TestFile("quant/test_fp8_kernel.py", 8),
+        TestFile("quant/test_int8_kernel.py", 8),
+        TestFile("quant/test_w8a8_quantization.py", 46),
+        TestFile("rl/test_update_weights_from_disk.py", 114),
+        TestFile("rl/test_update_weights_from_tensor.py", 48),
         TestFile("test_abort.py", 51),
-        TestFile("test_block_int8.py", 22),
         TestFile("test_create_kvindices.py", 2),
         TestFile("test_chunked_prefill.py", 313),
         TestFile("test_eagle_infer_a.py", 370),
@@ -60,15 +68,11 @@ suites = {
         TestFile("test_eval_fp8_accuracy.py", 303),
         TestFile("test_fa3.py", 376),
         # TestFile("test_flashmla.py", 352),
-        TestFile("test_fp8_kernel.py", 8),
         TestFile("test_function_call_parser.py", 10),
         TestFile("test_fused_moe.py", 30),
         TestFile("test_gpt_oss_1gpu.py", 600),
-        TestFile("test_hicache.py", 116),
-        TestFile("test_hicache_mla.py", 127),
-        TestFile("test_hicache_storage.py", 127),
         TestFile("test_hidden_states.py", 55),
-        TestFile("test_int8_kernel.py", 8),
+        TestFile("test_hybrid_attn_backend.py", 100),
         TestFile("test_input_embeddings.py", 38),
         TestFile("test_io_struct.py", 8),
         TestFile("test_jinja_template_utils.py", 1),
@@ -85,6 +89,7 @@ suites = {
         TestFile("test_pytorch_sampling_backend.py", 66),
         TestFile("test_radix_attention.py", 105),
         TestFile("test_regex_constrained.py", 64),
+        TestFile("test_reasoning_parser.py", 5),
         TestFile("test_retract_decode.py", 54),
         TestFile("test_request_queue_validation.py", 30),
         TestFile("test_server_args.py", 1),
@@ -100,23 +105,18 @@ suites = {
         TestFile("test_triton_attention_backend.py", 150),
         TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
         TestFile("test_triton_sliding_window.py", 250),
-        TestFile("test_update_weights_from_disk.py", 114),
-        TestFile("test_update_weights_from_tensor.py", 48),
         TestFile("test_utils_update_weights.py", 48),
         TestFile("test_vision_chunked_prefill.py", 175),
         TestFile("test_vlm_input_format.py", 300),
         TestFile("test_vision_openai_server_a.py", 989),
         TestFile("test_vision_openai_server_b.py", 620),
-        TestFile("test_w8a8_quantization.py", 46),
-        TestFile("test_reasoning_parser.py", 5),
-        TestFile("test_hybrid_attn_backend.py", 100),
     ],
     "per-commit-2-gpu": [
-        TestFile("models/lora/test_lora_tp.py", 116),
+        TestFile("lora/test_lora_tp.py", 116),
+        TestFile("rl/test_update_weights_from_distributed.py", 103),
         TestFile("test_data_parallelism.py", 73),
         TestFile("test_dp_attention.py", 277),
         TestFile("test_patch_torch.py", 19),
-        TestFile("test_update_weights_from_distributed.py", 103),
         TestFile("test_release_memory_occupation.py", 127),
     ],
     "per-commit-4-gpu": [
@@ -127,7 +127,7 @@ suites = {
     ],
     "per-commit-8-gpu": [
         # Disabled because it hangs on the CI.
-        # TestFile("test_moe_ep.py", 181),
+        # TestFile("ep/test_moe_ep.py", 181),
         TestFile("test_disaggregation.py", 499),
         TestFile("test_disaggregation_different_tp.py", 155),
         TestFile("test_full_deepseek_v3.py", 333),
@@ -136,16 +136,16 @@ suites = {
         # add more here
     ],
     "per-commit-4-gpu-deepep": [
-        TestFile("test_deepep_small.py", 531),
+        TestFile("ep/test_deepep_small.py", 531),
     ],
     "per-commit-8-gpu-deepep": [
-        TestFile("test_deepep_large.py", 338),
+        TestFile("ep/test_deepep_large.py", 338),
     ],
     "nightly": [
         TestFile("test_nightly_gsm8k_eval.py"),
     ],
     "vllm_dependency_test": [
-        TestFile("test_awq.py", 163),
+        TestFile("quant/test_awq.py", 163),
         TestFile("test_bnb.py", 5),
         TestFile("test_gguf.py", 96),
         TestFile("test_gptqmodel_dynamic.py", 102),
@@ -156,13 +156,9 @@ suites = {
 # Add AMD tests
 suite_amd = {
     "per-commit-amd": [
-        TestFile("models/lora/test_lora_backend.py", 99),
-        TestFile("models/lora/test_multi_lora_backend.py", 60),
-        TestFile("models/lora/test_lora_cuda_graph.py", 250),
-        TestFile("test_mla.py", 242),
-        TestFile("test_mla_deepseek_v3.py", 221),
-        TestFile("test_torch_compile.py", 76),
-        TestFile("test_torch_compile_moe.py", 172),
+        TestFile("lora/test_lora_backend.py", 99),
+        TestFile("lora/test_multi_lora_backend.py", 60),
+        TestFile("lora/test_lora_cuda_graph.py", 250),
         TestFile("models/test_qwen_models.py", 82),
         TestFile("models/test_reward_models.py", 132),
         TestFile("openai_server/basic/test_openai_embedding.py", 141),
@@ -170,14 +166,18 @@ suite_amd = {
         TestFile("openai_server/features/test_reasoning_content.py", 89),
         TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
         TestFile("openai_server/validation/test_request_length_validation.py", 31),
+        TestFile("quant/test_block_int8.py", 22),
+        TestFile("quant/test_awq_dequant.py", 2),
+        TestFile("rl/test_update_weights_from_disk.py", 114),
         TestFile("test_abort.py", 51),
-        TestFile("test_block_int8.py", 22),
         TestFile("test_create_kvindices.py", 2),
         TestFile("test_chunked_prefill.py", 313),
         TestFile("test_eval_fp8_accuracy.py", 303),
         TestFile("test_function_call_parser.py", 10),
         TestFile("test_fused_moe.py", 30),
         TestFile("test_input_embeddings.py", 38),
+        TestFile("test_mla.py", 242),
+        TestFile("test_mla_deepseek_v3.py", 221),
         TestFile("test_metrics.py", 32),
         TestFile("test_no_chunked_prefill.py", 108),
         # TestFile("test_no_overlap_scheduler.py", 234), # Disabled temporarily and track in #7703
@@ -186,22 +186,21 @@ suite_amd = {
         TestFile("test_pytorch_sampling_backend.py", 66),
         TestFile("test_radix_attention.py", 105),
         TestFile("test_retract_decode.py", 54),
-        TestFile("test_server_args.py", 1),
-        TestFile("test_skip_tokenizer_init.py", 117),
-        TestFile("test_torch_native_attention_backend.py", 123),
-        TestFile("test_triton_attention_backend.py", 150),
-        TestFile("test_update_weights_from_disk.py", 114),
-        TestFile("test_vertex_endpoint.py", 31),
-        # TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701
         TestFile("test_reasoning_parser.py", 5),
         TestFile("test_rope_rocm.py", 3),
-        TestFile("test_awq_dequant.py", 2),
+        TestFile("test_server_args.py", 1),
+        TestFile("test_skip_tokenizer_init.py", 117),
+        TestFile("test_torch_compile.py", 76),
+        TestFile("test_torch_compile_moe.py", 172),
+        TestFile("test_torch_native_attention_backend.py", 123),
+        TestFile("test_triton_attention_backend.py", 150),
+        # TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701
     ],
     "per-commit-2-gpu-amd": [
-        TestFile("models/lora/test_lora_tp.py", 116),
+        TestFile("lora/test_lora_tp.py", 116),
+        TestFile("rl/test_update_weights_from_distributed.py", 103),
         TestFile("test_data_parallelism.py", 73),
         TestFile("test_patch_torch.py", 19),
-        TestFile("test_update_weights_from_distributed.py", 103),
     ],
     "per-commit-4-gpu-amd": [
         TestFile("test_pp_single_node.py", 150),
@@ -236,13 +235,13 @@ suite_xeon = {
 # Add Ascend NPU tests
 suite_ascend = {
     "per-commit-1-ascend-npu": [
-        TestFile("test_ascend_tp1_bf16.py", 400),
+        TestFile("ascend/test_ascend_tp1_bf16.py", 400),
     ],
     "per-commit-2-ascend-npu": [
-        TestFile("test_ascend_tp2_bf16.py", 400),
+        TestFile("ascend/test_ascend_tp2_bf16.py", 400),
     ],
     "per-commit-4-ascend-npu": [
-        TestFile("test_ascend_mla_w8a8int8.py", 400),
+        TestFile("ascend/test_ascend_mla_w8a8int8.py", 400),
     ],
 }
 
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index 30e1fab50..608595b95 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -56,7 +56,10 @@ class TestBenchServing(CustomTestCase):
                 f"### test_offline_throughput_non_stream_small_batch_size\n"
                 f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
-            self.assertGreater(res["output_throughput"], 1045)
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 1000)
+            else:
+                self.assertGreater(res["output_throughput"], 1050)
 
     def test_offline_throughput_without_radix_cache(self):
         res = run_bench_serving(
diff --git a/test/srt/test_intel_amx_attention_backend.py b/test/srt/test_intel_amx_attention_backend.py
index 4c2bc130e..0b49c8af7 100644
--- a/test/srt/test_intel_amx_attention_backend.py
+++ b/test/srt/test_intel_amx_attention_backend.py
@@ -70,7 +70,7 @@ class TestIntelAMXAttnBackend(CustomTestCase):
             )
 
             metrics = run_eval(args)
-            self.assertGreater(metrics["score"], 0.5)
+            self.assertGreater(metrics["score"], 0.45)
         finally:
             kill_process_tree(process.pid)