diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index b29bf63f4..2378695e2 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -324,33 +324,33 @@ jobs:
           cd test/srt
           python3 run_suite.py --suite per-commit-4-gpu-deepep
 
-  # unit-test-deepep-8-gpu:
-  #   if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
-  #       github.event.pull_request.draft == false
-  #   runs-on: 8-gpu-runner
-  #   needs: [
-  #     unit-test-deepep-4-gpu,
-  #   ]
-  #   steps:
-  #     - name: Checkout code
-  #       uses: actions/checkout@v4
-  #
-  #     - name: Install dependencies
-  #       run: |
-  #         bash scripts/ci_install_deepep.sh
-  #
-  #     - name: Run test
-  #       timeout-minutes: 20
-  #       run: |
-  #         cd test/srt
-  #         python3 run_suite.py --suite per-commit-8-gpu-deepep
+  unit-test-deepep-8-gpu:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    runs-on: 8-gpu-runner
+    needs: [
+      unit-test-deepep-4-gpu,
+    ]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci_install_deepep.sh
+
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-8-gpu-deepep
 
   finish:
     if: always()
     needs: [
       unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-4-gpu,
       unit-test-backend-8-gpu, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
-      accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, # unit-test-deepep-8-gpu,
+      accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
     ]
     runs-on: ubuntu-latest
     steps:
diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh
index aa4dab097..e743bddaf 100755
--- a/scripts/ci_install_deepep.sh
+++ b/scripts/ci_install_deepep.sh
@@ -4,30 +4,30 @@ set -euxo pipefail
 
 bash scripts/ci_install_dependency.sh
 
-if python3 -c "import deep_ep" >/dev/null 2>&1; then
-    echo "deep_ep is already installed or importable. Skipping installation."
-    exit 0
-fi
-
 export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
 export NVSHMEM_DIR=/opt/nvshmem/install
 export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH"
 export PATH="${NVSHMEM_DIR}/bin:$PATH"
 export CUDA_HOME=/usr/local/cuda
 
+if python3 -c "import deep_ep" >/dev/null 2>&1; then
+    echo "deep_ep is already installed or importable. Skipping installation."
+    exit 0
+fi
+
 # Install system dependencies
 apt install -y curl wget git sudo libibverbs-dev rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 build-essential cmake
 
 # Install GDRCopy
 rm -rf /opt/gdrcopy && mkdir -p /opt/gdrcopy
-mkdir -p /opt/nvshmem
+rm -rf /opt/nvshmem && mkdir -p /opt/nvshmem
 cd /opt/gdrcopy
 git clone https://github.com/NVIDIA/gdrcopy.git .
 git checkout v2.4.4
 apt update
 apt install -y nvidia-dkms-535
 apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms
-apt install -y check libsubunit0 libsubunit-dev
+apt install -y check libsubunit0 libsubunit-dev python3-venv
 cd packages
 CUDA=/usr/local/cuda ./build-deb-packages.sh
 dpkg -i gdrdrv-dkms_*.deb
@@ -40,16 +40,11 @@ if [ ! -e "/usr/lib/x86_64-linux-gnu/libmlx5.so" ]; then
 fi
 apt-get update && apt-get install -y libfabric-dev
 
-# Clone DeepEP
-rm -rf /root/.cache/deepep && git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep && cd /root/.cache/deepep && git checkout eef7ab50fa5cf0ab1dd3fce4c6493c90bdf290ac
-
 # Install NVSHMEM
 cd /opt/nvshmem
-wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
-tar -xf nvshmem_src_3.2.5-1.txz
-rm -rf nvshmem && mv nvshmem_src nvshmem
-cd nvshmem
-git apply /root/.cache/deepep/third-party/nvshmem.patch
+wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+mv nvshmem_src nvshmem && cd nvshmem
 NVSHMEM_SHMEM_SUPPORT=0 \
 NVSHMEM_UCX_SUPPORT=0 \
 NVSHMEM_USE_NCCL=0 \
@@ -63,12 +58,10 @@ cd build
 make -j$(nproc) install
 
 # Install DeepEP
+rm -rf /root/.cache/deepep && git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep && cd /root/.cache/deepep && git checkout b6ce310bb0b75079682d09bc2ebc063a074fbd58
 cd /root/.cache/deepep && python3 setup.py install
 
 # Verify configuration
-echo "=== NCCL Configuration ==="
-nvidia-smi topo -m
-nvidia-smi nvlink -s
 echo "=== Verify GDRCOPY ==="
 gdrcopy_copybw
 echo "=== Verify NVSHMEM ==="
diff --git a/test/srt/test_deepep_large.py b/test/srt/test_deepep_large.py
index 8afb2896f..703eb7789 100644
--- a/test/srt/test_deepep_large.py
+++ b/test/srt/test_deepep_large.py
@@ -45,6 +45,7 @@ class TestDeepseek(CustomTestCase):
                 "256",
                 "--max-running-requests",
                 "2048",
+                "--disable-radix-cache",
             ],
         )
 
@@ -54,10 +55,10 @@ class TestDeepseek(CustomTestCase):
 
     def test_gsm8k(self):
         args = SimpleNamespace(
-            num_shots=8,
+            num_shots=5,
             data_path=None,
-            num_questions=1250,
-            parallel=1250,
+            num_questions=1200,
+            parallel=1200,
             max_new_tokens=512,
             host="http://127.0.0.1",
             port=int(self.base_url.split(":")[-1]),
@@ -65,7 +66,7 @@ class TestDeepseek(CustomTestCase):
         metrics = run_eval_few_shot_gsm8k(args)
         print(f"Eval accuracy of GSM8K: {metrics=}")
 
-        self.assertGreater(metrics["accuracy"], 0.93)
+        self.assertGreater(metrics["accuracy"], 0.92)
 
 
 class TestDeepseekMTP(CustomTestCase):
@@ -107,6 +108,7 @@ class TestDeepseekMTP(CustomTestCase):
                 "1",
                 "--speculative-num-draft-tokens",
                 "2",
+                "--disable-radix-cache",
             ],
         )
 
@@ -116,10 +118,10 @@ class TestDeepseekMTP(CustomTestCase):
 
     def test_gsm8k(self):
         args = SimpleNamespace(
-            num_shots=8,
+            num_shots=5,
             data_path=None,
-            num_questions=1250,
-            parallel=1250,
+            num_questions=1200,
+            parallel=1200,
             max_new_tokens=512,
             host="http://127.0.0.1",
             port=int(self.base_url.split(":")[-1]),
@@ -127,7 +129,7 @@ class TestDeepseekMTP(CustomTestCase):
         metrics = run_eval_few_shot_gsm8k(args)
         print(f"Eval accuracy of GSM8K: {metrics=}")
 
-        self.assertGreater(metrics["accuracy"], 0.93)
+        self.assertGreater(metrics["accuracy"], 0.92)
 
         server_info = requests.get(self.base_url + "/get_server_info")
         avg_spec_accept_length = server_info.json()["internal_states"][0][
@@ -138,7 +140,7 @@ class TestDeepseekMTP(CustomTestCase):
             f"accuracy={metrics['accuracy']=:.3f}\n"
             f"{avg_spec_accept_length=:.3f}\n"
         )
-        self.assertGreater(avg_spec_accept_length, 1.9)
+        self.assertGreater(avg_spec_accept_length, 1.85)
 
 
 if __name__ == "__main__":
diff --git a/test/srt/test_deepep_small.py b/test/srt/test_deepep_small.py
index 9724ae735..e26017ade 100644
--- a/test/srt/test_deepep_small.py
+++ b/test/srt/test_deepep_small.py
@@ -36,6 +36,8 @@ class TestPureDP(CustomTestCase):
                 "128",
                 "--max-running-requests",
                 "128",
+                "--mem-fraction-static",
+                "0.5",
             ],
         )
 
@@ -56,7 +58,7 @@ class TestPureDP(CustomTestCase):
         metrics = run_eval_few_shot_gsm8k(args)
         print(metrics)
 
-        self.assertGreater(metrics["accuracy"], 0.62)
+        self.assertGreater(metrics["accuracy"], 0.60)
 
 
 class TestHybridDPTP(CustomTestCase):
@@ -100,7 +102,7 @@ class TestHybridDPTP(CustomTestCase):
         metrics = run_eval_few_shot_gsm8k(args)
         print(metrics)
 
-        self.assertGreater(metrics["accuracy"], 0.62)
+        self.assertGreater(metrics["accuracy"], 0.60)
 
 
 class TestTP(CustomTestCase):
@@ -141,10 +143,10 @@ class TestTP(CustomTestCase):
         metrics = run_eval_few_shot_gsm8k(args)
         print(metrics)
 
-        self.assertGreater(metrics["accuracy"], 0.62)
+        self.assertGreater(metrics["accuracy"], 0.60)
 
 
-# @unittest.skip("covered in test_deepep_large.py")
+@unittest.skip("covered in test_deepep_large.py")
 class TestNoGatherdBuffer(CustomTestCase):
     @classmethod
     def setUpClass(cls):
@@ -189,7 +191,7 @@ class TestNoGatherdBuffer(CustomTestCase):
         metrics = run_eval_few_shot_gsm8k(args)
         print(metrics)
 
-        self.assertGreater(metrics["accuracy"], 0.62)
+        self.assertGreater(metrics["accuracy"], 0.60)
 
 
 class TestTBO(CustomTestCase):
@@ -236,10 +238,10 @@ class TestTBO(CustomTestCase):
         metrics = run_eval_few_shot_gsm8k(args)
         print(metrics)
 
-        self.assertGreater(metrics["accuracy"], 0.62)
+        self.assertGreater(metrics["accuracy"], 0.60)
 
 
-# @unittest.skip("covered in TestMTPWithTBO")
+@unittest.skip("covered in TestMTPWithTBO")
 class TestMTP(CustomTestCase):
     @classmethod
     def setUpClass(cls):
@@ -280,8 +282,6 @@ class TestMTP(CustomTestCase):
         kill_process_tree(cls.process.pid)
 
     def test_gsm8k(self):
-        requests.get(self.base_url + "/flush_cache")
-
         args = SimpleNamespace(
             num_shots=5,
             data_path=None,
@@ -352,8 +352,6 @@ class TestMTPWithTBO(CustomTestCase):
         kill_process_tree(cls.process.pid)
 
     def test_gsm8k(self):
-        requests.get(self.base_url + "/flush_cache")
-
         args = SimpleNamespace(
             num_shots=5,
             data_path=None,