diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 3ba24d979..51ea12ea5 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -291,7 +291,7 @@ jobs:
           bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
 
       - name: Run CustomAllReduce test
-        timeout-minutes: 10
+        timeout-minutes: 20
         run: |
           bash scripts/amd_ci_exec.sh -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m unittest test_custom_allreduce.TestCustomAllReduce
 
diff --git a/python/sglang/srt/layers/attention/aiter_backend.py b/python/sglang/srt/layers/attention/aiter_backend.py
index 7e6b9936e..cea097cb0 100644
--- a/python/sglang/srt/layers/attention/aiter_backend.py
+++ b/python/sglang/srt/layers/attention/aiter_backend.py
@@ -720,11 +720,6 @@ class AiterIndicesUpdaterPrefill:
         self.req_to_token = model_runner.req_to_token_pool.req_to_token
         self.update = self.update_single_wrapper
 
-        # get the last index of the pool
-        self.pool_size = (
-            model_runner.token_to_kv_pool.size + model_runner.token_to_kv_pool.page_size
-        ) - 1
-
         self.kv_indices = None
         self.max_q_len = 0
         self.max_kv_len = 0
@@ -769,9 +764,8 @@ class AiterIndicesUpdaterPrefill:
             # but the 0 location will be made nan (noqa) in cuda graph capture mode
             # this will cause the output tensor value becomes nan
             # WA is to assure that last index of pool not changed
-            kv_indices = torch.full(
-                (paged_kernel_lens_sum + 128,),
-                self.pool_size,
+            kv_indices = torch.empty(
+                paged_kernel_lens_sum + 256,
                 dtype=torch.int32,
                 device=req_pool_indices.device,
             )
@@ -785,6 +779,9 @@ class AiterIndicesUpdaterPrefill:
                 self.req_to_token.shape[1],
             )
 
+            token_num = kv_indptr[-1]
+            kv_indices[token_num:] = kv_indices[0]
+
             self.max_kv_len = torch.max(paged_kernel_lens).item()
 
             extend_lens = seq_lens - prefix_lens
diff --git a/scripts/amd_ci_start_container.sh b/scripts/amd_ci_start_container.sh
index 239fd3770..9ce33549b 100755
--- a/scripts/amd_ci_start_container.sh
+++ b/scripts/amd_ci_start_container.sh
@@ -124,6 +124,7 @@ echo "Starting container: ci_sglang"
 docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
   -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
   --ipc=host --group-add video \
+  --shm-size 32g \
   --cap-add=SYS_PTRACE \
   -e HF_TOKEN="${HF_TOKEN:-}" \
   --security-opt seccomp=unconfined \