diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index 07e06b9f4..a31afabe5 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -772,7 +772,7 @@ class SchedulerDisaggregationDecodeMixin: self.last_batch_in_queue = last_batch_in_queue def _prepare_idle_batch_and_run(self: Scheduler, batch, delay_process=False): - batch, _ = self.prepare_mlp_sync_batch(batch) + batch = self.prepare_mlp_sync_batch(batch) result = None if batch: result = self.run_batch(batch) diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py index 231b648cc..bf61644cf 100644 --- a/python/sglang/srt/disaggregation/prefill.py +++ b/python/sglang/srt/disaggregation/prefill.py @@ -276,7 +276,7 @@ class SchedulerDisaggregationPrefillMixin: batch = self.get_new_batch_prefill() if require_mlp_sync(self.server_args): - batch, _ = self.prepare_mlp_sync_batch(batch) + batch = self.prepare_mlp_sync_batch(batch) self.cur_batch = batch if batch: @@ -310,7 +310,7 @@ class SchedulerDisaggregationPrefillMixin: batch = self.get_new_batch_prefill() if require_mlp_sync(self.server_args): - batch, _ = self.prepare_mlp_sync_batch(batch) + batch = self.prepare_mlp_sync_batch(batch) self.cur_batch = batch if batch: result = self.run_batch(batch) diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index 10319800b..848c4581c 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -42,7 +42,7 @@ from sglang.srt.layers.quantization.fp8_kernel import ( ) from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz from sglang.srt.managers.schedule_batch import global_server_args_dict -from sglang.srt.model_executor.forward_batch_info import ForwardMode +from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.utils 
import ( DeepEPMode, ceil_div, @@ -1178,12 +1178,14 @@ class DeepEPMoE(EPMoE): masked_m: torch.Tensor, expected_m: int, num_recv_tokens_per_expert: List[int], - forward_mode: ForwardMode, + forward_batch: ForwardBatch, ): if _use_aiter: # in forward_aiter, we skip token permutation and unpermutation, which have been fused inside aiter kernel return self.forward_aiter(hidden_states, topk_idx, topk_weights) - resolved_deepep_mode = self.deepep_mode.resolve(forward_mode) + resolved_deepep_mode = self.deepep_mode.resolve( + forward_batch.is_extend_in_batch + ) if resolved_deepep_mode == DeepEPMode.normal: if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM: return self.forward_deepgemm_contiguous( diff --git a/python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py b/python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py index a3df8432c..5c0cd3ec9 100644 --- a/python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +++ b/python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py @@ -34,7 +34,7 @@ from sglang.srt.layers.moe.ep_moe.kernels import ( deepep_post_reorder_triton_kernel, deepep_run_moe_deep_preprocess, ) -from sglang.srt.model_executor.forward_batch_info import ForwardMode +from sglang.srt.model_executor.forward_batch_info import ForwardBatch _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and is_hip() @@ -686,21 +686,21 @@ class DeepEPDispatcher: hidden_states: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor, - forward_mode: ForwardMode = None, + forward_batch: ForwardBatch, ): self._update_stage(_Stage.INITIAL, _Stage.AFTER_DISPATCH_A) - inner_state = self._get_impl(forward_mode).dispatch_a( + inner_state = self._get_impl(forward_batch).dispatch_a( hidden_states=hidden_states, topk_idx=topk_idx, topk_weights=topk_weights, ) - self._dispatch_intermediate_state = forward_mode, inner_state + self._dispatch_intermediate_state = forward_batch, inner_state def dispatch_b(self): self._update_stage(_Stage.AFTER_DISPATCH_A, _Stage.AFTER_DISPATCH_B) - 
forward_mode, inner_state = self._dispatch_intermediate_state + forward_batch, inner_state = self._dispatch_intermediate_state del self._dispatch_intermediate_state - return self._get_impl(forward_mode).dispatch_b(*inner_state) + return self._get_impl(forward_batch).dispatch_b(*inner_state) def combine(self, *args, **kwargs) -> Tuple: self.combine_a(*args, **kwargs) @@ -712,24 +712,26 @@ class DeepEPDispatcher: hidden_states: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor, - forward_mode: ForwardMode, + forward_batch: ForwardBatch, ): self._update_stage(_Stage.AFTER_DISPATCH_B, _Stage.AFTER_COMBINE_A) - inner_state = self._get_impl(forward_mode).combine_a( + inner_state = self._get_impl(forward_batch).combine_a( hidden_states=hidden_states, topk_idx=topk_idx, topk_weights=topk_weights, ) - self._combine_intermediate_state = forward_mode, inner_state + self._combine_intermediate_state = forward_batch, inner_state def combine_b(self): self._update_stage(_Stage.AFTER_COMBINE_A, _Stage.INITIAL) - forward_mode, inner_state = self._combine_intermediate_state + forward_batch, inner_state = self._combine_intermediate_state del self._combine_intermediate_state - return self._get_impl(forward_mode).combine_b(*inner_state) + return self._get_impl(forward_batch).combine_b(*inner_state) - def _get_impl(self, forward_mode: ForwardMode) -> _DeepEPDispatcherImplBase: - resolved_deepep_mode = self.deepep_mode.resolve(forward_mode) + def _get_impl(self, forward_batch: ForwardBatch) -> _DeepEPDispatcherImplBase: + resolved_deepep_mode = self.deepep_mode.resolve( + forward_batch.is_extend_in_batch + ) if resolved_deepep_mode == DeepEPMode.normal: return self._normal_dispatcher elif resolved_deepep_mode == DeepEPMode.low_latency: diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 7d08b1510..7f6a641ef 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py 
@@ -840,6 +840,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): # For DP attention global_num_tokens: Optional[List[int]] = None global_num_tokens_for_logprob: Optional[List[int]] = None + is_extend_in_batch: bool = False can_run_dp_cuda_graph: bool = False - is_extend_in_batch: bool = False tbo_split_seq_index: Optional[int] = None @@ -1714,6 +1715,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): token_ids_logprobs=self.token_ids_logprobs, global_num_tokens=self.global_num_tokens, global_num_tokens_for_logprob=self.global_num_tokens_for_logprob, + is_extend_in_batch=self.is_extend_in_batch, can_run_dp_cuda_graph=self.can_run_dp_cuda_graph, tbo_split_seq_index=self.tbo_split_seq_index, global_forward_mode=self.global_forward_mode, @@ -1798,6 +1800,7 @@ class ModelWorkerBatch: # For DP attention global_num_tokens: Optional[List[int]] global_num_tokens_for_logprob: Optional[List[int]] + is_extend_in_batch: bool can_run_dp_cuda_graph: bool tbo_split_seq_index: Optional[int] global_forward_mode: Optional[ForwardMode] diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index a9ee8d392..635536a97 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1490,7 +1490,7 @@ class Scheduler( if need_dp_attn_preparation and not self.spec_algorithm.is_none(): # In speculative decoding, prefill batches and decode batches cannot be processed in the same DP attention group. # We prepare idle batches in advance to skip preparing decode batches when there are prefill batches in the group.
- new_batch, _ = self.prepare_mlp_sync_batch(new_batch) + new_batch = self.prepare_mlp_sync_batch(new_batch) need_dp_attn_preparation = new_batch is None if new_batch is not None: @@ -1506,7 +1506,7 @@ class Scheduler( # Handle DP attention if need_dp_attn_preparation: - ret, _ = self.prepare_mlp_sync_batch(ret) + ret = self.prepare_mlp_sync_batch(ret) return ret @@ -1923,8 +1923,7 @@ class Scheduler( if not disable_cuda_graph: local_batch.can_run_dp_cuda_graph = can_cuda_graph - # TODO(ch-wan): refactor: any(is_extend_in_batch) now is a part of local_batch. Remove it from here. - return local_batch, any(is_extend_in_batch) + return local_batch def get_idle_batch(self): idle_batch = ScheduleBatch.init_new( diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index 65dd8d428..940dbc68b 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -254,6 +254,7 @@ class ForwardBatch: dp_local_start_pos: Optional[torch.Tensor] = None # cached info at runtime dp_local_num_tokens: Optional[torch.Tensor] = None # cached info at runtime gathered_buffer: Optional[torch.Tensor] = None + is_extend_in_batch: bool = False can_run_dp_cuda_graph: bool = False global_forward_mode: Optional[ForwardMode] = None @@ -299,6 +300,7 @@ class ForwardBatch: return_logprob=batch.return_logprob, top_logprobs_nums=batch.top_logprobs_nums, token_ids_logprobs=batch.token_ids_logprobs, + is_extend_in_batch=batch.is_extend_in_batch, can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph, global_forward_mode=batch.global_forward_mode, lora_paths=batch.lora_paths, diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 1ebd403b6..f254a7b3e 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -558,7 +558,7 @@ class DeepseekV2MoE(nn.Module): hidden_states=hidden_states, 
topk_idx=topk_idx, topk_weights=topk_weights, - forward_mode=forward_mode, + forward_batch=forward_batch, ) final_hidden_states = self.experts( hidden_states=hidden_states, @@ -569,14 +569,14 @@ class DeepseekV2MoE(nn.Module): masked_m=masked_m, expected_m=expected_m, num_recv_tokens_per_expert=num_recv_tokens_per_expert, - forward_mode=forward_mode, + forward_batch=forward_batch, ) if self.ep_size > 1: final_hidden_states = self.deepep_dispatcher.combine( hidden_states=final_hidden_states, topk_idx=topk_idx, topk_weights=topk_weights, - forward_mode=forward_mode, + forward_batch=forward_batch, ) if shared_output is not None: @@ -651,7 +651,7 @@ class DeepseekV2MoE(nn.Module): hidden_states=state.hidden_states_mlp_input, topk_idx=state.pop("topk_idx_local"), topk_weights=state.pop("topk_weights_local"), - forward_mode=state.forward_batch.forward_mode, + forward_batch=state.forward_batch, tbo_subbatch_index=state.get("tbo_subbatch_index"), ) @@ -683,7 +683,7 @@ class DeepseekV2MoE(nn.Module): masked_m=state.pop("masked_m"), expected_m=state.pop("expected_m"), num_recv_tokens_per_expert=state.pop("num_recv_tokens_per_expert"), - forward_mode=state.forward_batch.forward_mode, + forward_batch=state.forward_batch, ) def op_combine_a(self, state): @@ -692,7 +692,7 @@ class DeepseekV2MoE(nn.Module): hidden_states=state.pop("hidden_states_experts_output"), topk_idx=state.pop("topk_idx_dispatched"), topk_weights=state.pop("topk_weights_dispatched"), - forward_mode=state.forward_batch.forward_mode, + forward_batch=state.forward_batch, tbo_subbatch_index=state.get("tbo_subbatch_index"), ) @@ -1881,7 +1881,7 @@ class DeepseekV2DecoderLayer(nn.Module): and hidden_states.shape[0] == 0 ): state.hidden_states_mlp_output = self.mlp( - hidden_states, state.forward_batch.forward_mode + hidden_states, state.forward_batch ) else: state.hidden_states_mlp_output = hidden_states diff --git a/python/sglang/srt/models/qwen3_moe.py b/python/sglang/srt/models/qwen3_moe.py index 
0ca0a9509..7c7c7551b 100644 --- a/python/sglang/srt/models/qwen3_moe.py +++ b/python/sglang/srt/models/qwen3_moe.py @@ -229,7 +229,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module): hidden_states=hidden_states, topk_idx=topk_idx, topk_weights=topk_weights, - forward_mode=forward_mode, + forward_batch=forward_batch, ) final_hidden_states = self.experts( hidden_states=hidden_states, @@ -240,14 +240,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module): masked_m=masked_m, expected_m=expected_m, num_recv_tokens_per_expert=num_recv_tokens_per_expert, - forward_mode=forward_mode, + forward_batch=forward_batch, ) if self.ep_size > 1: final_hidden_states = self.deepep_dispatcher.combine( hidden_states=final_hidden_states, topk_idx=topk_idx, topk_weights=topk_weights, - forward_mode=forward_mode, + forward_batch=forward_batch, ) return final_hidden_states @@ -293,7 +293,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module): hidden_states=state.pop("hidden_states_mlp_input"), topk_idx=state.pop("topk_idx_local"), topk_weights=state.pop("topk_weights_local"), - forward_mode=state.forward_batch.forward_mode, + forward_batch=state.forward_batch, tbo_subbatch_index=state.get("tbo_subbatch_index"), ) @@ -325,7 +325,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module): masked_m=state.pop("masked_m"), expected_m=state.pop("expected_m"), num_recv_tokens_per_expert=state.pop("num_recv_tokens_per_expert"), - forward_mode=state.forward_batch.forward_mode, + forward_batch=state.forward_batch, ) def op_combine_a(self, state): @@ -334,7 +334,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module): hidden_states=state.pop("hidden_states_experts_output"), topk_idx=state.pop("topk_idx_dispatched"), topk_weights=state.pop("topk_weights_dispatched"), - forward_mode=state.forward_batch.forward_mode, + forward_batch=state.forward_batch, tbo_subbatch_index=state.get("tbo_subbatch_index"), ) @@ -647,9 +647,7 @@ class Qwen3MoeDecoderLayer(nn.Module): def op_mlp(self, state): hidden_states = state.pop("hidden_states_mlp_input") - 
state.hidden_states_mlp_output = self.mlp( - hidden_states, state.forward_batch.forward_mode - ) + state.hidden_states_mlp_output = self.mlp(hidden_states, state.forward_batch) def op_comm_postprocess_layer(self, state): hidden_states, residual = self.layer_communicator.postprocess_layer( diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 756c17811..ce9710985 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -418,10 +418,6 @@ class ServerArgs: # DeepEP MoE if self.enable_deepep_moe: - if self.deepep_mode == "auto": - assert ( - not self.enable_dp_attention - ), "DeepEP MoE `auto` mode is not supported with DP Attention." if self.deepep_mode == "normal": logger.warning("Cuda graph is disabled because deepep_mode=`normal`") self.disable_cuda_graph = True diff --git a/python/sglang/srt/two_batch_overlap.py b/python/sglang/srt/two_batch_overlap.py index 0879bcf2b..2d32b118a 100644 --- a/python/sglang/srt/two_batch_overlap.py +++ b/python/sglang/srt/two_batch_overlap.py @@ -13,7 +13,7 @@ from sglang.srt.layers.communicator import ( ) from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher from sglang.srt.layers.quantization import deep_gemm_wrapper -from sglang.srt.managers.schedule_batch import global_server_args_dict +from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode from sglang.srt.operations import execute_operations, execute_overlapped_operations from sglang.srt.operations_strategy import OperationsStrategy @@ -272,7 +272,11 @@ class TboCudaGraphRunnerPlugin: class TboDPAttentionPreparer: def prepare_all_gather( - self, local_batch, deepep_mode, enable_deepep_moe, enable_two_batch_overlap + self, + local_batch: ScheduleBatch, + deepep_mode: DeepEPMode, + enable_deepep_moe: bool, + enable_two_batch_overlap: bool, ): self.enable_two_batch_overlap = 
enable_two_batch_overlap @@ -294,7 +298,7 @@ class TboDPAttentionPreparer: extend_lens=local_batch.extend_lens, token_num_per_seq=token_num_per_seq, ) - resolved_deepep_mode = deepep_mode.resolve(local_batch.forward_mode) + resolved_deepep_mode = deepep_mode.resolve(local_batch.is_extend_in_batch) local_can_run_tbo = (self.local_tbo_split_seq_index is not None) and not ( ( local_batch.forward_mode.is_extend() diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 74ea47dac..00101f060 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -2202,14 +2202,14 @@ class DeepEPMode(Enum): def enable_low_latency(self): return self in [DeepEPMode.low_latency, DeepEPMode.auto] - def resolve(self, forward_mode): + def resolve(self, is_extend_in_batch: bool): if self != DeepEPMode.auto: return self - if forward_mode.is_decode(): - return DeepEPMode.low_latency - else: + if is_extend_in_batch: return DeepEPMode.normal + else: + return DeepEPMode.low_latency def is_non_idle_and_non_empty(forward_mode, hidden_states): diff --git a/test/srt/test_hybrid_dp_ep_tp_mtp.py b/test/srt/test_hybrid_dp_ep_tp_mtp.py index c909c0ced..a3d44a67a 100644 --- a/test/srt/test_hybrid_dp_ep_tp_mtp.py +++ b/test/srt/test_hybrid_dp_ep_tp_mtp.py @@ -539,8 +539,9 @@ class Test10(CustomTestCase): "8", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "128", ], ) @@ -593,8 +594,9 @@ class Test11(CustomTestCase): "4", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "128", ], ) @@ -647,8 +649,9 @@ class Test12(CustomTestCase): "8", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "128", ], ) @@ -700,8 +703,9 @@ class Test13(CustomTestCase): "1", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "128", ], ) @@ 
-756,8 +760,9 @@ class Test14(CustomTestCase): "1", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "128", ], ) @@ -812,8 +817,9 @@ class Test15(CustomTestCase): "1", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "128", ], ) @@ -867,8 +873,9 @@ class Test16(CustomTestCase): "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "128", ], ) @@ -922,8 +929,9 @@ class Test17(CustomTestCase): "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "128", ], ) @@ -979,8 +987,9 @@ class Test18(CustomTestCase): "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "128", ], ) @@ -1036,8 +1045,9 @@ class Test19(CustomTestCase): "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "128", ], ) @@ -2213,8 +2223,11 @@ class Test40(CustomTestCase): "8", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", "--speculative-algo", "NEXTN", "--speculative-draft", @@ -2277,8 +2290,11 @@ class Test41(CustomTestCase): "4", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", "--speculative-algo", "NEXTN", "--speculative-draft", @@ -2341,8 +2357,11 @@ class Test42(CustomTestCase): "8", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", "--speculative-algo", "NEXTN", "--speculative-draft", @@ -2404,8 +2423,11 @@ class 
Test43(CustomTestCase): "1", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", "--speculative-algo", "NEXTN", "--speculative-draft", @@ -2470,8 +2492,11 @@ class Test44(CustomTestCase): "1", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", "--speculative-algo", "NEXTN", "--speculative-draft", @@ -2536,8 +2561,11 @@ class Test45(CustomTestCase): "1", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", "--speculative-algo", "NEXTN", "--speculative-draft", @@ -2601,8 +2629,11 @@ class Test46(CustomTestCase): "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", "--speculative-algo", "NEXTN", "--speculative-draft", @@ -2666,8 +2697,11 @@ class Test47(CustomTestCase): "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", "--speculative-algo", "NEXTN", "--speculative-draft", @@ -2733,8 +2767,11 @@ class Test48(CustomTestCase): "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", "--speculative-algo", "NEXTN", "--speculative-draft", @@ -2800,8 +2837,11 @@ class Test49(CustomTestCase): "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", - "normal", - "--disable-cuda-graph", + "auto", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", "--speculative-algo", "NEXTN", "--speculative-draft",