diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index fd59624bc..578f7193c 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -71,7 +71,6 @@ from sglang.srt.utils import ( is_cuda, kill_process_tree, launch_dummy_health_check_server, - maybe_set_triton_cache_manager, prepare_model_and_tokenizer, set_prometheus_multiproc_dir, set_ulimit, @@ -637,11 +636,6 @@ def _set_envs_and_config(server_args: ServerArgs): # Set ulimit set_ulimit() - # Fix triton bugs - if server_args.tp_size * server_args.dp_size > 1: - # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency. - maybe_set_triton_cache_manager() - # Check flashinfer version if server_args.attention_backend == "flashinfer": assert_pkg_version( diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py index 43819e1a6..e2ce86847 100644 --- a/python/sglang/srt/entrypoints/http_server.py +++ b/python/sglang/srt/entrypoints/http_server.py @@ -107,6 +107,8 @@ from sglang.version import __version__ logger = logging.getLogger(__name__) asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) +HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20)) + # Store global states @dataclasses.dataclass @@ -212,9 +214,6 @@ async def validate_json_request(raw_request: Request): ) -HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20)) - - ##### Native API endpoints ##### @@ -807,6 +806,24 @@ async def retrieve_model(model: str): ) +@app.post("/v1/score", dependencies=[Depends(validate_json_request)]) +async def v1_score_request(request: ScoringRequest, raw_request: Request): + """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation.""" + return await raw_request.app.state.openai_serving_score.handle_request( + request, raw_request + ) + + +@app.api_route( + "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)] +) +async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request): + """Endpoint for reranking documents based on query relevance.""" + return await raw_request.app.state.openai_serving_rerank.handle_request( + request, raw_request + ) + + ## SageMaker API @app.get("/ping") async def sagemaker_health() -> Response: @@ -852,24 +869,6 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Reque return ORJSONResponse({"predictions": ret}) -@app.post("/v1/score", dependencies=[Depends(validate_json_request)]) -async def v1_score_request(request: ScoringRequest, raw_request: Request): - """Endpoint for the decoder-only scoring API. 
See Engine.score() for detailed documentation.""" - return await raw_request.app.state.openai_serving_score.handle_request( - request, raw_request - ) - - -@app.api_route( - "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)] -) -async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request): - """Endpoint for reranking documents based on query relevance.""" - return await raw_request.app.state.openai_serving_rerank.handle_request( - request, raw_request - ) - - def _create_error_response(e): return ORJSONResponse( {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST @@ -916,15 +915,6 @@ def launch_server( add_prometheus_middleware(app) enable_func_timer() - image_token_text = None - if ( - tokenizer_manager.image_token_id is not None - and not server_args.skip_tokenizer_init - ): - image_token_text = tokenizer_manager.tokenizer.decode( - [tokenizer_manager.image_token_id] - ) - # Send a warmup request - we will create the thread launch it # in the lifespan after all other warmups have fired. warmup_thread = threading.Thread( @@ -932,7 +922,6 @@ def launch_server( args=( server_args, pipe_finish_writer, - image_token_text, launch_callback, ), ) @@ -1066,7 +1055,6 @@ def _execute_server_warmup( def _wait_and_warmup( server_args: ServerArgs, pipe_finish_writer: Optional[multiprocessing.connection.Connection], - image_token_text: str, launch_callback: Optional[Callable[[], None]] = None, ): if not server_args.skip_server_warmup: diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py index ce00fb9c8..253c269b6 100644 --- a/python/sglang/srt/layers/moe/topk.py +++ b/python/sglang/srt/layers/moe/topk.py @@ -15,7 +15,7 @@ from __future__ import annotations import math -from typing import TYPE_CHECKING, Callable, NamedTuple, Optional +from typing import Callable, NamedTuple, Optional import torch import torch.nn.functional as F @@ -39,10 +39,10 @@ from sglang.srt.utils import ( _is_cuda = is_cuda() _is_hip = is_hip() -_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip -_is_cpu_amx_available = cpu_has_amx_support() _is_cpu = is_cpu() +_is_cpu_amx_available = cpu_has_amx_support() _is_npu = is_npu() +_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip if _is_cuda: from sgl_kernel import moe_fused_gate @@ -54,7 +54,6 @@ if _use_aiter: from aiter import biased_grouped_topk as aiter_biased_grouped_topk except ImportError: raise ImportError("aiter is required when SGLANG_USE_AITER is set to True") - if _is_npu: import torch_npu diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index c3b5fc2e8..9b7a8b7d9 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -653,6 +653,9 @@ class Scheduler( ) ) + embedding_cache_size = int(os.environ.get("SGLANG_VLM_CACHE_SIZE_MB", "100")) + init_embedding_cache(embedding_cache_size * 1024 * 1024) + def init_profier(self): self.torch_profiler = None self.torch_profiler_output_dir: Optional[str] = None @@ -2895,9 +2898,9 @@ def run_scheduler_process( prefix += f" PP{pp_rank}" # Config the process - kill_itself_when_parent_died() setproctitle.setproctitle(f"sglang::scheduler{prefix.replace(' ', '_')}") faulthandler.enable() + kill_itself_when_parent_died() parent_process = psutil.Process().parent() # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var @@ -2912,10 +2915,6 @@ def run_scheduler_process( if 
get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)

-    embedding_cache_size = 100
-    if "SGLANG_VLM_CACHE_SIZE_MB" in os.environ:
-        embedding_cache_size = int(os.environ["SGLANG_VLM_CACHE_SIZE_MB"])
-    init_embedding_cache(embedding_cache_size * 1024 * 1024)
     # Create a scheduler and run the event loop
     try:
         scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, pp_rank, dp_rank)
@@ -2926,8 +2925,8 @@ def run_scheduler_process(
                 "max_req_input_len": scheduler.max_req_input_len,
             }
         )
-        disaggregation_mode: DisaggregationMode = scheduler.disaggregation_mode

+        disaggregation_mode: DisaggregationMode = scheduler.disaggregation_mode
         if disaggregation_mode == DisaggregationMode.NULL:
             if server_args.pp_size > 1:
                 scheduler.event_loop_pp()
diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py
index d6850aabd..8f278e523 100644
--- a/python/sglang/srt/model_executor/forward_batch_info.py
+++ b/python/sglang/srt/model_executor/forward_batch_info.py
@@ -74,8 +74,6 @@ class ForwardMode(IntEnum):
     MIXED = auto()
     # No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated.
     IDLE = auto()
-    # Split Prefill for PD multiplexing
-    SPLIT_PREFILL = auto()

     # Used in speculative decoding: verify a batch in the target model.
     TARGET_VERIFY = auto()
@@ -86,6 +84,9 @@ class ForwardMode(IntEnum):
     # It is now used for triggering the sampling_info_done event for the first prefill batch.
     DUMMY_FIRST = auto()

+    # Split Prefill for PD multiplexing
+    SPLIT_PREFILL = auto()
+
     def is_prefill(self):
         return self.is_extend()
@@ -103,12 +104,12 @@ class ForwardMode(IntEnum):
     def is_mixed(self):
         return self == ForwardMode.MIXED

-    def is_split_prefill(self):
-        return self == ForwardMode.SPLIT_PREFILL
-
     def is_idle(self):
         return self == ForwardMode.IDLE

+    def is_decode_or_idle(self):
+        return self == ForwardMode.DECODE or self == ForwardMode.IDLE
+
     def is_target_verify(self):
         return self == ForwardMode.TARGET_VERIFY
@@ -132,8 +133,8 @@ class ForwardMode(IntEnum):
     def is_dummy_first(self):
         return self == ForwardMode.DUMMY_FIRST

-    def is_decode_or_idle(self):
-        return self == ForwardMode.DECODE or self == ForwardMode.IDLE
+    def is_split_prefill(self):
+        return self == ForwardMode.SPLIT_PREFILL


 @total_ordering
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 3d3be71f1..fbb08077c 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -109,7 +109,6 @@ from sglang.srt.utils import (
     get_bool_env_var,
     get_cpu_ids_by_node,
     init_custom_process_group,
-    is_cuda,
     is_fa3_default_architecture,
     is_flashinfer_available,
     is_hip,
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index b48cbf725..107c63646 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -80,7 +80,7 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
-    page_size: int = 1
+    page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
@@ -266,31 +266,20 @@ class ServerArgs:
     def __post_init__(self):
         # Expert parallelism
+        # We put it here first due to some internal ckpt conversion issues.
if self.enable_ep_moe: self.ep_size = self.tp_size logger.warning( f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]." ) - if self.enable_flashinfer_moe: - assert ( - self.quantization == "modelopt_fp4" - ), "modelopt_fp4 quantization is required for Flashinfer MOE" - os.environ["TRTLLM_ENABLE_PDL"] = "1" - self.disable_shared_experts_fusion = True - logger.warning( - f"Flashinfer MoE is enabled. Shared expert fusion is disabled." - ) # Set missing default values if self.tokenizer_path is None: self.tokenizer_path = self.model_path - - if self.device is None: - self.device = get_device() - if self.served_model_name is None: self.served_model_name = self.model_path - + if self.device is None: + self.device = get_device() if self.random_seed is None: self.random_seed = random.randint(0, 1 << 30) @@ -359,7 +348,6 @@ class ServerArgs: self.chunked_prefill_size = 16384 else: self.chunked_prefill_size = 4096 - assert self.chunked_prefill_size % self.page_size == 0 # Set cuda graph max batch size if self.cuda_graph_max_bs is None: @@ -410,6 +398,14 @@ class ServerArgs: ) self.page_size = 128 + # Set page size + if self.page_size is None: + self.page_size = 1 + + # AMD-specific Triton attention KV splits default number + if is_hip(): + self.triton_attention_num_kv_splits = 16 + # Choose grammar backend if self.grammar_backend is None: self.grammar_backend = "xgrammar" @@ -431,6 +427,17 @@ class ServerArgs: self.enable_dp_attention ), "Please enable dp attention when setting enable_dp_lm_head. " + # MoE kernel + if self.enable_flashinfer_moe: + assert ( + self.quantization == "modelopt_fp4" + ), "modelopt_fp4 quantization is required for Flashinfer MOE" + os.environ["TRTLLM_ENABLE_PDL"] = "1" + self.disable_shared_experts_fusion = True + logger.warning( + f"Flashinfer MoE is enabled. Shared expert fusion is disabled." + ) + # DeepEP MoE if self.enable_deepep_moe: if self.deepep_mode == "normal": @@ -502,14 +509,6 @@ class ServerArgs: logger.warning( "DeepSeek MTP does not require setting speculative_draft_model_path." ) - elif "Llama4" in model_arch: - # TODO: remove this after Llama4 supports in other backends - if self.attention_backend != "fa3": - self.attention_backend = "fa3" - logger.warning( - "Llama4 requires using fa3 attention backend. " - "Attention backend is automatically set to fa3." 
- ) # Auto choose parameters if self.speculative_num_steps is None: @@ -542,12 +541,11 @@ class ServerArgs: ) and check_gguf_file(self.model_path): self.quantization = self.load_format = "gguf" + # Model loading if is_remote_url(self.model_path): self.load_format = "remote" - - # AMD-specific Triton attention KV splits default number - if is_hip(): - self.triton_attention_num_kv_splits = 16 + if self.custom_weight_loader is None: + self.custom_weight_loader = [] # PD disaggregation if self.disaggregation_mode == "decode": @@ -572,6 +570,7 @@ class ServerArgs: self.disable_cuda_graph = True logger.warning("Cuda graph is disabled for prefill server") + # Propagate env vars os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = ( "1" if self.enable_torch_compile else "0" ) @@ -580,9 +579,6 @@ class ServerArgs: "1" if self.disable_outlines_disk_cache else "0" ) - if self.custom_weight_loader is None: - self.custom_weight_loader = [] - @staticmethod def add_cli_args(parser: argparse.ArgumentParser): # Model and tokenizer @@ -1227,6 +1223,13 @@ class ServerArgs: default=ServerArgs.grammar_backend, help="Choose the backend for grammar-guided decoding.", ) + parser.add_argument( + "--mm-attention-backend", + type=str, + choices=["sdpa", "fa3", "triton_attn"], + default=ServerArgs.mm_attention_backend, + help="Set multimodal attention backend.", + ) # Speculative decoding parser.add_argument( @@ -1276,13 +1279,6 @@ class ServerArgs: help="The path of the draft model's small vocab table.", default=ServerArgs.speculative_token_map, ) - parser.add_argument( - "--mm-attention-backend", - type=str, - choices=["sdpa", "fa3", "triton_attn"], - default=ServerArgs.mm_attention_backend, - help="Set multimodal attention backend.", - ) # Expert parallelism parser.add_argument( @@ -1530,11 +1526,6 @@ class ServerArgs: action="store_true", help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.", ) - parser.add_argument( - "--disable-overlap-cg-plan", - action="store_true", - help="Disable the overlap optimization for cudagraph preparation in eagle verify.", - ) parser.add_argument( "--enable-mixed-chunk", action="store_true", @@ -1792,11 +1783,11 @@ class ServerArgs: return hf_config def check_server_args(self): + # Check parallel size constraints assert ( self.tp_size * self.pp_size ) % self.nnodes == 0, "tp_size must be divisible by number of nodes" - # FIXME pp constraints if self.pp_size > 1: assert ( self.disable_overlap_schedule @@ -1807,11 +1798,7 @@ class ServerArgs: assert not ( self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention ), "multi-node data parallel is not supported unless dp attention!" 
-        assert (
-            self.max_loras_per_batch > 0
-            # FIXME
-            and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and radix attention is in progress"
+
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
@@ -1820,9 +1807,32 @@
             None,
         }, "moe_dense_tp_size only support 1 and None currently"

+        # Check model architecture
+        model_arch = self.get_hf_config().architectures[0]
+        if "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+
+        # Check LoRA
         self.check_lora_server_args()

+        # Check speculative decoding
+        if self.speculative_algorithm is not None:
+            assert (
+                not self.enable_mixed_chunk
+            ), "enable_mixed_chunk is not supported with speculative decoding"
+
+        # Check chunked prefill
+        assert (
+            self.chunked_prefill_size % self.page_size == 0
+        ), "chunked_prefill_size must be divisible by page_size"
+
     def check_lora_server_args(self):
+        assert (
+            self.max_loras_per_batch > 0
+            # FIXME
+            and (self.lora_paths is None or self.disable_radix_cache)
+        ), "compatibility of lora and radix attention is in progress"
+
         # Enable LoRA if any LoRA paths are provided for backward compatibility.
         if self.lora_paths:
             if self.enable_lora is None:
diff --git a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
index 2c8cdf255..8cc324158 100644
--- a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
+++ b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
@@ -336,7 +336,6 @@ class EAGLEDraftCudaGraphRunner:
         forward_batch.req_pool_indices = self.req_pool_indices[:bs]
         forward_batch.positions = self.positions[:num_tokens]

-        # Special handle for seq_len_cpu used when flashinfer mla is used
         if forward_batch.seq_lens_cpu is not None:
             if bs != raw_bs:
                 self.seq_lens_cpu.fill_(self.seq_len_fill_value)
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 52a1e20b8..db45a3580 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -937,71 +937,6 @@ def monkey_patch_vllm_gguf_config():
     setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)


-def maybe_set_triton_cache_manager() -> None:
-    """Set environment variable to tell Triton to use a
-    custom cache manager"""
-    cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
-    if cache_manger is None:
-        manager = "sglang.srt.utils:CustomCacheManager"
-        logger.debug("Setting Triton cache manager to: %s", manager)
-        os.environ["TRITON_CACHE_MANAGER"] = manager
-
-
-class CustomCacheManager(FileCacheManager):
-    # Adapted from: https://github.com/tdoublep/vllm/blob/3307522289fdfefe323b6c00d0db696651989a2f/vllm/triton_utils/custom_cache_manager.py
-    def __init__(self, key, override=False, dump=False):
-        from sglang.srt.distributed.parallel_state import get_tp_group
-
-        self.key = key
-        self.lock_path = None
-
-        try:
-            module_path = "triton.runtime.cache"
-            cache_module = importlib.import_module(module_path)
-
-            default_cache_dir = getattr(cache_module, "default_cache_dir", None)
-            default_dump_dir = getattr(cache_module, "default_dump_dir", None)
-            default_override_dir = getattr(cache_module, "default_override_dir", None)
-        except (ModuleNotFoundError, AttributeError) as e:
-            default_cache_dir = None
-            default_dump_dir = None
-            default_override_dir = None
-
-        if dump:
-            self.cache_dir = (
-                default_dump_dir()
-                if default_dump_dir is not None
-                else
os.path.join(Path.home(), ".triton", "dump") - ) - self.cache_dir = os.path.join(self.cache_dir, self.key) - self.lock_path = os.path.join(self.cache_dir, "lock") - os.makedirs(self.cache_dir, exist_ok=True) - elif override: - self.cache_dir = ( - default_override_dir() - if default_override_dir is not None - else os.path.join(Path.home(), ".triton", "override") - ) - self.cache_dir = os.path.join(self.cache_dir, self.key) - else: - # create cache directory if it doesn't exist - self.cache_dir = os.getenv("TRITON_CACHE_DIR", "").strip() or ( - default_cache_dir() - if default_cache_dir is not None - else os.path.join(Path.home(), ".triton", "cache") - ) - if self.cache_dir: - try: - self.cache_dir = f"{self.cache_dir}_{get_tp_group().local_rank}" - except: - self.cache_dir = f"{self.cache_dir}_{os.getpid()}" - self.cache_dir = os.path.join(self.cache_dir, self.key) - self.lock_path = os.path.join(self.cache_dir, "lock") - os.makedirs(self.cache_dir, exist_ok=True) - else: - raise RuntimeError("Could not create or locate cache dir") - - def set_ulimit(target_soft_limit=65535): # number of open files resource_type = resource.RLIMIT_NOFILE diff --git a/test/srt/test_deepep_large.py b/test/srt/test_deepep_large.py index 703eb7789..9492f093a 100644 --- a/test/srt/test_deepep_large.py +++ b/test/srt/test_deepep_large.py @@ -101,7 +101,7 @@ class TestDeepseekMTP(CustomTestCase): "--max-running-requests", "512", "--speculative-algorithm", - "NEXTN", + "EAGLE", "--speculative-num-steps", "1", "--speculative-eagle-topk", diff --git a/test/srt/test_deepep_small.py b/test/srt/test_deepep_small.py index 0f6ccb955..0c5931a8d 100644 --- a/test/srt/test_deepep_small.py +++ b/test/srt/test_deepep_small.py @@ -261,7 +261,7 @@ class TestMTP(CustomTestCase): "--enable-dp-lm-head", "--enable-deepep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, "--speculative-num-steps", @@ -329,7 +329,7 @@ class TestMTPWithTBO(CustomTestCase): "--enable-deepep-moe", "--trust-remote-code", "--speculative-algorithm", - "NEXTN", + "EAGLE", "--speculative-num-steps", "2", "--speculative-eagle-topk", diff --git a/test/srt/test_hybrid_dp_ep_tp_mtp.py b/test/srt/test_hybrid_dp_ep_tp_mtp.py index 74363649a..15fa98f72 100644 --- a/test/srt/test_hybrid_dp_ep_tp_mtp.py +++ b/test/srt/test_hybrid_dp_ep_tp_mtp.py @@ -1224,7 +1224,7 @@ class Test30(CustomTestCase): "--tp", "8", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1271,7 +1271,7 @@ class Test31(CustomTestCase): "--dp", "4", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1318,7 +1318,7 @@ class Test32(CustomTestCase): "--dp", "8", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1364,7 +1364,7 @@ class Test33(CustomTestCase): "--moe-dense-tp-size", "1", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1413,7 +1413,7 @@ class Test34(CustomTestCase): "--moe-dense-tp-size", "1", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1462,7 +1462,7 @@ class Test35(CustomTestCase): "--moe-dense-tp-size", "1", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", 
"--speculative-num-steps", @@ -1510,7 +1510,7 @@ class Test36(CustomTestCase): "4", "--enable-dp-lm-head", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1558,7 +1558,7 @@ class Test37(CustomTestCase): "8", "--enable-dp-lm-head", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1608,7 +1608,7 @@ class Test38(CustomTestCase): "1", "--enable-dp-lm-head", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1658,7 +1658,7 @@ class Test39(CustomTestCase): "1", "--enable-dp-lm-head", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1709,7 +1709,7 @@ class Test40(CustomTestCase): "--max-running-requests", "32", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1763,7 +1763,7 @@ class Test41(CustomTestCase): "--max-running-requests", "32", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1817,7 +1817,7 @@ class Test42(CustomTestCase): "--max-running-requests", "32", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1870,7 +1870,7 @@ class Test43(CustomTestCase): "--max-running-requests", "32", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1926,7 +1926,7 @@ class Test44(CustomTestCase): "--max-running-requests", "32", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -1982,7 +1982,7 @@ class Test45(CustomTestCase): "--max-running-requests", "32", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2037,7 +2037,7 @@ class Test46(CustomTestCase): "--max-running-requests", "32", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2092,7 +2092,7 @@ class Test47(CustomTestCase): "--max-running-requests", "32", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2149,7 +2149,7 @@ class Test48(CustomTestCase): "--max-running-requests", "32", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2206,7 +2206,7 @@ class Test49(CustomTestCase): "--max-running-requests", "32", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2251,7 +2251,7 @@ class Test50(CustomTestCase): "8", "--enable-ep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2299,7 +2299,7 @@ class Test51(CustomTestCase): "4", "--enable-ep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2347,7 +2347,7 @@ class Test52(CustomTestCase): "8", "--enable-ep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", 
"--speculative-num-steps", @@ -2394,7 +2394,7 @@ class Test53(CustomTestCase): "1", "--enable-ep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2444,7 +2444,7 @@ class Test54(CustomTestCase): "1", "--enable-ep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2494,7 +2494,7 @@ class Test55(CustomTestCase): "1", "--enable-ep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2543,7 +2543,7 @@ class Test56(CustomTestCase): "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2592,7 +2592,7 @@ class Test57(CustomTestCase): "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2643,7 +2643,7 @@ class Test58(CustomTestCase): "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", @@ -2694,7 +2694,7 @@ class Test59(CustomTestCase): "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", - "NEXTN", + "EAGLE", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps",