diff --git a/docs/backend/hyperparameter_tuning.md b/docs/backend/hyperparameter_tuning.md index ca2f52b42..993ad983b 100644 --- a/docs/backend/hyperparameter_tuning.md +++ b/docs/backend/hyperparameter_tuning.md @@ -23,8 +23,8 @@ If you frequently see `token usage < 0.9` and `#queue-req > 0`, it means the ser The case of server being too conservative can happen when users send many requests with a large `max_new_tokens` but the requests stop very early due to EOS or stop strings. On the other hand, if you see `token usage` very high and you frequently see warnings like -`decode out of memory happened, #retracted_reqs: 1, #new_token_ratio: 0.9998 -> 1.0000`, you can increase `--schedule-conservativeness` to a value like 1.3. -If you see `decode out of memory happened` occasionally but not frequently, it is okay. +`KV cache pool is full. Retract requests. #retracted_reqs: 1, #new_token_ratio: 0.9998 -> 1.0000`, you can increase `--schedule-conservativeness` to a value like 1.3. +If you see `KV cache pool is full. Retract requests.` occasionally but not frequently, it is okay. 
### Tune `--dp-size` and `--tp-size` diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py index b87770690..29168e793 100644 --- a/python/sglang/bench_one_batch_server.py +++ b/python/sglang/bench_one_batch_server.py @@ -8,6 +8,7 @@ Usage: python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8 python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 +python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage """ import argparse @@ -19,10 +20,10 @@ import os import time from typing import Tuple -import numpy as np import requests from sglang.bench_serving import get_tokenizer, sample_random_requests +from sglang.profiler import run_profile from sglang.srt.entrypoints.http_server import launch_server from sglang.srt.server_args import ServerArgs from sglang.srt.utils import kill_process_tree @@ -42,6 +43,8 @@ class BenchArgs: base_url: str = "" skip_warmup: bool = False show_report: bool = False + profile: bool = False + profile_by_stage: bool = False @staticmethod def add_cli_args(parser: argparse.ArgumentParser): @@ -68,6 +71,8 @@ class BenchArgs: parser.add_argument("--base-url", type=str, default=BenchArgs.base_url) parser.add_argument("--skip-warmup", action="store_true") parser.add_argument("--show-report", action="store_true") + parser.add_argument("--profile", action="store_true") + parser.add_argument("--profile-by-stage", action="store_true") @classmethod def from_cli_args(cls, args: argparse.Namespace): @@ -93,8 +98,8 @@ def launch_server_process(server_args: ServerArgs): base_url = f"http://{server_args.host}:{server_args.port}" timeout = 600 - start_time = time.perf_counter() - while time.perf_counter() - start_time < timeout: + start_time = 
time.time() + while time.time() - start_time < timeout: try: headers = { "Content-Type": "application/json; charset=utf-8", @@ -119,6 +124,8 @@ def run_one_case( run_name: str, result_filename: str, tokenizer, + profile: bool = False, + profile_by_stage: bool = False, ): requests.post(url + "/flush_cache") input_requests = sample_random_requests( @@ -145,6 +152,12 @@ def run_one_case( else: json_schema = None + profile_link = None + if profile: + profile_link: str = run_profile( + url, 3, ["CPU", "GPU"], None, None, profile_by_stage + ) + tic = time.perf_counter() response = requests.post( url + "/generate", @@ -194,8 +207,8 @@ def run_one_case( print(f"output_len: {output_len}") print(f"latency: {latency:.2f} s") print(f"ttft: {ttft:.2f} s") - print(f"Last generation throughput: {last_gen_throughput:.2f} tok/s") - print(f"Input throughput: {input_throughput:.2f} tok/s") + print(f"last generation throughput: {last_gen_throughput:.2f} tok/s") + print(f"input throughput: {input_throughput:.2f} tok/s") if output_len != 1: print(f"output throughput: {output_throughput:.2f} tok/s") @@ -222,6 +235,7 @@ def run_one_case( overall_throughput, last_gen_throughput, acc_length, + profile_link if profile else None, ) @@ -253,6 +267,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): # benchmark result = [] + bench_result = [] try: for bs, il, ol in itertools.product( bench_args.batch_size, bench_args.input_len, bench_args.output_len @@ -271,6 +286,33 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): tokenizer=tokenizer, ) ) + + if bench_args.profile: + try: + for bs, il, ol in itertools.product( + bench_args.batch_size, bench_args.input_len, bench_args.output_len + ): + bench_result.append( + ( + run_one_case( + base_url, + bs, + il, + ol, + temperature=bench_args.temperature, + return_logprob=bench_args.return_logprob, + input_len_step_percentage=bench_args.input_len_step_percentage, + run_name=bench_args.run_name, + 
result_filename=bench_args.result_filename, + tokenizer=tokenizer, + profile=bench_args.profile, + profile_by_stage=bench_args.profile_by_stage, + )[-1], + ) + ) + result = [t1[:-1] + t2 for t1, t2 in zip(result, bench_result)] + except Exception as e: + print(f"Error profiling, there will be no profile trace dump: {e}") finally: if proc: kill_process_tree(proc.pid) @@ -280,8 +322,20 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): if not bench_args.show_report: return - summary = " | batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input price ($/1M) | output price ($/1M) |\n" - summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ------------------ | ------------------- |\n" + summary = ( + f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n" + ) + summary += "| batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |" + + if bench_args.profile: + summary += " profile |" + + summary += "\n" + summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |" + + if bench_args.profile: + summary += "-------------|" + summary += "\n" for ( batch_size, @@ -292,6 +346,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): overall_throughput, last_gen_throughput, acc_length, + trace_link, ) in result: hourly_cost = 2 * server_args.tp_size # $2/hour for one H100 input_util = 0.7 @@ -304,17 +359,18 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): f"{accept_length} | " f"{1 / (output_throughput/batch_size) * 1000:.2f} | " f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | " - f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |\n" + f"{1e6 / output_throughput / 
3600 * hourly_cost:.2f} |" ) + if trace_link: + line += f" [Profile]({trace_link}) |" + line += "\n" summary += line # print metrics table print(summary) if is_in_ci(): - write_github_step_summary( - f"### Test Nightly Benchmark (bench_one_batch) \n{summary}" - ) + write_github_step_summary(summary) if __name__ == "__main__": diff --git a/python/sglang/profiler.py b/python/sglang/profiler.py new file mode 100644 index 000000000..3503ae7fc --- /dev/null +++ b/python/sglang/profiler.py @@ -0,0 +1,167 @@ +""" +Run live profiling. + +Usage: +python3 -m sglang.profiler +""" + +import argparse +import json +import os +import time +import urllib.parse +from argparse import ArgumentParser +from pathlib import Path +from typing import List, Optional + +import requests + +PARENT_FOLDER = "/tmp/sglang-profile" + + +def _run_profile( + url: Optional[str], + num_steps: int, + activities: List[str], + output_dir: Optional[str] = None, + profile_name: Optional[str] = None, + profile_by_stage: bool = False, +) -> str: + if output_dir is None: + output_dir = PARENT_FOLDER + + output_dir = os.path.normpath(output_dir) + output_dir = os.path.abspath(output_dir) + output_dir = Path(output_dir) + + # Add "profile_name/timestamp" to the path. + if profile_name: + output_dir = output_dir / profile_name + output_dir = output_dir / str(time.time()) + output_dir.mkdir(exist_ok=True, parents=True) + + print(f"Dump profiling traces to {output_dir}") + print( + f"Waiting for {num_steps} steps and the trace to be flushed.... ({profile_by_stage=})" + ) + + # Dump server args. + file_path = Path(output_dir) / "server_args.json" + if not file_path.exists(): + response = requests.get(url + "/get_server_info") + response.raise_for_status() + server_args_data = response.json() + with open(file_path, "w") as file: + file.write(json.dumps(server_args_data)) + + # Start profiler. The API replies when all steps are processed + # and files are generated. 
+ json_data = { + "output_dir": str(output_dir), + "num_steps": str(num_steps), + "activities": activities, + "profile_by_stage": profile_by_stage, + } + + response = requests.post(url=url + "/start_profile", json=json_data) + response.raise_for_status() + + trace_link = str(output_dir) + return trace_link + + + def run_profile( + url: Optional[str], + num_steps: int, + activities: List[str], + output_dir: Optional[str] = None, + profile_name: Optional[str] = None, + profile_by_stage: bool = False, + ): + # step based profile will self terminate on num_steps constraints + link = _run_profile( + url, num_steps, activities, output_dir, profile_name, profile_by_stage + ) + return link + + + if __name__ == "__main__": + parser = ArgumentParser(description="Run live profiling against a running server.") + parser.add_argument( + "--url", + type=str, + default="http://localhost:30000", + help="Server or API base url if not using http host and port.", + ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Profile directory to dump profile traces.", + ) + parser.add_argument( + "--profile-name", + type=str, + default=None, + help="The name of this profile run.", + ) + parser.add_argument( + "--num-steps", + type=int, + default=5, + help="The number of forward steps to profile.", + ) + parser.add_argument( + "--profile-by-stage", + action=argparse.BooleanOptionalAction, + type=bool, + default=False, + help="Whether to profile the prefill and decode stages separately.", + ) + parser.add_argument( + "--cpu", + action=argparse.BooleanOptionalAction, + type=bool, + default=True, + help="Whether to profile CPU activity", + ) + parser.add_argument( + "--gpu", + action=argparse.BooleanOptionalAction, + type=bool, + default=True, + help="Whether to profile GPU activity", + ) + parser.add_argument( + "--mem", + action=argparse.BooleanOptionalAction, + type=bool, + default=False, + help="Whether to profile memory usage (https://pytorch.org/memory_viz)", + ) + parser.add_argument( + "--rpd", + 
action=argparse.BooleanOptionalAction, + type=bool, + default=False, + help="Whether to use rpd profiler (https://github.com/ROCm/rocmProfileData)", + ) + + args = parser.parse_args() + activities = [] + if args.cpu: + activities.append("CPU") + if args.gpu: + activities.append("GPU") + if args.mem: + activities.append("MEM") + if args.rpd: + activities.append("RPD") + run_profile( + args.url, + args.num_steps, + activities, + args.output_dir, + args.profile_name, + args.profile_by_stage, + ) diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index 0c9988df1..d2c1d9d09 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -514,9 +514,7 @@ def _set_envs_and_config(server_args: ServerArgs): pid, exitcode = os.waitpid(0, os.WNOHANG) if exitcode != 0: logger.warning( - "Child process unexpectedly failed with an exit code %d. pid=%d", - exitcode, - pid, + f"Child process unexpectedly failed with {exitcode=}. 
{pid=}" ) signal.signal(signal.SIGCHLD, sigchld_handler) diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py index 1c6892bca..52ff310b3 100644 --- a/python/sglang/srt/entrypoints/http_server.py +++ b/python/sglang/srt/entrypoints/http_server.py @@ -350,6 +350,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None): activities=obj.activities, with_stack=obj.with_stack, record_shapes=obj.record_shapes, + profile_by_stage=obj.profile_by_stage, ) return Response( content="Start profiling.\n", diff --git a/python/sglang/srt/managers/expert_location.py b/python/sglang/srt/managers/expert_location.py index d12fd4975..615e0a440 100644 --- a/python/sglang/srt/managers/expert_location.py +++ b/python/sglang/srt/managers/expert_location.py @@ -401,7 +401,6 @@ def compute_initial_expert_location_metadata( ) -> ExpertLocationMetadata: data = server_args.init_expert_location if data == "trivial": - logger.info("init_expert_location from trivial") return ExpertLocationMetadata.init_trivial(server_args, model_config) # TODO unify with the utils function diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 48e6fd6b6..f13b23b17 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -848,7 +848,8 @@ class ProfileReqInput: # If it is set, profiling is automatically stopped after this step, and # the caller doesn't need to run stop_profile. 
num_steps: Optional[int] = None - activities: Optional[List[Literal["CPU", "GPU", "MEM", "CUDA_PROFILER"]]] = None + activities: Optional[List[str]] = None + profile_by_stage: bool = False with_stack: Optional[bool] = None record_shapes: Optional[bool] = None @@ -875,6 +876,7 @@ class ProfileReq: output_dir: Optional[str] = None num_steps: Optional[int] = None activities: Optional[List[str]] = None + profile_by_stage: bool = False with_stack: Optional[bool] = None record_shapes: Optional[bool] = None profile_id: Optional[str] = None diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 9c9a8f6a9..6132db8e1 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -34,7 +34,6 @@ import zmq from torch.distributed import barrier from sglang.global_config import global_config -from sglang.srt import two_batch_overlap from sglang.srt.configs.model_config import ModelConfig from sglang.srt.constrained.base_grammar_backend import create_grammar_backend from sglang.srt.disaggregation.decode import ( @@ -63,7 +62,6 @@ from sglang.srt.hf_transformers_utils import ( from sglang.srt.layers.dp_attention import compute_dp_attention_world_info from sglang.srt.layers.logits_processor import LogitsProcessorOutput from sglang.srt.managers.expert_distribution import ( - ExpertDistributionRecorder, get_global_expert_distribution_recorder, ) from sglang.srt.managers.io_struct import ( @@ -140,6 +138,7 @@ from sglang.srt.utils import ( broadcast_pyobj, configure_logger, disable_request_logging, + get_available_gpu_memory, get_bool_env_var, get_zmq_socket, kill_itself_when_parent_died, @@ -213,7 +212,6 @@ class Scheduler( self.gpu_id = gpu_id self.enable_hierarchical_cache = server_args.enable_hierarchical_cache self.page_size = server_args.page_size - # Distributed rank info self.dp_size = server_args.dp_size self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = ( 
compute_dp_attention_world_info( @@ -333,12 +331,16 @@ class Scheduler( # Print debug info if tp_rank == 0: + avail_mem = get_available_gpu_memory( + self.device, self.gpu_id, empty_cache=False + ) logger.info( f"max_total_num_tokens={self.max_total_num_tokens}, " f"chunked_prefill_size={server_args.chunked_prefill_size}, " f"max_prefill_tokens={self.max_prefill_tokens}, " f"max_running_requests={self.max_running_requests}, " - f"context_len={self.model_config.context_len}" + f"context_len={self.model_config.context_len}, " + f"available_gpu_mem={avail_mem:.2f} GB" ) # Init memory pool and cache @@ -362,6 +364,7 @@ class Scheduler( self.current_stream = torch.get_device_module(self.device).current_stream() if self.device == "cpu": self.current_stream.synchronize = lambda: None # No-op for CPU + self.forward_sleep_time = None # Init session info self.sessions: Dict[str, Session] = {} @@ -425,8 +428,14 @@ class Scheduler( self.profiler_activities: Optional[List[str]] = None self.profiler_id: Optional[str] = None self.profiler_target_forward_ct: Optional[int] = None - - self.forward_sleep_time = None + self.profiler_target_prefill_ct: Optional[int] = None + self.profiler_target_decode_ct: Optional[int] = None + self.profiler_prefill_ct: Optional[int] = None + self.profiler_decode_ct: Optional[int] = None + self.profile_by_stage: bool = False + self.profile_steps: Optional[int] = None + self.profile_in_progress: bool = False + self.rpd_profiler = None # Init metrics stats self.init_metrics() @@ -1518,7 +1527,7 @@ class Scheduler( self.new_token_ratio = new_token_ratio logger.info( - "Decode out of memory happened. " + "KV cache pool is full. Retract requests. 
" f"#retracted_reqs: {len(retracted_reqs)}, " f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}" ) @@ -1542,13 +1551,8 @@ class Scheduler( """Run a batch.""" self.forward_ct += 1 - # Check profiler - if ( - self.profiler_target_forward_ct - and self.profiler_target_forward_ct <= self.forward_ct - ): - self.send_to_tokenizer.send_pyobj(self.stop_profile()) - + # Whether to run the profiler + self._profile_batch_predicate(batch) if self.forward_sleep_time is not None: logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s") time.sleep(self.forward_sleep_time) @@ -2121,46 +2125,82 @@ class Scheduler( def profile(self, recv_req: ProfileReq): if recv_req.type == ProfileReqType.START_PROFILE: - return self.start_profile( - recv_req.output_dir, - recv_req.num_steps, - recv_req.activities, - recv_req.with_stack, - recv_req.record_shapes, - recv_req.profile_id, - ) + if recv_req.profile_by_stage: + return self.init_profile( + recv_req.output_dir, + recv_req.num_steps, + recv_req.activities, + recv_req.with_stack, + recv_req.record_shapes, + recv_req.profile_by_stage, + ) + else: + self.init_profile( + recv_req.output_dir, + recv_req.num_steps, + recv_req.activities, + recv_req.with_stack, + recv_req.record_shapes, + recv_req.profile_by_stage, + ) + return self.start_profile(True) else: return self.stop_profile() - def start_profile( + def init_profile( self, output_dir: Optional[str], num_steps: Optional[int], activities: Optional[List[str]], with_stack: Optional[bool], record_shapes: Optional[bool], - profile_id: Optional[str], - ) -> None: - if self.profiler_activities: + profile_by_stage: bool, + ) -> ProfileReqOutput: + if self.profile_in_progress: return ProfileReqOutput( success=False, message="Profiling is already in progress. 
Call /stop_profile first.", ) + self.profile_by_stage = profile_by_stage + if output_dir is None: output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp") if activities is None: activities = ["CPU", "GPU"] self.torch_profiler_output_dir = output_dir + self.torch_profiler_with_stack = with_stack + self.torch_profiler_record_shapes = record_shapes self.profiler_activities = activities - self.profiler_id = profile_id + + if num_steps: + self.profile_steps = num_steps + if self.profile_by_stage: + self.profiler_target_prefill_ct = num_steps + self.profiler_target_decode_ct = num_steps + self.profiler_prefill_ct = 0 + self.profiler_decode_ct = 0 + else: + self.profiler_target_forward_ct = self.forward_ct + num_steps + # The caller will be notified when reaching profiler_target_forward_ct + else: + self.profiler_target_forward_ct = None + + return ProfileReqOutput(success=True, message="Succeeded") + + def start_profile( + self, stage: Optional[ForwardMode] = None + ) -> ProfileReqOutput | None: + stage_str = f" for {stage.__str__()}" if stage else "" logger.info( - "Profiling starts. Traces will be saved to: %s (with id %s)", - self.torch_profiler_output_dir, - self.profiler_id, + f"Profiling starts{stage_str}. 
Traces will be saved to: {self.torch_profiler_output_dir}", ) + activities = self.profiler_activities + with_stack = self.torch_profiler_with_stack + record_shapes = self.torch_profiler_record_shapes + activity_map = { "CPU": torch.profiler.ProfilerActivity.CPU, "GPU": torch.profiler.ProfilerActivity.CUDA, @@ -2169,48 +2209,97 @@ class Scheduler( activity_map[a] for a in activities if a in activity_map ] - if torchprof_activities: + if "RPD" in activities: + from rpdTracerControl import rpdTracerControl + + rpdTracerControl.skipCreate() + + self.rpd_profile_path = os.path.join( + self.torch_profiler_output_dir, + "rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz", + ) + + if self.tp_rank == 0: + import sqlite3 + + from rocpd.schema import RocpdSchema + + if os.path.exists("trace.rpd"): + os.unlink("trace.rpd") + schema = RocpdSchema() + connection = sqlite3.connect("trace.rpd") + schema.writeSchema(connection) + connection.commit() + del connection + torch.distributed.barrier(self.tp_cpu_group) + + self.rpd_profiler = rpdTracerControl() + self.rpd_profiler.setPythonTrace(True) + self.rpd_profiler.start() + self.rpd_profiler.rangePush("", "rpd profile range", "") + self.profile_in_progress = True + elif torchprof_activities: self.torch_profiler = torch.profiler.profile( activities=torchprof_activities, with_stack=with_stack if with_stack is not None else True, record_shapes=record_shapes if record_shapes is not None else False, ) self.torch_profiler.start() + self.profile_in_progress = True if "MEM" in activities: torch.cuda.memory._record_memory_history(max_entries=100000) + self.profile_in_progress = True if "CUDA_PROFILER" in activities: torch.cuda.cudart().cudaProfilerStart() - if num_steps: - self.profiler_target_forward_ct = self.forward_ct + num_steps - # The caller will be notified when reaching profiler_target_forward_ct - else: - self.profiler_target_forward_ct = None - return ProfileReqOutput(success=True, message="Succeeded") + return 
ProfileReqOutput(success=True, message="Succeeded") - def stop_profile(self) -> None: - if self.profiler_activities is None: + def stop_profile( + self, stage: Optional[ForwardMode] = None + ) -> ProfileReqOutput | None: + if not self.profile_in_progress: return ProfileReqOutput( success=False, message="Profiling is not in progress. Call /start_profile first.", ) - logger.info("Stop profiling...") + stage_suffix = f"-{stage.__str__()}" if stage else "" + logger.info("Stop profiling" + stage_suffix + "...") if self.torch_profiler is not None: self.torch_profiler.stop() self.torch_profiler.export_chrome_trace( os.path.join( self.torch_profiler_output_dir, - self.profiler_id + f"-TP-{self.tp_rank}" + ".trace.json.gz", + str(time.time()) + + f"-TP-{self.tp_rank}" + + stage_suffix + + ".trace.json.gz", ) ) + torch.distributed.barrier(self.tp_cpu_group) - if "MEM" in self.profiler_activities: + if self.rpd_profiler is not None: + self.rpd_profiler.rangePop() + self.rpd_profiler.stop() + self.rpd_profiler.flush() + + torch.distributed.barrier(self.tp_cpu_group) + if self.tp_rank == 0: + from sglang.srt.utils import rpd_to_chrome_trace + + rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path) + self.rpd_profiler = None + self.rpd_profiler_path = None + + if self.profiler_activities is not None and "MEM" in self.profiler_activities: memory_profile_path = os.path.join( self.torch_profiler_output_dir, - self.profiler_id + f"-TP-{self.tp_rank}-memory" + ".pickle", + str(time.time()) + + f"-TP-{self.tp_rank}-memory" + + stage_suffix + + ".pickle", ) torch.cuda.memory._dump_snapshot(memory_profile_path) torch.cuda.memory._record_memory_history(enabled=None) @@ -2223,11 +2312,38 @@ class Scheduler( self.torch_profiler_output_dir, ) self.torch_profiler = None - self.torch_profiler_output_dir = None - self.profiler_activities = None - self.profiler_target_forward_ct = None + self.profile_in_progress = False - return ProfileReqOutput(success=True, message="Succeeded") + return 
ProfileReqOutput(success=True, message="Succeeded.") + + def _profile_batch_predicate(self, batch): + if self.profile_by_stage: + if batch.forward_mode.is_prefill(): + if self.profiler_prefill_ct == 0: + self.start_profile(batch.forward_mode) + self.profiler_prefill_ct += 1 + if self.profiler_prefill_ct > self.profiler_target_prefill_ct: + if self.profile_in_progress: + self.stop_profile(stage=ForwardMode.EXTEND) + elif batch.forward_mode.is_decode(): + if self.profiler_decode_ct == 0: + if self.profile_in_progress: + # force trace flush + self.stop_profile(ForwardMode.EXTEND) + self.start_profile(batch.forward_mode) + self.profiler_decode_ct += 1 + if self.profiler_decode_ct > self.profiler_target_decode_ct: + if self.profile_in_progress: + self.stop_profile(stage=ForwardMode.DECODE) + else: + raise RuntimeError("unsupported profile stage") + else: + # Check profiler + if ( + self.profiler_target_forward_ct + and self.profiler_target_forward_ct <= self.forward_ct + ): + self.stop_profile() def expert_distribution_handle(self, recv_req: ExpertDistributionReq): if recv_req == ExpertDistributionReq.START_RECORD: diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index cc0b30038..9d52de62e 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -796,6 +796,7 @@ class TokenizerManager: activities: Optional[List[str]] = None, with_stack: Optional[bool] = None, record_shapes: Optional[bool] = None, + profile_by_stage: bool = False, ): self.auto_create_handle_loop() req = ProfileReq( @@ -805,6 +806,7 @@ class TokenizerManager: activities=activities, with_stack=with_stack, record_shapes=record_shapes, + profile_by_stage=profile_by_stage, profile_id=str(time.time()), ) return await self._execute_profile(req) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 
8a46d2318..af35514c7 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -39,10 +39,7 @@ from sglang.srt.model_executor.forward_batch_info import ( PPProxyTensors, ) from sglang.srt.patch_torch import monkey_patch_torch_compile -from sglang.srt.two_batch_overlap import ( - TboCudaGraphRunnerPlugin, - TboForwardBatchPreparer, -) +from sglang.srt.two_batch_overlap import TboCudaGraphRunnerPlugin from sglang.srt.utils import ( get_available_gpu_memory, get_device_memory_capacity, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 6d5248376..35429cbf0 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -77,11 +77,7 @@ from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner from sglang.srt.model_executor.expert_location_updater import ExpertLocationUpdater from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors from sglang.srt.model_loader import get_model -from sglang.srt.model_loader.loader import ( - DefaultModelLoader, - device_loading_context, - get_model_loader, -) +from sglang.srt.model_loader.loader import DefaultModelLoader, get_model_loader from sglang.srt.model_loader.utils import set_default_torch_dtype from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.patch_torch import monkey_patch_torch_reductions diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 1c2958b9c..fedbc84eb 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1643,7 +1643,7 @@ def auto_choose_speculative_params(arch: str): return (5, 4, 8) elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]: # The default value for deepseek - return (5, 4, 8) + return (3, 1, 4) elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]: 
return (5, 4, 8) else: diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index fe213b9ee..e0f965be9 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -93,6 +93,11 @@ def is_in_ci(): return get_bool_env_var("SGLANG_IS_IN_CI") +def is_in_amd_ci(): + """Return whether it is in an AMD CI runner.""" + return get_bool_env_var("SGLANG_AMD_CI") + + if is_in_ci(): DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( 5000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100 diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index bcec75782..323aeb1eb 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -16,7 +16,8 @@ suites = { TestFile("models/lora/test_lora.py", 76), TestFile("models/lora/test_lora_backend.py", 99), TestFile("models/lora/test_multi_lora_backend.py", 60), - TestFile("models/test_embedding_models.py", 184), + TestFile("models/lora/test_lora_cuda_graph.py", 250), + TestFile("models/test_embedding_models.py", 73), # TestFile("models/test_clip_models.py", 52), TestFile("models/test_compressed_tensors_models.py", 42), TestFile("models/test_generation_models.py", 103), @@ -24,44 +25,43 @@ suites = { # TestFile("models/test_grok_models.py", 60), # Disabled due to illegal memory access TestFile("models/test_qwen_models.py", 82), TestFile("models/test_reward_models.py", 132), - TestFile("models/test_vlm_models.py", 317), + TestFile("models/test_vlm_models.py", 437), TestFile("test_abort.py", 51), TestFile("test_block_int8.py", 22), TestFile("test_create_kvindices.py", 2), - TestFile("test_chunked_prefill.py", 285), - TestFile("test_eagle_infer.py", 584), + TestFile("test_chunked_prefill.py", 313), + TestFile("test_eagle_infer.py", 619), TestFile("test_ebnf_constrained.py", 108), + TestFile("test_enable_thinking.py", 70), TestFile("test_embedding_openai_server.py", 141), TestFile("test_eval_fp8_accuracy.py", 303), TestFile("test_fa3.py", 376), - TestFile("test_fim_completion.py", 40), + 
TestFile("test_flashmla.py", 352), TestFile("test_fp8_kernel.py", 8), TestFile("test_function_call_parser.py", 10), TestFile("test_fused_moe.py", 30), TestFile("test_hicache.py", 116), - TestFile("test_hicache_mla.py", 254), + TestFile("test_hicache_mla.py", 127), TestFile("test_hidden_states.py", 55), TestFile("test_int8_kernel.py", 8), TestFile("test_input_embeddings.py", 38), TestFile("test_json_constrained.py", 98), TestFile("test_large_max_new_tokens.py", 41), TestFile("test_metrics.py", 32), - TestFile("test_mla.py", 242), - TestFile("test_mla_deepseek_v3.py", 221), - TestFile("test_mla_int8_deepseek_v3.py", 389), - TestFile("test_mla_flashinfer.py", 395), - TestFile("test_mla_fp8.py", 153), - TestFile("test_flashmla.py", 300), + TestFile("test_mla.py", 167), + TestFile("test_mla_deepseek_v3.py", 342), + TestFile("test_mla_int8_deepseek_v3.py", 429), + TestFile("test_mla_flashinfer.py", 302), + TestFile("test_mla_fp8.py", 93), TestFile("test_no_chunked_prefill.py", 108), - TestFile("test_no_overlap_scheduler.py", 216), + TestFile("test_no_overlap_scheduler.py", 234), TestFile("test_openai_function_calling.py", 60), TestFile("test_openai_server.py", 149), TestFile("test_penalty.py", 41), TestFile("test_page_size.py", 60), TestFile("test_pytorch_sampling_backend.py", 66), - TestFile("test_radix_attention.py", 167), + TestFile("test_radix_attention.py", 105), TestFile("test_reasoning_content.py", 89), - TestFile("test_enable_thinking.py", 70), TestFile("test_regex_constrained.py", 64), TestFile("test_release_memory_occupation.py", 44), TestFile("test_request_length_validation.py", 31), @@ -70,13 +70,13 @@ suites = { TestFile("test_skip_tokenizer_init.py", 117), TestFile("test_srt_engine.py", 261), TestFile("test_srt_endpoint.py", 130), - TestFile("test_tool_choice.py", 120), + TestFile("test_tool_choice.py", 226), TestFile("test_torch_compile.py", 76), TestFile("test_torch_compile_moe.py", 172), TestFile("test_torch_native_attention_backend.py", 123), 
TestFile("test_torchao.py", 70), TestFile("test_triton_attention_kernels.py", 4), - TestFile("test_triton_attention_backend.py", 134), + TestFile("test_triton_attention_backend.py", 150), TestFile("test_triton_moe_channel_fp8_kernel.py", 25), TestFile("test_triton_sliding_window.py", 250), TestFile("test_update_weights_from_disk.py", 114), @@ -84,10 +84,9 @@ suites = { TestFile("test_vertex_endpoint.py", 31), TestFile("test_vision_chunked_prefill.py", 175), TestFile("test_vlm_input_format.py", 300), - TestFile("test_vision_openai_server_a.py", 700), - TestFile("test_vision_openai_server_b.py", 700), + TestFile("test_vision_openai_server_a.py", 584), + TestFile("test_vision_openai_server_b.py", 556), TestFile("test_w8a8_quantization.py", 46), - TestFile("models/lora/test_lora_cuda_graph.py", 250), ], "per-commit-amd": [ TestFile("test_mla.py", 242), @@ -119,9 +118,9 @@ suites = { # TestFile("test_deepep_intranode.py", 50), # TestFile("test_deepep_low_latency.py", 50), # TestFile("test_moe_deepep_eval_accuracy_large.py", 250), - TestFile("test_disaggregation.py", 210), - TestFile("test_disaggregation_different_tp.py", 210), - TestFile("test_full_deepseek_v3.py", 250), + TestFile("test_disaggregation.py", 270), + TestFile("test_disaggregation_different_tp.py", 155), + TestFile("test_full_deepseek_v3.py", 463), ], "per-commit-8-gpu-amd": [ TestFile("test_full_deepseek_v3.py", 250), @@ -133,11 +132,11 @@ suites = { TestFile("test_nightly_gsm8k_eval_amd.py"), ], "vllm_dependency_test": [ - TestFile("test_vllm_dependency.py"), TestFile("test_awq.py"), + TestFile("test_bnb.py"), TestFile("test_gguf.py", 78), TestFile("test_gptqmodel_dynamic.py", 72), - TestFile("test_bnb.py"), + TestFile("test_vllm_dependency.py"), ], } diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py index 0fc11db2e..2de46ee51 100644 --- a/test/srt/test_bench_one_batch.py +++ b/test/srt/test_bench_one_batch.py @@ -6,6 +6,7 @@ from sglang.test.test_utils import ( 
DEFAULT_MOE_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase, + is_in_amd_ci, is_in_ci, run_bench_offline_throughput, run_bench_one_batch, @@ -46,7 +47,7 @@ class TestBenchOneBatch(CustomTestCase): f"### test_moe_tp2_bs1 (Mixtral-8x7B)\n" f"output_throughput: {output_throughput:.2f} token/s\n" ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(output_throughput, 85) else: self.assertGreater(output_throughput, 125) @@ -62,7 +63,7 @@ class TestBenchOneBatch(CustomTestCase): f"### test_torch_compile_tp2_bs1 (Mixtral-8x7B)\n" f"output_throughput: {output_throughput:.2f} token/s\n" ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(output_throughput, 200) else: self.assertGreater(output_throughput, 220) diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index 6d640bf0f..a74c9dac3 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -1,4 +1,3 @@ -import os import unittest from sglang.test.test_utils import ( @@ -8,8 +7,8 @@ from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST_FP8, DEFAULT_MOE_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST, - DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST, CustomTestCase, + is_in_amd_ci, is_in_ci, run_bench_serving, write_github_step_summary, @@ -31,7 +30,7 @@ class TestBenchServing(CustomTestCase): f"### test_offline_throughput_default\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 3150) else: self.assertGreater(res["output_throughput"], 3800) @@ -69,7 +68,7 @@ class TestBenchServing(CustomTestCase): f"### test_offline_throughput_without_radix_cache\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 3050) else: 
self.assertGreater(res["output_throughput"], 3800) @@ -107,7 +106,7 @@ class TestBenchServing(CustomTestCase): f"### test_offline_throughput_with_triton_attention_backend\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 3500) else: self.assertGreater(res["output_throughput"], 3700) @@ -125,7 +124,7 @@ class TestBenchServing(CustomTestCase): f"### test_offline_throughput_default_fp8\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 3500) else: self.assertGreater(res["output_throughput"], 4300) @@ -144,7 +143,7 @@ class TestBenchServing(CustomTestCase): f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n' ) self.assertLess(res["median_e2e_latency_ms"], 11000) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertLess(res["median_ttft_ms"], 115) else: self.assertLess(res["median_ttft_ms"], 86) @@ -167,7 +166,7 @@ class TestBenchServing(CustomTestCase): f"### test_vlm_offline_throughput\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 2000) # TODO: not set yet, need AMD machine else: @@ -191,7 +190,7 @@ class TestBenchServing(CustomTestCase): f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n' ) self.assertLess(res["median_e2e_latency_ms"], 16500) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertLess(res["median_ttft_ms"], 150) # TODO: not set yet, need AMD machine else: @@ -230,7 +229,7 @@ class TestBenchServing(CustomTestCase): f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n' f'accept_length: {res["accept_length"]:.2f} \n' ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): 
self.assertLess(res["median_e2e_latency_ms"], 1800) else: self.assertLess(res["median_e2e_latency_ms"], 900) @@ -249,7 +248,7 @@ class TestBenchServing(CustomTestCase): f"### test_moe_offline_throughput_default\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 2100) else: self.assertGreater(res["output_throughput"], 2200) @@ -267,7 +266,7 @@ class TestBenchServing(CustomTestCase): f"### test_moe_offline_throughput_without_radix_cache\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 2100) else: self.assertGreater(res["output_throughput"], 2200) @@ -289,7 +288,7 @@ class TestBenchServing(CustomTestCase): f"### test_pp_offline_throughput_default_decode\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - self.assertGreater(res["output_throughput"], 7500) + self.assertGreater(res["output_throughput"], 6700) def test_pp_long_context_prefill(self): res = run_bench_serving( diff --git a/test/srt/test_full_deepseek_v3.py b/test/srt/test_full_deepseek_v3.py index d14c17bc7..6a7bb4729 100644 --- a/test/srt/test_full_deepseek_v3.py +++ b/test/srt/test_full_deepseek_v3.py @@ -1,4 +1,3 @@ -import os import unittest from types import SimpleNamespace @@ -11,6 +10,7 @@ from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, + is_in_amd_ci, is_in_ci, popen_launch_server, write_github_step_summary, @@ -67,7 +67,7 @@ class TestDeepseekV3(CustomTestCase): write_github_step_summary( f"### test_bs_1_speed (deepseek-v3)\n" f"{speed=:.2f} token/s\n" ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(speed, 12) else: self.assertGreater(speed, 75) @@ -91,7 +91,7 @@ class TestDeepseekV3MTP(CustomTestCase): "--speculative-num-draft-tokens", "4", ] 
- if os.environ.get("SGLANG_AMD_CI") != "1": + if not is_in_amd_ci(): other_args += ["--mem-frac", "0.7"] cls.process = popen_launch_server( cls.model, @@ -148,11 +148,11 @@ class TestDeepseekV3MTP(CustomTestCase): f"{acc_length=:.2f}\n" f"{speed=:.2f} token/s\n" ) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(acc_length, 2.8) else: self.assertGreater(acc_length, 2.9) - if os.getenv("SGLANG_AMD_CI") == "1": + if is_in_amd_ci(): self.assertGreater(speed, 15) else: self.assertGreater(speed, 105) diff --git a/test/srt/test_mla.py b/test/srt/test_mla.py index d5434a27f..af867797c 100644 --- a/test/srt/test_mla.py +++ b/test/srt/test_mla.py @@ -24,8 +24,8 @@ class TestMLA(CustomTestCase): other_args=[ "--trust-remote-code", "--enable-torch-compile", - "--cuda-graph-max-bs", - "2", + "--torch-compile-max-bs", + "4", "--chunked-prefill-size", "256", ], @@ -35,18 +35,6 @@ class TestMLA(CustomTestCase): def tearDownClass(cls): kill_process_tree(cls.process.pid) - def test_mmlu(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mmlu", - num_examples=64, - num_threads=32, - ) - - metrics = run_eval(args) - self.assertGreater(metrics["score"], 0.5) - def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, diff --git a/test/srt/test_mla_flashinfer.py b/test/srt/test_mla_flashinfer.py index aa971a582..7b3124df3 100644 --- a/test/srt/test_mla_flashinfer.py +++ b/test/srt/test_mla_flashinfer.py @@ -57,50 +57,6 @@ class TestFlashinferMLA(CustomTestCase): self.assertGreater(metrics["accuracy"], 0.62) -class TestFlashinferMLANoRagged(CustomTestCase): - @classmethod - def setUpClass(cls): - cls.model = "lmsys/sglang-ci-dsv3-test" - cls.base_url = DEFAULT_URL_FOR_TEST - other_args = ["--trust-remote-code"] - if torch.cuda.is_available() and torch.version.cuda: - other_args.extend( - [ - "--enable-torch-compile", - "--disable-cuda-graph", - "--cuda-graph-max-bs", - "4", - "--attention-backend", - 
"flashinfer", - ] - ) - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_gsm8k(self): - args = SimpleNamespace( - num_shots=5, - data_path=None, - num_questions=200, - max_new_tokens=512, - parallel=128, - host="http://127.0.0.1", - port=int(self.base_url.split(":")[-1]), - ) - metrics = run_eval_few_shot_gsm8k(args) - print(metrics) - - self.assertGreater(metrics["accuracy"], 0.62) - - class TestFlashinferMLAMTP(CustomTestCase): @classmethod def setUpClass(cls): diff --git a/test/srt/test_pp_single_node.py b/test/srt/test_pp_single_node.py index dbac4c771..01aecdd38 100644 --- a/test/srt/test_pp_single_node.py +++ b/test/srt/test_pp_single_node.py @@ -17,6 +17,7 @@ from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, + is_in_ci, popen_launch_server, run_bench_one_batch_server, ) @@ -59,7 +60,7 @@ class TestPPAccuracy(unittest.TestCase): self.assertGreater(metrics["accuracy"], 0.74) # Wait a little bit so that the memory check happens. 
- time.sleep(5) + time.sleep(4) class TestQwenPPAccuracy(unittest.TestCase): @@ -97,20 +98,17 @@ class TestQwenPPAccuracy(unittest.TestCase): finally: kill_process_tree(process.pid) - def test_baseline_accuracy(self): - metrics = self.run_gsm8k_test(pp_size=1) - print(f"[Qwen Baseline] {metrics=}") - self.assertGreater(metrics["accuracy"], 0.74) - + @unittest.skipIf(is_in_ci(), "To reduce the CI execution time.") def test_pp_consistency(self): baseline = self.run_gsm8k_test(pp_size=1) pp_metrics = self.run_gsm8k_test(pp_size=2) print(f"[Qwen PP Comparison] Baseline: {baseline} | PP: {pp_metrics}") + self.assertGreaterEqual(baseline["accuracy"], 0.74) self.assertGreaterEqual( pp_metrics["accuracy"], - baseline["accuracy"] - 0.01, + baseline["accuracy"] - 0.02, msg=( f"PP accuracy dropped more than 1% compared to baseline. " f"Baseline: {baseline['accuracy']:.2%}, PP: {pp_metrics['accuracy']:.2%}" @@ -155,20 +153,16 @@ class TestQwenPPTieWeightsAccuracy(unittest.TestCase): finally: kill_process_tree(process.pid) - def test_baseline_accuracy(self): - metrics = self.run_gsm8k_test(pp_size=1) - print(f"[Qwen Baseline] {metrics=}") - self.assertGreater(metrics["accuracy"], 0.39) - def test_pp_consistency(self): baseline = self.run_gsm8k_test(pp_size=1) pp_metrics = self.run_gsm8k_test(pp_size=2) print(f"[Qwen PP Comparison] Baseline: {baseline} | PP: {pp_metrics}") + self.assertGreaterEqual(baseline["accuracy"], 0.38) self.assertGreaterEqual( pp_metrics["accuracy"], - baseline["accuracy"] - 0.01, + baseline["accuracy"] - 0.02, msg=( f"PP accuracy dropped more than 1% compared to baseline. 
" f"Baseline: {baseline['accuracy']:.2%}, PP: {pp_metrics['accuracy']:.2%}" @@ -211,20 +205,16 @@ class TestQwenMoePPAccuracy(unittest.TestCase): finally: kill_process_tree(process.pid) - def test_baseline_accuracy(self): - metrics = self.run_gsm8k_test(pp_size=1) - print(f"[Qwen Baseline] {metrics=}") - self.assertGreater(metrics["accuracy"], 0.74) - def test_pp_consistency(self): baseline = self.run_gsm8k_test(pp_size=1) pp_metrics = self.run_gsm8k_test(pp_size=2) print(f"[Qwen PP Comparison] Baseline: {baseline} | PP: {pp_metrics}") + self.assertGreaterEqual(baseline["accuracy"], 0.74) self.assertGreaterEqual( pp_metrics["accuracy"], - baseline["accuracy"] - 0.01, + baseline["accuracy"] - 0.02, msg=( f"PP accuracy dropped more than 1% compared to baseline. " f"Baseline: {baseline['accuracy']:.2%}, PP: {pp_metrics['accuracy']:.2%}" diff --git a/test/srt/test_radix_attention.py b/test/srt/test_radix_attention.py index 554801dc0..66f948621 100644 --- a/test/srt/test_radix_attention.py +++ b/test/srt/test_radix_attention.py @@ -9,6 +9,7 @@ from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, + is_in_ci, kill_process_tree, popen_launch_server, ) @@ -88,6 +89,7 @@ class TestRadixCacheFCFS(CustomTestCase): run_test(self.base_url, nodes) +@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.") class TestRadixCacheLPM(TestRadixCacheFCFS): @classmethod def setUpClass(cls): diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index 904e49f9d..36815718a 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -11,6 +11,7 @@ from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, + is_in_amd_ci, popen_launch_server, ) @@ -68,7 +69,11 @@ class TestTorchCompile(CustomTestCase): print(f"{res=}") throughput = max_tokens / (tok - tic) print(f"Throughput: {throughput} tokens/s") - 
self.assertGreaterEqual(throughput, 152) + + if is_in_amd_ci(): + self.assertGreaterEqual(throughput, 145) + else: + self.assertGreaterEqual(throughput, 152) if __name__ == "__main__": diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py index 4fef381fd..a4a2e770d 100644 --- a/test/srt/test_vision_openai_server_a.py +++ b/test/srt/test_vision_openai_server_a.py @@ -4,6 +4,8 @@ python3 -m unittest test_vision_openai_server.TestOpenAIVisionServer.test_mixed_ python3 -m unittest test_vision_openai_server.TestOpenAIVisionServer.test_multi_images_chat_completion """ +import unittest + from test_vision_openai_server_common import * from sglang.srt.utils import kill_process_tree diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py index 404a4844b..6043dd107 100644 --- a/test/srt/test_vision_openai_server_b.py +++ b/test/srt/test_vision_openai_server_b.py @@ -1,10 +1,10 @@ +import unittest + from test_vision_openai_server_common import * -from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, - CustomTestCase, popen_launch_server, ) @@ -75,28 +75,6 @@ class TestDeepseekVL2Server(TestOpenAIVisionServer): pass -class TestDeepseekVL2TinyServer(TestOpenAIVisionServer): - @classmethod - def setUpClass(cls): - cls.model = "deepseek-ai/deepseek-vl2-tiny" - cls.base_url = DEFAULT_URL_FOR_TEST - cls.api_key = "sk-123456" - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--trust-remote-code", - "--context-length", - "4096", - ], - ) - cls.base_url += "/v1" - - def test_video_chat_completion(self): - pass - - class TestJanusProServer(TestOpenAIVisionServer): @classmethod def setUpClass(cls): diff --git a/test/srt/test_vision_openai_server_common.py b/test/srt/test_vision_openai_server_common.py index eda29f056..637345e2d 100644 --- 
a/test/srt/test_vision_openai_server_common.py +++ b/test/srt/test_vision_openai_server_common.py @@ -2,7 +2,6 @@ import base64 import io import json import os -import unittest from concurrent.futures import ThreadPoolExecutor import numpy as np