From 98c00a2df1f7363c86a3a89c163fea803de6898c Mon Sep 17 00:00:00 2001
From: Yueyang Pan <pyyjason@gmail.com>
Date: Mon, 9 Jun 2025 20:33:41 +0800
Subject: [PATCH] Fix torch profiler bugs for bench_offline_throughput.py
 (#6557)

---
 docs/references/benchmark_and_profiling.md    | 11 ++++++++
 docs/references/environment_variables.md      |  1 +
 python/sglang/bench_offline_throughput.py     | 14 ++++++++---
 .../sglang/lang/backend/runtime_endpoint.py   | 25 ++++++++++++++++++-
 .../sglang/srt/managers/tokenizer_manager.py  |  3 +++
 5 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/docs/references/benchmark_and_profiling.md b/docs/references/benchmark_and_profiling.md
index 98c67fd4d..3e96e0cef 100644
--- a/docs/references/benchmark_and_profiling.md
+++ b/docs/references/benchmark_and_profiling.md
@@ -52,6 +52,17 @@
   python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
   ```
 
+- Possible PyTorch Bug
+  If you encounter the following error (for example, when using Qwen 2.5 VL):
+  ```bash
+  RuntimeError: !stack.empty() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/autograd/profiler_python.cpp":983, please report a bug to PyTorch. Python replay stack is empty.
+  ```
+  This is likely a PyTorch bug, reported in [Bug: vLLM Profiler](https://github.com/vllm-project/vllm/issues/18240) and [Bug: torch.profiler.profile](https://github.com/pytorch/pytorch/issues/101632). As a workaround, you can disable `with_stack` by setting an environment variable as follows:
+  ```bash
+  export SGLANG_PROFILE_WITH_STACK=False
+  python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
+  ```
+
 - View Traces
 
   Trace files can be loaded and visualized from:
diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md
index 931fdbadb..2ce931b03 100644
--- a/docs/references/environment_variables.md
+++ b/docs/references/environment_variables.md
@@ -88,6 +88,7 @@ SGLang supports various environment variables that can be used to configure its
 | Environment Variable | Description | Default Value |
 | --- | --- | --- |
 | `SGLANG_TORCH_PROFILER_DIR` | Directory for PyTorch profiler output | `/tmp` |
+| `SGLANG_PROFILE_WITH_STACK` | Whether to enable the PyTorch profiler's `with_stack` option (captures stack traces); set to `false` to disable it | `true` |
 
 ## Storage & Caching
 
diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py
index 3bc80f9c4..1ae893d46 100644
--- a/python/sglang/bench_offline_throughput.py
+++ b/python/sglang/bench_offline_throughput.py
@@ -11,7 +11,9 @@ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1
 """
 
 import argparse
+import asyncio
 import dataclasses
+import inspect
 import json
 import logging
 import os
@@ -235,8 +237,10 @@ def throughput_test_once(
     latency = time.perf_counter() - st
 
     if profile:
+        dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
+        known_files = set(os.listdir(dir))
         backend.stop_profile()
-        monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
+        monitor_trace_file(known_files, dir)
 
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)
@@ -260,6 +264,10 @@ def throughput_test_once(
         measurement_results["total_input_tokens"]
         + measurement_results["total_output_tokens"]
     ) / latency
+
+    if inspect.isawaitable(server_info):
+        server_info = asyncio.run(server_info)
+
     measurement_results["last_gen_throughput"] = server_info["internal_states"][0][
         "last_gen_throughput"
     ]
@@ -267,11 +275,9 @@ def throughput_test_once(
     return measurement_results
 
 
-def monitor_trace_file(directory, interval=1):
+def monitor_trace_file(known_files, directory, interval=1):
     print(f"Monitoring {directory} for new trace files...")
 
-    known_files = set(os.listdir(directory))
-
     while True:
         flag = False
         time.sleep(interval)
diff --git a/python/sglang/lang/backend/runtime_endpoint.py b/python/sglang/lang/backend/runtime_endpoint.py
index afda890a7..349f9934a 100644
--- a/python/sglang/lang/backend/runtime_endpoint.py
+++ b/python/sglang/lang/backend/runtime_endpoint.py
@@ -85,6 +85,22 @@ class RuntimeEndpoint(BaseBackend):
         )
         self._assert_success(res)
 
+    def start_profile(self):
+        res = http_request(
+            self.base_url + "/start_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def stop_profile(self):
+        res = http_request(
+            self.base_url + "/stop_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
     def commit_lazy_operations(self, s: StreamExecutor):
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
         self._add_images(s, data)
@@ -374,7 +390,8 @@ class Runtime:
         self.pid = None
         pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)
 
-        proc = multiprocessing.Process(
+        ctx = multiprocessing.get_context("spawn")
+        proc = ctx.Process(
             target=launch_server,
             args=(self.server_args, pipe_writer),
         )
@@ -406,6 +423,12 @@ class Runtime:
             kill_process_tree(self.pid)
             self.pid = None
 
+    def start_profile(self):
+        self.endpoint.start_profile()
+
+    def stop_profile(self):
+        self.endpoint.stop_profile()
+
     def cache_prefix(self, prefix: str):
         self.endpoint.cache_prefix(prefix)
 
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index 24821007b..5c6033c11 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -116,6 +116,7 @@ from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     dataclass_to_string_truncated,
+    get_bool_env_var,
     get_zmq_socket,
     kill_process_tree,
 )
@@ -805,6 +806,8 @@ class TokenizerManager:
         profile_by_stage: bool = False,
     ):
         self.auto_create_handle_loop()
+        env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
+        with_stack = False if with_stack is False or env_with_stack is False else True
         req = ProfileReq(
             type=ProfileReqType.START_PROFILE,
             output_dir=output_dir,