diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index d12ed8c57..fb6202932 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -18,12 +18,10 @@ import copy import dataclasses import logging import os -import pickle import signal import sys import time import uuid -from datetime import datetime from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union import fastapi @@ -107,7 +105,6 @@ class TokenizerManager: # Parse args self.server_args = server_args self.enable_metrics = server_args.enable_metrics - self.dump_requsts_folder = server_args.dump_requests_folder # Init inter-process communication context = zmq.asyncio.Context(2) @@ -166,7 +163,6 @@ class TokenizerManager: # Store states self.to_create_loop = True self.rid_to_state: Dict[str, ReqState] = {} - self.dump_request_list: List[Tuple] = [] # The event to notify the weight sync is finished. self.model_update_lock = RWLock() @@ -684,9 +680,6 @@ class TokenizerManager: if self.enable_metrics: self.collect_metrics(state, recv_obj, i) - if self.dump_requsts_folder and state.finished: - self.dump_requests(state, out_dict) - elif isinstance(recv_obj, OpenSessionReqOutput): self.session_futures[recv_obj.session_id].set_result( recv_obj.session_id if recv_obj.success else None @@ -825,27 +818,6 @@ class TokenizerManager: (time.time() - state.created_time) / completion_tokens ) - def dump_requests(self, state: ReqState, out_dict: dict): - self.dump_request_list.append( - (state.obj, out_dict, state.created_time, time.time()) - ) - - if len(self.dump_request_list) > int( - os.environ.get("SGLANG_DUMP_REQUESTS_THRESHOLD", "1000") - ): - to_dump = self.dump_request_list - self.dump_request_list = [] - - def background_task(): - os.makedirs(self.dump_requsts_folder, exist_ok=True) - current_time = datetime.now() - filename = current_time.strftime("%Y-%m-%d_%H-%M-%S") + ".pkl" - with open(os.path.join(self.dump_requsts_folder, filename), "wb") as f: - pickle.dump(to_dump, f) - - # Schedule the task to run in the background without awaiting it - asyncio.create_task(asyncio.to_thread(background_task)) - class SignalHandler: def __init__(self, tokenizer_manager): diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index e5c423a35..be85a3670 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -23,6 +23,7 @@ from typing import List, Optional import torch from sglang.srt.hf_transformers_utils import check_gguf_file +from sglang.srt.speculative.spec_info import SpeculativeAlgorithm from sglang.srt.utils import ( get_amdgpu_memory_capacity, get_hpu_memory_capacity, @@ -88,7 +89,6 @@ class ServerArgs: show_time_cost: bool = False enable_metrics: bool = False decode_log_interval: int = 40 - dump_requests_folder: str = "" # API related api_key: Optional[str] = None @@ -554,13 +554,7 @@ class ServerArgs: "--decode-log-interval", type=int, default=ServerArgs.decode_log_interval, - help="The log interval of decode batch.", - ) - parser.add_argument( - "--dump-requests-folder", - type=str, - default=ServerArgs.decode_log_interval, - help="Dump raw requests to a folder for replay.", + help="The log interval of decode batch", ) # API related