diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index 3eca72de4..1a909caa8 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -163,7 +163,6 @@ async def async_request_openai_completions(
             "max_tokens": request_func_input.output_len,
             "stream": not args.disable_stream,
             "ignore_eos": not args.disable_ignore_eos,
-            "lora_path": request_func_input.lora_name,
             **request_func_input.extra_request_body,
         }
         headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 657e0c2ca..24a285952 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -16,6 +16,7 @@
 import gc
 import json
 import logging
+import time
 from typing import Optional
 
 import torch
@@ -129,7 +130,7 @@ class ModelRunner:
         # Global vars
         if server_args.show_time_cost:
             enable_show_time_cost()
-        if server_args.disable_disk_cache:
+        if server_args.disable_outlines_disk_cache:
             from outlines.caching import disable_cache
 
             disable_cache()
@@ -623,8 +624,10 @@ class ModelRunner:
         if self.server_args.disable_cuda_graph:
             return
 
+        tic = time.time()
         logger.info("Capture cuda graph begin. This can take up to several minutes.")
         self.cuda_graph_runner = CudaGraphRunner(self)
+        logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f}s")
 
     def apply_torch_tp(self):
         logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 37ad6cfc5..788686a1e 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -122,7 +122,7 @@ class ServerArgs:
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
-    disable_disk_cache: bool = False
+    disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False
     disable_overlap_schedule: bool = False
@@ -159,7 +159,7 @@ class ServerArgs:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
             elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.82
+                self.mem_fraction_static = 0.81
             elif self.tp_size >= 4:
                 self.mem_fraction_static = 0.85
             elif self.tp_size >= 2:
@@ -192,7 +192,7 @@ class ServerArgs:
             )
 
         if self.attention_backend == "torch_native":
-            logger.info(
+            logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
             )
             self.disable_cuda_graph = True
@@ -204,12 +204,12 @@ class ServerArgs:
             self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
             self.disable_overlap_schedule = True
-            logger.info(
+            logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
                 f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
-                "Overlap schedule is disabled."
+                "Overlap scheduler is disabled."
             )
 
         # GGUF
@@ -642,9 +642,9 @@ class ServerArgs:
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
         parser.add_argument(
-            "--disable-disk-cache",
+            "--disable-outlines-disk-cache",
             action="store_true",
-            help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
+            help="Disable disk cache of outlines to avoid possible crashes related to file system or high concurrency.",
         )
         parser.add_argument(
             "--disable-custom-all-reduce",
@@ -745,6 +745,11 @@ class ServerArgs:
             action=DeprecatedAction,
             help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
         )
+        parser.add_argument(
+            "--disable-disk-cache",
+            action=DeprecatedAction,
+            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
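
Note on the deprecation path: `DeprecatedAction` is defined elsewhere in server_args.py and is not part of this diff. A minimal sketch of how such an argparse action could behave, assuming it simply rejects the old flag with the migration hint from its help text (a hypothetical illustration, not the actual sglang implementation):

import argparse

# Hypothetical sketch of a DeprecatedAction-style argparse action;
# the real class lives elsewhere in server_args.py and may differ.
class DeprecatedAction(argparse.Action):
    def __init__(self, option_strings, dest, nargs=0, **kwargs):
        # nargs=0 makes this behave like a flag that takes no value.
        super().__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # Reject the deprecated flag and surface the migration hint.
        parser.error(self.help)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-disk-cache",
    action=DeprecatedAction,
    help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
)
# parser.parse_args(["--disable-disk-cache"]) now exits with the hint above.

With an action like this, passing the removed `--disable-disk-cache` flag fails fast with a pointer to `--disable-outlines-disk-cache` rather than being silently ignored.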