diff --git a/README.md b/README.md
index 7a1bc8e8b..332aaa0af 100644
--- a/README.md
+++ b/README.md
@@ -9,9 +9,9 @@
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
-The core features of SGLang include:
+The core features include:
 - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
-- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatic KV cache reuse across multiple calls. It also supports other common techniques like continuous batching and tensor parallelism.
+- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
diff --git a/docs/hyperparameter_tuning.md b/docs/hyperparameter_tuning.md
index 0263d557a..dec516bc9 100644
--- a/docs/hyperparameter_tuning.md
+++ b/docs/hyperparameter_tuning.md
@@ -6,7 +6,7 @@
 Achieving a large batch size is the most important thing for attaining high throughput.
 
 When the server is running at full load, look for the following in the log:
-```[gpu_id=0] #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
+```[gpu_id=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
 
 ### Tune Your Request Submission Speed
 `#queue-req` indicates the number of requests in the queue. If you frequently see `#queue-req == 0`, it suggests you are bottlenecked by the request submission speed.
diff --git a/python/sglang/backend/litellm.py b/python/sglang/backend/litellm.py
index 9a0060f33..dc89dc16d 100644
--- a/python/sglang/backend/litellm.py
+++ b/python/sglang/backend/litellm.py
@@ -9,6 +9,7 @@
 try:
     import litellm
 except ImportError as e:
     litellm = e
+litellm.num_retries = 1
 
 class LiteLLM(BaseBackend):
diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py
index 30e209b22..7fb5e1b3b 100644
--- a/python/sglang/srt/managers/controller/tp_worker.py
+++ b/python/sglang/srt/managers/controller/tp_worker.py
@@ -111,7 +111,10 @@ class ModelTpServer:
             f"context_len={self.model_config.context_len}, "
         )
         if self.tp_rank == 0:
-            logger.info(f"server_args: {server_args.print_mode_args()}")
+            logger.info(
+                f"[gpu_id={self.gpu_id}] "
+                f"server_args: {server_args.print_mode_args()}"
+            )
 
         # Init cache
         self.tree_cache = RadixCache(
@@ -226,7 +229,7 @@ class ModelTpServer:
             self.num_generated_tokens = 0
             self.last_stats_tic = time.time()
             logger.info(
-                f"[gpu_id={self.gpu_id}] "
+                f"[gpu_id={self.gpu_id}] Decode batch. "
                 f"#running-req: {len(self.running_batch.reqs)}, "
                 f"#token: {num_used}, "
                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
@@ -397,12 +400,13 @@ class ModelTpServer:
                 self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
             )
             logger.info(
-                f"new fill batch. #seq: {len(can_run_list)}. "
-                f"#cached_token: {hit_tokens}. "
-                f"#new_token: {new_batch_input_tokens}. "
-                f"#remaining_req: {len(self.forward_queue) - len(can_run_list)}. "
-                f"#running_req: {running_req}. "
-                f"tree_cache_hit_rate: {100.0 * tree_cache_hit_rate:.2f}%. "
+                f"[gpu_id={self.gpu_id}] Prefill batch. "
+                f"#new-seq: {len(can_run_list)}, "
+                f"#new-token: {new_batch_input_tokens}, "
+                f"#cached-token: {hit_tokens}, "
+                f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                f"#running-req: {running_req}, "
+                f"#queue-req: {len(self.forward_queue) - len(can_run_list)}"
             )
             # logger.debug(
             #     f"fsm_cache_hit_rate: {100.0 * self.regex_fsm_cache.get_cache_hit_rate():.2f}%. "