From 29ebe3dff475b87f9e252fa9257ab9b64ee4988f Mon Sep 17 00:00:00 2001 From: ws Date: Fri, 15 Nov 2024 19:39:10 +0800 Subject: [PATCH] fix: align enable_overlap_scheduler naming between code and docs (#2038) --- docs/backend/backend.md | 2 +- docs/references/hyperparameter_tuning.md | 2 +- python/sglang/test/test_utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/backend/backend.md b/docs/backend/backend.md index 51c9bc35a..2e6b4287e 100644 --- a/docs/backend/backend.md +++ b/docs/backend/backend.md @@ -79,7 +79,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct ``` python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096 ``` -- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly. +- To enable the experimental overlapped scheduler, add `--enable-overlap-schedule`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently. - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly. - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies. - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments. 
diff --git a/docs/references/hyperparameter_tuning.md b/docs/references/hyperparameter_tuning.md index 499b81bc0..cb5089951 100644 --- a/docs/references/hyperparameter_tuning.md +++ b/docs/references/hyperparameter_tuning.md @@ -31,7 +31,7 @@ If you see out of memory (OOM) errors, you can try to tune the following paramet - You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding. ### Try Advanced Options -- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly. +- To enable the experimental overlapped scheduler, add `--enable-overlap-schedule`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently. - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly. ### Tune `--schedule-policy` diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index ec85a12c3..5d878d6af 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -673,7 +673,7 @@ def run_and_check_memory_leak( if enable_mixed_chunk: other_args += ["--enable-mixed-chunk"] if enable_overlap: - other_args += ["--enable-overlap-scheduler"] + other_args += ["--enable-overlap-schedule"] model = DEFAULT_MODEL_NAME_FOR_TEST port = random.randint(4000, 5000)