diff --git a/README.md b/README.md index 9c6f1bd7c..e7a48ec24 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1 ``` - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md). +- To enable fp8 quantization, you can add `--quantization fp8` on an fp16 checkpoint or directly load an fp8 checkpoint without specifying any arguments. ### Supported Models diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index b1abcd35e..7cf19f249 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -143,7 +143,7 @@ async def async_request_openai_completions( "temperature": 0.0, "best_of": 1, "max_tokens": request_func_input.output_len, - "stream": True, + "stream": not args.disable_stream, "ignore_eos": True, } headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} @@ -166,8 +166,9 @@ async def async_request_openai_completions( continue chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") + latency = time.perf_counter() - st if chunk == "[DONE]": - latency = time.perf_counter() - st + pass else: data = json.loads(chunk) @@ -897,6 +898,11 @@ if __name__ == "__main__": help="Range of request rates in the format start,stop,step. 
Default is 2,34,2", ) parser.add_argument("--output-file", type=str, help="Output JSONL file name.") + parser.add_argument( + "--disable-stream", + action="store_true", + help="Disable streaming mode.", + ) set_ulimit() diff --git a/python/sglang/srt/managers/controller/schedule_heuristic.py b/python/sglang/srt/managers/controller/schedule_heuristic.py index aae6cfb86..46a5bf239 100644 --- a/python/sglang/srt/managers/controller/schedule_heuristic.py +++ b/python/sglang/srt/managers/controller/schedule_heuristic.py @@ -28,11 +28,16 @@ class ScheduleHeuristic: # longest prefix match forward_queue.sort(key=lambda x: -len(x.prefix_indices)) return forward_queue + elif self.schedule_heuristic == "fcfs": + # first come first serve + return forward_queue + elif self.schedule_heuristic == "lof": + # longest output first + forward_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens) + return forward_queue elif self.schedule_heuristic == "random": random.shuffle(forward_queue) return forward_queue - elif self.schedule_heuristic == "fcfs": - return forward_queue elif self.schedule_heuristic == "dfs-weight": last_node_to_reqs = defaultdict(list) for req in forward_queue: