support non-streaming benchmark (#682)
@@ -154,6 +154,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
 ### Supported Models
 
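Note (not part of the diff): as an illustrative use of the fp8 line added above, launching with on-the-fly fp8 quantization of a fp16 checkpoint would look like `python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --quantization fp8`.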
@@ -143,7 +143,7 @@ async def async_request_openai_completions(
             "temperature": 0.0,
             "best_of": 1,
             "max_tokens": request_func_input.output_len,
-            "stream": True,
+            "stream": not args.disable_stream,
             "ignore_eos": True,
         }
         headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
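For context (not part of the diff): a minimal, self-contained sketch of how the changed `stream` field behaves once `--disable-stream` is wired into `argparse`. The payload keys mirror the hunk above; the model name, prompt, and token count are placeholder values standing in for `request_func_input`.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--disable-stream", action="store_true")
args = parser.parse_args(["--disable-stream"])  # simulate passing the flag

# Placeholder values stand in for request_func_input; the payload keys
# match the ones in the hunk above.
payload = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "prompt": "Say hello.",
    "temperature": 0.0,
    "best_of": 1,
    "max_tokens": 64,
    "stream": not args.disable_stream,  # False -> one non-streamed response
    "ignore_eos": True,
}
print(payload["stream"])  # False
```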
@@ -166,8 +166,9 @@ async def async_request_openai_completions(
                         continue
 
                         chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
                         if chunk == "[DONE]":
-                            latency = time.perf_counter() - st
+                            pass
                         else:
                             data = json.loads(chunk)
 
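The hunk above moves the latency stamp out of the `[DONE]` branch so it is taken on every chunk; with streaming disabled the server returns a single body, and the last stamp still yields a valid end-to-end latency. A toy sketch of that loop (assuming a fake list of SSE lines in place of the real aiohttp response stream):

```python
import json
import time

def remove_prefix(text: str, prefix: str) -> str:
    # Same behavior as the helper used in the benchmark script.
    return text[len(prefix):] if text.startswith(prefix) else text

st = time.perf_counter()
fake_stream = [b'data: {"choices": [{"text": "hi"}]}', b"data: [DONE]"]

latency = None
for chunk_bytes in fake_stream:
    chunk_bytes = chunk_bytes.strip()
    if not chunk_bytes:
        continue

    chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
    # Stamped on every chunk, so it is set even when only one chunk arrives.
    latency = time.perf_counter() - st
    if chunk == "[DONE]":
        pass
    else:
        data = json.loads(chunk)
        generated = data["choices"][0]["text"]

print(f"end-to-end latency: {latency:.6f}s")
```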
@@ -897,6 +898,11 @@ if __name__ == "__main__":
         help="Range of request rates in the format start,stop,step. Default is 2,34,2",
     )
     parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument(
+        "--disable-stream",
+        action="store_true",
+        help="Disable streaming mode.",
+    )
 
     set_ulimit()
 
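A quick check of the new flag in isolation (a sketch; the real parser defines many more arguments around these two):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
parser.add_argument(
    "--disable-stream",
    action="store_true",
    help="Disable streaming mode.",
)

print(parser.parse_args([]).disable_stream)                    # False: stream (default)
print(parser.parse_args(["--disable-stream"]).disable_stream)  # True: non-streaming benchmark
```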
@@ -28,11 +28,16 @@ class ScheduleHeuristic:
             # longest prefix match
             forward_queue.sort(key=lambda x: -len(x.prefix_indices))
             return forward_queue
+        elif self.schedule_heuristic == "fcfs":
+            # first come first serve
+            return forward_queue
+        elif self.schedule_heuristic == "lof":
+            # longest output first
+            forward_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
+            return forward_queue
         elif self.schedule_heuristic == "random":
             random.shuffle(forward_queue)
             return forward_queue
-        elif self.schedule_heuristic == "fcfs":
-            return forward_queue
         elif self.schedule_heuristic == "dfs-weight":
             last_node_to_reqs = defaultdict(list)
             for req in forward_queue:
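A self-contained sketch of the dispatch this hunk extends, with stand-in dataclasses for the router's request objects (the names below are illustrative, not sglang's actual types):

```python
import random
from dataclasses import dataclass, field

@dataclass
class SamplingParams:
    max_new_tokens: int

@dataclass
class Req:
    prefix_indices: list = field(default_factory=list)
    sampling_params: SamplingParams = None

def schedule(forward_queue, heuristic):
    if heuristic == "lpm":
        # longest prefix match
        forward_queue.sort(key=lambda x: -len(x.prefix_indices))
        return forward_queue
    elif heuristic == "fcfs":
        # first come first serve
        return forward_queue
    elif heuristic == "lof":
        # longest output first
        forward_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
        return forward_queue
    elif heuristic == "random":
        random.shuffle(forward_queue)
        return forward_queue
    raise ValueError(f"unknown heuristic: {heuristic}")

reqs = [Req(sampling_params=SamplingParams(m)) for m in (8, 256, 32)]
out = schedule(reqs, "lof")
print([r.sampling_params.max_new_tokens for r in out])  # [256, 32, 8]
```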