Report median instead of mean in bench_latency.py (#1269)
This commit is contained in:
@@ -233,7 +233,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
### Supported Models
|
### Supported Models
|
||||||
|
|
||||||
**Generative Models**
|
**Generative Models**
|
||||||
- Exaone 3.0
|
|
||||||
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
||||||
- Mistral / Mixtral / Mistral NeMo
|
- Mistral / Mixtral / Mistral NeMo
|
||||||
- Gemma / Gemma 2
|
- Gemma / Gemma 2
|
||||||
@@ -253,6 +252,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
- Grok
|
- Grok
|
||||||
- ChatGLM
|
- ChatGLM
|
||||||
- InternLM 2
|
- InternLM 2
|
||||||
|
- Exaone 3
|
||||||
|
|
||||||
**Embedding Models**
|
**Embedding Models**
|
||||||
|
|
||||||
|
|||||||
@@ -292,6 +292,7 @@ def latency_test_run_once(
|
|||||||
measurement_results["prefill_throughput"] = throughput
|
measurement_results["prefill_throughput"] = throughput
|
||||||
|
|
||||||
# Decode
|
# Decode
|
||||||
|
decode_latencies = []
|
||||||
for i in range(output_len):
|
for i in range(output_len):
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
tic = time.time()
|
tic = time.time()
|
||||||
@@ -300,17 +301,18 @@ def latency_test_run_once(
|
|||||||
latency = time.time() - tic
|
latency = time.time() - tic
|
||||||
tot_latency += latency
|
tot_latency += latency
|
||||||
throughput = batch_size / latency
|
throughput = batch_size / latency
|
||||||
|
decode_latencies.append(latency)
|
||||||
if i < 5:
|
if i < 5:
|
||||||
rank_print(
|
rank_print(
|
||||||
f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
|
f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
|
||||||
)
|
)
|
||||||
avg_decode_latency = (tot_latency - prefill_latency) / output_len
|
med_decode_latency = np.median(decode_latencies)
|
||||||
avg_decode_throughput = batch_size / avg_decode_latency
|
med_decode_throughput = batch_size / med_decode_latency
|
||||||
rank_print(
|
rank_print(
|
||||||
f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
|
f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
|
||||||
)
|
)
|
||||||
measurement_results["avg_decode_latency"] = avg_decode_latency
|
measurement_results["median_decode_latency"] = med_decode_latency
|
||||||
measurement_results["avg_decode_throughput"] = avg_decode_throughput
|
measurement_results["median_decode_throughput"] = med_decode_throughput
|
||||||
|
|
||||||
throughput = (input_len + output_len) * batch_size / tot_latency
|
throughput = (input_len + output_len) * batch_size / tot_latency
|
||||||
rank_print(
|
rank_print(
|
||||||
|
|||||||
@@ -50,8 +50,6 @@ for name, cls in _CONFIG_REGISTRY.items():
|
|||||||
with contextlib.suppress(ValueError):
|
with contextlib.suppress(ValueError):
|
||||||
AutoConfig.register(name, cls)
|
AutoConfig.register(name, cls)
|
||||||
|
|
||||||
from sglang.srt.utils import is_multimodal_model
|
|
||||||
|
|
||||||
|
|
||||||
def download_from_hf(model_path: str):
|
def download_from_hf(model_path: str):
|
||||||
if os.path.exists(model_path):
|
if os.path.exists(model_path):
|
||||||
@@ -60,12 +58,6 @@ def download_from_hf(model_path: str):
|
|||||||
return snapshot_download(model_path, allow_patterns=["*.json", "*.bin", "*.model"])
|
return snapshot_download(model_path, allow_patterns=["*.json", "*.bin", "*.model"])
|
||||||
|
|
||||||
|
|
||||||
def get_config_json(model_path: str):
|
|
||||||
with open(os.path.join(model_path, "configs.json")) as f:
|
|
||||||
config = json.load(f)
|
|
||||||
return config
|
|
||||||
|
|
||||||
|
|
||||||
def get_config(
|
def get_config(
|
||||||
model: str,
|
model: str,
|
||||||
trust_remote_code: bool,
|
trust_remote_code: bool,
|
||||||
|
|||||||
Reference in New Issue
Block a user