diff --git a/README.md b/README.md
index bfd01e208..2991b4063 100644
--- a/README.md
+++ b/README.md
@@ -233,7 +233,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ### Supported Models
 
 **Generative Models**
-- Exaone 3.0
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
@@ -253,6 +252,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Exaone 3
 
 **Embedding Models**
 
diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py
index 3a4874085..966a97d20 100644
--- a/python/sglang/bench_latency.py
+++ b/python/sglang/bench_latency.py
@@ -292,6 +292,7 @@ def latency_test_run_once(
     measurement_results["prefill_throughput"] = throughput
 
     # Decode
+    decode_latencies = []
     for i in range(output_len):
         torch.cuda.synchronize()
         tic = time.time()
@@ -300,17 +301,18 @@ def latency_test_run_once(
         latency = time.time() - tic
         tot_latency += latency
         throughput = batch_size / latency
+        decode_latencies.append(latency)
         if i < 5:
             rank_print(
                 f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
-    avg_decode_latency = (tot_latency - prefill_latency) / output_len
-    avg_decode_throughput = batch_size / avg_decode_latency
+    med_decode_latency = np.median(decode_latencies)
+    med_decode_throughput = batch_size / med_decode_latency
     rank_print(
-        f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+        f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
     )
-    measurement_results["avg_decode_latency"] = avg_decode_latency
-    measurement_results["avg_decode_throughput"] = avg_decode_throughput
+    measurement_results["median_decode_latency"] = med_decode_latency
+    measurement_results["median_decode_throughput"] = med_decode_throughput
 
     throughput = (input_len + output_len) * batch_size / tot_latency
     rank_print(
diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py
index 7fce3b240..b22c61020 100644
--- a/python/sglang/srt/hf_transformers_utils.py
+++ b/python/sglang/srt/hf_transformers_utils.py
@@ -50,8 +50,6 @@ for name, cls in _CONFIG_REGISTRY.items():
     with contextlib.suppress(ValueError):
         AutoConfig.register(name, cls)
 
-from sglang.srt.utils import is_multimodal_model
-
 def download_from_hf(model_path: str):
     if os.path.exists(model_path):
         return model_path
@@ -60,12 +58,6 @@ def download_from_hf(model_path: str):
     return snapshot_download(model_path, allow_patterns=["*.json", "*.bin", "*.model"])
 
 
-def get_config_json(model_path: str):
-    with open(os.path.join(model_path, "configs.json")) as f:
-        config = json.load(f)
-    return config
-
-
 def get_config(
     model: str,
     trust_remote_code: bool,