diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml
index 7f555110d..11c94775c 100644
--- a/.github/workflows/e2e-test.yml
+++ b/.github/workflows/e2e-test.yml
@@ -38,6 +38,11 @@ jobs:
           cd test/srt
           python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
 
+      - name: Benchmark Serving Latency
+        timeout-minutes: 10
+        run: |
+          python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8
+
       - name: Benchmark Serving Throughput (w/o RadixAttention)
         timeout-minutes: 10
         run: |
diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py
index dea910f57..6a918fbd1 100644
--- a/python/sglang/bench_latency.py
+++ b/python/sglang/bench_latency.py
@@ -200,16 +200,14 @@ def extend(reqs, model_runner):
         tree_cache=None,
     )
     batch.prepare_for_extend(model_runner.model_config.vocab_size)
-    output = model_runner.forward(batch, ForwardMode.EXTEND)
-    next_token_ids = batch.sample(output.next_token_logits)
-    return next_token_ids, output.next_token_logits, batch
+    sample_output, logits_output = model_runner.forward(batch, ForwardMode.EXTEND)
+    return sample_output.batch_next_token_ids, logits_output.next_token_logits, batch
 
 
 def decode(input_token_ids, batch, model_runner):
     batch.prepare_for_decode(input_token_ids.cpu().numpy())
-    output = model_runner.forward(batch, ForwardMode.DECODE)
-    next_token_ids = batch.sample(output.next_token_logits)
-    return next_token_ids, output.next_token_logits
+    sample_output, logits_output = model_runner.forward(batch, ForwardMode.DECODE)
+    return sample_output.batch_next_token_ids, logits_output.next_token_logits
 
 
 @torch.inference_mode()
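
For context on the `bench_latency.py` change, here is a minimal sketch of the calling convention this diff introduces, assuming (per the hunks above) that `model_runner.forward` now performs sampling internally and returns a `(sample_output, logits_output)` pair, rather than a single output whose logits the caller had to pass to `batch.sample`:

```python
# Sketch of the forward/sampling contract changed by this diff.
# Assumption: model_runner.forward(batch, mode) returns a pair
#   (sample_output, logits_output)
# with sampling already done inside the runner.

# Old convention (removed by this diff): caller samples from the logits.
#   output = model_runner.forward(batch, ForwardMode.DECODE)
#   next_token_ids = batch.sample(output.next_token_logits)

# New convention: the runner returns sampled ids alongside the logits.
sample_output, logits_output = model_runner.forward(batch, ForwardMode.DECODE)
next_token_ids = sample_output.batch_next_token_ids  # already-sampled token ids
logits = logits_output.next_token_logits             # next-token logits per request
```

One consequence of moving sampling into the runner is that callers such as `extend` and `decode` no longer need to touch `batch.sample` at all, which is why both functions in `bench_latency.py` shrink by a line.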