From 79794af52d90abfb00e73871109f0cdc4e0b7f34 Mon Sep 17 00:00:00 2001
From: Yineng Zhang <me@zhyncs.com>
Date: Fri, 6 Sep 2024 00:00:06 +1000
Subject: [PATCH] docs: highlight ttft itl and throughput (#1337)

---
 benchmark/benchmark_vllm_060/README.md | 28 +++++++++++++++-----------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/benchmark/benchmark_vllm_060/README.md b/benchmark/benchmark_vllm_060/README.md
index acb55f897..157bd9df7 100644
--- a/benchmark/benchmark_vllm_060/README.md
+++ b/benchmark/benchmark_vllm_060/README.md
@@ -12,6 +12,10 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 pip install vllm==0.6.0
 ```
 
+## Notes
+
+We followed the reproduction method described in https://github.com/vllm-project/vllm/issues/8176 and added the `--num-scheduler-steps 10` parameter when starting the vLLM server. vLLM's `gpu_memory_utilization` defaults to 0.9 at both TP 1 and TP 4, whereas SGLang's `mem_frac` is 0.88 at TP 1 and 0.85 at TP 4, so we manually set SGLang's `mem_frac` to 0.88 at TP 4 to keep the memory budgets comparable.
+
 ## Online benchmarks
 
 ```bash
@@ -52,19 +56,19 @@ python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-pro
 
 | RPS  | Num prompts | Engine | Median E2E Latency | Median TTFT | Median TPOT | Median ITL |
 |------|-------------|--------|--------------------|-------------|-------------|------------|
-| 4    | 1200        | SGLang | 1564.17            | 31.98       | 13.17       | 11.93      |
-| 4    | 1200        | vLLM   | 1691.97            | 100.48      | 14.14       | 129.32     |
-| 8    | 2400        | SGLang | 2175.02            | 35.68       | 17.85       | 14.41      |
-| 8    | 2400        | vLLM   | 2137.16            | 120.39      | 17.09       | 158.63     |
+| 4    | 1200        | SGLang | 1564.17            | **31.98**   | 13.17       | **11.93**  |
+| 4    | 1200        | vLLM   | 1691.97            | **100.48**  | 14.14       | **129.32** |
+| 8    | 2400        | SGLang | 2175.02            | **35.68**   | 17.85       | **14.41**  |
+| 8    | 2400        | vLLM   | 2137.16            | **120.39**  | 17.09       | **158.63** |
 
 ### Llama 3.1 70B Insruct 4 x H100 80G
 
 | RPS  | Num Prompts | Engine | Median E2E Latency | Median TTFT | Median TPOT | Median ITL |
 |------|-------------|--------|--------------------|-------------|-------------|------------|
-| 4    | 1200        | SGLang | 3005.24            | 53.94       | 25.03       | 21.67      |
-| 4    | 1200        | vLLM   | 2915.60            | 179.15      | 23.58       | 231.23     |
-| 8    | 2400        | SGLang | 4064.98            | 58.11       | 33.07       | 24.45      |
-| 8    | 2400        | vLLM   | 3752.38            | 207.12      | 29.15       | 275.32     |
+| 4    | 1200        | SGLang | 3005.24            | **53.94**   | 25.03       | **21.67**  |
+| 4    | 1200        | vLLM   | 2915.60            | **179.15**  | 23.58       | **231.23** |
+| 8    | 2400        | SGLang | 4064.98            | **58.11**   | 33.07       | **24.45**  |
+| 8    | 2400        | vLLM   | 3752.38            | **207.12**  | 29.15       | **275.32** |
 
 ## Offline benchmark results
 
@@ -72,12 +76,12 @@ python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-pro
 
 | RPS  | Num Prompts | Engine | Request throughput | Output token throughput |
 |------|-------------|--------|--------------------|-------------------------|
-| inf  | 5000        | SGLang | 22.03              | 4281.51                 |
-| inf  | 5000        | vLLM   | 21.27              | 4132.37                 |
+| inf  | 5000        | SGLang | 22.03              | **4281.51**             |
+| inf  | 5000        | vLLM   | 21.27              | **4132.37**             |
 
 ### Llama 3.1 70B Insruct 4 x H100 80G
 
 | RPS  | Num Prompts | Engine | Request throughput | Output token throughput |
 |------|-------------|--------|--------------------|-------------------------|
-| inf  | 5000        | SGLang | 19.84              | 3856.01                 |
-| inf  | 5000        | vLLM   | 19.04              | 3700.64                 |
+| inf  | 5000        | SGLang | 19.84              | **3856.01**             |
+| inf  | 5000        | vLLM   | 19.04              | **3700.64**             |