[Bugfix] Fix the performance gap between 0.9.2rc1 and 0.9.1 (#1811)
### What this PR does / why we need it?
May fix
[#1728](https://github.com/vllm-project/vllm-ascend/issues/1728#issuecomment-3065083433)
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Test Qwen3-32B tp=4 with:
```bash
vllm serve --port 1234 Qwen/Qwen3-32B \
--served-model-name Qwen3-32B \
--tensor-parallel-size 4 \
--swap-space 16 \
--max-model-len 6000 \
--load-format dummy \
--disable-log-stats \
--disable-log-requests
```
Request batch_size=128 input/output token=1024
**In 0.9.2rc1**
```text
=====================================================
Total TPS with prefill(tokens/s) : 785.1395
Total TPS without prefill : 846.6809
Mean TPS with prefill : 6.1339
Mean TPS without prefill : 6.6147
=====================================================
Mean TTFT(ms) : 10307.8123
Max TTFT(ms) : 21423.0733
Min TTFT(ms) : 362.3602
=====================================================
Mean TPOT(ms) : 151.3051
Max TPOT(ms) : 159.4649
Min TPOT(ms) : 140.899
=====================================================
Total Time(s) : 175.6032
Request Throughput(requests/s) : 0.7289
=====================================================
```
**Apply this PR**
```text
=====================================================
Total TPS with prefill(tokens/s) : 811.0014
Total TPS without prefill : 876.4423
Mean TPS with prefill : 6.3359
Mean TPS without prefill : 6.8472
=====================================================
Mean TTFT(ms) : 10263.8382
Max TTFT(ms) : 21151.2547
Min TTFT(ms) : 375.9136
=====================================================
Mean TPOT(ms) : 146.1686
Max TPOT(ms) : 154.0957
Min TPOT(ms) : 136.8879
=====================================================
Total Time(s) : 169.8579
Request Throughput(requests/s) : 0.7536
=====================================================
```
The TPOT performance gap between these two sets of data is about 3%.
- vLLM version: v0.9.2
- vLLM main:
8dfb45ca33
Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
This commit is contained in:
@@ -239,17 +239,27 @@ class TestUtils(TestBase):
|
||||
def test_vllm_version_is(self):
    """Verify vllm_version_is honors VLLM_VERSION over vllm.__version__
    and that its functools.cache memoization works.

    vllm_version_is is wrapped in @functools.cache, so a result computed
    under one mocked context would otherwise leak into the next and make
    later assertions pass (or fail) for the wrong reason. Clear the cache
    before every group of assertions so each cached call really re-reads
    the patched environment/version.

    NOTE(review): the nesting below (env-var patch enclosing the first two
    __version__ patches) was reconstructed from a whitespace-mangled diff —
    confirm against the repository copy.
    """
    # Env var set: it takes precedence regardless of vllm.__version__.
    with mock.patch.dict(os.environ, {"VLLM_VERSION": "1.0.0"}):
        utils.vllm_version_is.cache_clear()
        with mock.patch("vllm.__version__", "1.0.0"):
            self.assertTrue(utils.vllm_version_is("1.0.0"))
            self.assertFalse(utils.vllm_version_is("2.0.0"))
            # __wrapped__ bypasses the cache and must agree.
            self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
            self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
        utils.vllm_version_is.cache_clear()
        with mock.patch("vllm.__version__", "2.0.0"):
            self.assertTrue(utils.vllm_version_is("1.0.0"))
            self.assertFalse(utils.vllm_version_is("2.0.0"))
            self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
            self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))

    # Env var absent: vllm.__version__ decides.
    utils.vllm_version_is.cache_clear()
    with mock.patch("vllm.__version__", "1.0.0"):
        self.assertTrue(utils.vllm_version_is("1.0.0"))
        self.assertFalse(utils.vllm_version_is("2.0.0"))
        self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
        self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
    utils.vllm_version_is.cache_clear()
    with mock.patch("vllm.__version__", "2.0.0"):
        self.assertTrue(utils.vllm_version_is("2.0.0"))
        self.assertFalse(utils.vllm_version_is("1.0.0"))
        self.assertTrue(utils.vllm_version_is.__wrapped__("2.0.0"))
        self.assertFalse(utils.vllm_version_is.__wrapped__("1.0.0"))

    # Test caching takes effect
    utils.vllm_version_is.cache_clear()
    utils.vllm_version_is("1.0.0")
    misses = utils.vllm_version_is.cache_info().misses
    hits = utils.vllm_version_is.cache_info().hits
    self.assertEqual(misses, 1)
    self.assertEqual(hits, 0)
    utils.vllm_version_is("1.0.0")
    hits = utils.vllm_version_is.cache_info().hits
    self.assertEqual(hits, 1)
|
||||
|
||||
def test_update_aclgraph_sizes(self):
|
||||
# max_num_batch_sizes < len(original_sizes)
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
|
||||
import atexit
|
||||
import fcntl
|
||||
import functools
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
@@ -280,6 +281,7 @@ def adapt_patch(is_global_patch: bool = False):
|
||||
from vllm_ascend.patch import worker # noqa: F401
|
||||
|
||||
|
||||
@functools.cache
|
||||
def vllm_version_is(target_vllm_version: str):
|
||||
if envs.VLLM_VERSION is not None:
|
||||
vllm_version = envs.VLLM_VERSION
|
||||
|
||||
Reference in New Issue
Block a user