[Bugfix]Fix the performance gap between 0.9.2rc1 and 0.9.1 (#1811)

### What this PR does / why we need it?

maybe fixes
[#1728](https://github.com/vllm-project/vllm-ascend/issues/1728#issuecomment-3065083433)

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Test Qwen3-32B tp=4 with: 

```bash
vllm serve --port 1234 Qwen/Qwen3-32B \
    --served-model-name Qwen3-32B \
    --tensor-parallel-size 4 \
    --swap-space 16 \
    --max-model-len 6000 \
    --load-format dummy \
    --disable-log-stats \
    --disable-log-requests \
```

Request batch_size=128 input/output token=1024

**In 0.9.2rc1**

```text
=====================================================
Total TPS with    prefill(tokens/s)         : 785.1395
Total TPS without prefill                   : 846.6809
Mean TPS with    prefill                    : 6.1339
Mean TPS without prefill                    : 6.6147
=====================================================
Mean TTFT(ms)                               : 10307.8123
Max  TTFT(ms)                               : 21423.0733
Min  TTFT(ms)                               : 362.3602
=====================================================
Mean TPOT(ms)                               : 151.3051
Max  TPOT(ms)                               : 159.4649
Min  TPOT(ms)                               : 140.899
=====================================================
Total Time(s)                               : 175.6032
Request Throughput(requests/s)              : 0.7289
=====================================================
```

**Apply this PR**

```text
=====================================================
Total TPS with    prefill(tokens/s)         : 811.0014
Total TPS without prefill                   : 876.4423
Mean TPS with    prefill                    : 6.3359
Mean TPS without prefill                    : 6.8472
=====================================================
Mean TTFT(ms)                               : 10263.8382
Max  TTFT(ms)                               : 21151.2547
Min  TTFT(ms)                               : 375.9136
=====================================================
Mean TPOT(ms)                               : 146.1686
Max  TPOT(ms)                               : 154.0957
Min  TPOT(ms)                               : 136.8879
=====================================================
Total Time(s)                               : 169.8579
Request Throughput(requests/s)              : 0.7536
=====================================================
```

The TPOT performance gap between these two sets of data is about 3%.

- vLLM version: v0.9.2
- vLLM main:
8dfb45ca33

Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
This commit is contained in:
lianyibo
2025-07-18 23:09:54 +08:00
committed by GitHub
parent 574fe407eb
commit 53d2ea3789
2 changed files with 20 additions and 8 deletions

View File

@@ -239,17 +239,27 @@ class TestUtils(TestBase):
def test_vllm_version_is(self):
with mock.patch.dict(os.environ, {"VLLM_VERSION": "1.0.0"}):
with mock.patch("vllm.__version__", "1.0.0"):
self.assertTrue(utils.vllm_version_is("1.0.0"))
self.assertFalse(utils.vllm_version_is("2.0.0"))
self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
with mock.patch("vllm.__version__", "2.0.0"):
self.assertTrue(utils.vllm_version_is("1.0.0"))
self.assertFalse(utils.vllm_version_is("2.0.0"))
self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
with mock.patch("vllm.__version__", "1.0.0"):
self.assertTrue(utils.vllm_version_is("1.0.0"))
self.assertFalse(utils.vllm_version_is("2.0.0"))
self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
with mock.patch("vllm.__version__", "2.0.0"):
self.assertTrue(utils.vllm_version_is("2.0.0"))
self.assertFalse(utils.vllm_version_is("1.0.0"))
self.assertTrue(utils.vllm_version_is.__wrapped__("2.0.0"))
self.assertFalse(utils.vllm_version_is.__wrapped__("1.0.0"))
# Test caching takes effect
utils.vllm_version_is.cache_clear()
utils.vllm_version_is("1.0.0")
misses = utils.vllm_version_is.cache_info().misses
hits = utils.vllm_version_is.cache_info().hits
self.assertEqual(misses, 1)
self.assertEqual(hits, 0)
utils.vllm_version_is("1.0.0")
hits = utils.vllm_version_is.cache_info().hits
self.assertEqual(hits, 1)
def test_update_aclgraph_sizes(self):
# max_num_batch_sizes < len(original_sizes)

View File

@@ -19,6 +19,7 @@
import atexit
import fcntl
import functools
import math
import os
import shutil
@@ -280,6 +281,7 @@ def adapt_patch(is_global_patch: bool = False):
from vllm_ascend.patch import worker # noqa: F401
@functools.cache
def vllm_version_is(target_vllm_version: str):
if envs.VLLM_VERSION is not None:
vllm_version = envs.VLLM_VERSION