From 53d2ea3789ffce32bf3ceb055d5582d28eadc6c7 Mon Sep 17 00:00:00 2001
From: lianyibo
Date: Fri, 18 Jul 2025 23:09:54 +0800
Subject: [PATCH] [Bugfix]Fix the performance gap between 0.9.2rc1 and 0.9.1 (#1811)

### What this PR does / why we need it?

May fix [#1728](https://github.com/vllm-project/vllm-ascend/issues/1728#issuecomment-3065083433).

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Tested Qwen3-32B with tp=4 using:

```bash
vllm serve --port 1234 Qwen/Qwen3-32B \
    --served-model-name Qwen3-32B \
    --tensor-parallel-size 4 \
    --swap-space 16 \
    --max-model-len 6000 \
    --load-format dummy \
    --disable-log-stats \
    --disable-log-requests
```

Request settings: batch_size=128, input/output tokens=1024.

**In 0.9.2rc1**

```text
=====================================================
Total TPS with prefill(tokens/s) : 785.1395
Total TPS without prefill : 846.6809
Mean TPS with prefill : 6.1339
Mean TPS without prefill : 6.6147
=====================================================
Mean TTFT(ms) : 10307.8123
Max TTFT(ms) : 21423.0733
Min TTFT(ms) : 362.3602
=====================================================
Mean TPOT(ms) : 151.3051
Max TPOT(ms) : 159.4649
Min TPOT(ms) : 140.899
=====================================================
Total Time(s) : 175.6032
Request Throughput(requests/s) : 0.7289
=====================================================
```

**With this PR applied**

```text
=====================================================
Total TPS with prefill(tokens/s) : 811.0014
Total TPS without prefill : 876.4423
Mean TPS with prefill : 6.3359
Mean TPS without prefill : 6.8472
=====================================================
Mean TTFT(ms) : 10263.8382
Max TTFT(ms) : 21151.2547
Min TTFT(ms) : 375.9136
=====================================================
Mean TPOT(ms) : 146.1686
Max TPOT(ms) : 154.0957
Min TPOT(ms) : 136.8879
=====================================================
Total Time(s) : 169.8579
Request Throughput(requests/s) : 0.7536
=====================================================
```

With this PR applied, mean TPOT drops from 151.31 ms to 146.17 ms, an improvement of about 3%.

- vLLM version: v0.9.2
- vLLM main: https://github.com/vllm-project/vllm/commit/8dfb45ca3379b3a789ec529af4bf725daa07f10d

Signed-off-by: lianyibo
---
 tests/ut/test_utils.py | 26 ++++++++++++++++++--------
 vllm_ascend/utils.py | 2 ++
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py index 3902b6d..0bfe26b 100644 --- a/tests/ut/test_utils.py +++ b/tests/ut/test_utils.py @@ -239,17 +239,27 @@ class TestUtils(TestBase): def test_vllm_version_is(self): with mock.patch.dict(os.environ, {"VLLM_VERSION": "1.0.0"}): with mock.patch("vllm.__version__", "1.0.0"): - self.assertTrue(utils.vllm_version_is("1.0.0")) - self.assertFalse(utils.vllm_version_is("2.0.0")) + self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0")) + self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0")) with mock.patch("vllm.__version__", "2.0.0"): - self.assertTrue(utils.vllm_version_is("1.0.0")) - self.assertFalse(utils.vllm_version_is("2.0.0")) + self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0")) + self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0")) with mock.patch("vllm.__version__", "1.0.0"): - self.assertTrue(utils.vllm_version_is("1.0.0")) - self.assertFalse(utils.vllm_version_is("2.0.0")) + self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0")) + self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0")) with mock.patch("vllm.__version__", "2.0.0"): - self.assertTrue(utils.vllm_version_is("2.0.0")) - self.assertFalse(utils.vllm_version_is("1.0.0")) + self.assertTrue(utils.vllm_version_is.__wrapped__("2.0.0")) + self.assertFalse(utils.vllm_version_is.__wrapped__("1.0.0")) + # Test caching takes effect + utils.vllm_version_is.cache_clear() + utils.vllm_version_is("1.0.0") + misses = utils.vllm_version_is.cache_info().misses + hits = utils.vllm_version_is.cache_info().hits + 
self.assertEqual(misses, 1) + self.assertEqual(hits, 0) + utils.vllm_version_is("1.0.0") + hits = utils.vllm_version_is.cache_info().hits + self.assertEqual(hits, 1) def test_update_aclgraph_sizes(self): # max_num_batch_sizes < len(original_sizes) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index aed5772..1115e49 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -19,6 +19,7 @@ import atexit import fcntl +import functools import math import os import shutil @@ -280,6 +281,7 @@ def adapt_patch(is_global_patch: bool = False): from vllm_ascend.patch import worker # noqa: F401 +@functools.cache def vllm_version_is(target_vllm_version: str): if envs.VLLM_VERSION is not None: vllm_version = envs.VLLM_VERSION