From 4976b48b98f7268a68fe055265f928afd779ccc4 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Sun, 8 Jun 2025 22:33:37 +0800 Subject: [PATCH] [Build] Move numba/quart to requirements and update DS baseline and sync graph typo fix (#1121) ### What this PR does / why we need it? 1. The dependency was introduced by https://github.com/vllm-project/vllm-ascend/pull/874 - Move numba/quart from requirements-dev to requirements - Align pyproject.toml with requirements 2. This patch also fixes the deepseek accuracy baseline which https://github.com/vllm-project/vllm-ascend/pull/1118 did not address. According to https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite the gsm8k is about `41.1` 3. This also syncs the vLLM upstream changes: https://github.com/vllm-project/vllm/commit/eaa2e51088d4daf36d47e566ad90e812f80e91b8 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed vllm ascend test (basic workflow) vllm longterm test (spec decode) Closes: https://github.com/vllm-project/vllm-ascend/issues/1120 --------- Signed-off-by: Yikun Jiang --- pyproject.toml | 3 ++ requirements-dev.txt | 2 -- requirements.txt | 3 ++ .../test_deepseek_v2_lite_tp2_accuracy.py | 3 +- tests/singlecard/compile/test_simple.py | 32 ++++++++++++++----- vllm_ascend/compilation/piecewise_backend.py | 7 +++- 6 files changed, 37 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index df5b6a1..b441970 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,5 +16,8 @@ requires = [ "torch>=2.5.1", "torchvision<0.21.0", "wheel", + "msgpack", + "quart", + "numba", ] build-backend = "setuptools.build_meta" diff --git a/requirements-dev.txt b/requirements-dev.txt index 6770a00..133d460 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,6 +9,4 @@ ray types-jsonschema xgrammar zmq -numba -quart types-psutil diff --git a/requirements.txt b/requirements.txt index 58afc40..2f844df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 
+18,6 @@ wheel # requirements for disaggregated prefill msgpack quart + +# Required for N-gram speculative decoding +numba diff --git a/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py b/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py index 6a3118d..27986cb 100644 --- a/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py +++ b/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py @@ -34,8 +34,7 @@ FILTER = "exact_match,strict-match" # 3% relative tolerance for numerical accuracy. RTOL = 0.03 # Baseline accuracy after VLLM optimization. -# FIXME: fix the accuracy issue -EXPECTED_VALUE = 0.000758150113722517 +EXPECTED_VALUE = 0.3843821076573162 def run_test(model_name, queue, more_args=None): diff --git a/tests/singlecard/compile/test_simple.py b/tests/singlecard/compile/test_simple.py index cb54422..64d4cba 100644 --- a/tests/singlecard/compile/test_simple.py +++ b/tests/singlecard/compile/test_simple.py @@ -14,6 +14,8 @@ from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, set_current_vllm_config) from vllm.utils import direct_register_custom_op +from vllm_ascend.utils import vllm_version_is + global_counter = 0 # create a library to hold the custom op @@ -92,14 +94,28 @@ def test_simple_piecewise_compile(): inputs = torch.randn(100).npu() - with compilation_counter.expect( - num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=5, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=3, # 1 + num_layers - num_backend_compilations=3, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured= - 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ): + if vllm_version_is("0.9.0"): + kwargs = { + "num_graphs_seen": 1, # one graph for the model + "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 + "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers + "num_backend_compilations": + 3, # num_piecewise_capturable_graphs_seen + "num_cudagraph_caputured": + 6 # num_cudagraph_sizes * 
num_piecewise_capturable_graphs_seen + } + else: + kwargs = { + "num_graphs_seen": 1, # one graph for the model + "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 + "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers + "num_backend_compilations": + 3, # num_piecewise_capturable_graphs_seen + "num_cudagraph_captured": + 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + } + + with compilation_counter.expect(kwargs): model(inputs) diff --git a/vllm_ascend/compilation/piecewise_backend.py b/vllm_ascend/compilation/piecewise_backend.py index fc95983..95ce693 100644 --- a/vllm_ascend/compilation/piecewise_backend.py +++ b/vllm_ascend/compilation/piecewise_backend.py @@ -31,6 +31,8 @@ from vllm.config import VllmConfig from vllm.logger import logger from vllm.utils import weak_ref_tensors +from vllm_ascend.utils import vllm_version_is + @dataclasses.dataclass class ConcreteSizeEntry: @@ -205,7 +207,10 @@ class NPUPiecewiseBackend: entry.output = weak_ref_tensors(output) entry.aclgraph = aclgraph - compilation_counter.num_cudagraph_caputured += 1 + if vllm_version_is("0.9.0"): + compilation_counter.num_cudagraph_caputured += 1 + else: + compilation_counter.num_cudagraph_captured += 1 # important: we need to return the output, rather than # the weak ref of the output, so that pytorch can correctly