From 4976b48b98f7268a68fe055265f928afd779ccc4 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Sun, 8 Jun 2025 22:33:37 +0800 Subject: [PATCH] [Build] Move numba/quart to requirements and update DS baseline and sync graph typo fix (#1121) ### What this PR does / why we need it? 1. The dependency was introduced by https://github.com/vllm-project/vllm-ascend/pull/874 - Move numba/quart from requirements-dev to requirements - Align pyproject.toml with requirements 2. This patch also fixes the deepseek accuracy baseline which https://github.com/vllm-project/vllm-ascend/pull/1118 did not address. According to https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite the gsm8k is about `41.1` 3. This also syncs the vLLM upstream changes: https://github.com/vllm-project/vllm/commit/eaa2e51088d4daf36d47e566ad90e812f80e91b8 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed vllm ascend test (basic workflow) vllm longterm test (spec decode) Closes: https://github.com/vllm-project/vllm-ascend/issues/1120 --------- Signed-off-by: Yikun Jiang --- pyproject.toml | 3 ++ requirements-dev.txt | 2 -- requirements.txt | 3 ++ .../test_deepseek_v2_lite_tp2_accuracy.py | 3 +- tests/singlecard/compile/test_simple.py | 32 ++++++++++++++----- vllm_ascend/compilation/piecewise_backend.py | 7 +++- 6 files changed, 37 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index df5b6a1..b441970 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,5 +16,8 @@ requires = [ "torch>=2.5.1", "torchvision<0.21.0", "wheel", + "msgpack", + "quart", + "numba", ] build-backend = "setuptools.build_meta" diff --git a/requirements-dev.txt b/requirements-dev.txt index 6770a00..133d460 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,6 +9,4 @@ ray types-jsonschema xgrammar zmq -numba -quart types-psutil diff --git a/requirements.txt b/requirements.txt index 58afc40..2f844df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 
+18,6 @@ wheel # requirements for disaggregated prefill msgpack quart + +# Required for N-gram speculative decoding +numba diff --git a/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py b/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py index 6a3118d..27986cb 100644 --- a/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py +++ b/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py @@ -34,8 +34,7 @@ FILTER = "exact_match,strict-match" # 3% relative tolerance for numerical accuracy. RTOL = 0.03 # Baseline accuracy after VLLM optimization. -# FIXME: fix the accuracy issue -EXPECTED_VALUE = 0.000758150113722517 +EXPECTED_VALUE = 0.3843821076573162 def run_test(model_name, queue, more_args=None): diff --git a/tests/singlecard/compile/test_simple.py b/tests/singlecard/compile/test_simple.py index cb54422..64d4cba 100644 --- a/tests/singlecard/compile/test_simple.py +++ b/tests/singlecard/compile/test_simple.py @@ -14,6 +14,8 @@ from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, set_current_vllm_config) from vllm.utils import direct_register_custom_op +from vllm_ascend.utils import vllm_version_is + global_counter = 0 # create a library to hold the custom op @@ -92,14 +94,28 @@ def test_simple_piecewise_compile(): inputs = torch.randn(100).npu() - with compilation_counter.expect( - num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=5, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=3, # 1 + num_layers - num_backend_compilations=3, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured= - 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ): + if vllm_version_is("0.9.0"): + kwargs = { + "num_graphs_seen": 1, # one graph for the model + "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 + "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers + "num_backend_compilations": + 3, # num_piecewise_capturable_graphs_seen + "num_cudagraph_caputured": + 6 # num_cudagraph_sizes * 
num_piecewise_capturable_graphs_seen + } + else: + kwargs = { + "num_graphs_seen": 1, # one graph for the model + "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 + "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers + "num_backend_compilations": + 3, # num_piecewise_capturable_graphs_seen + "num_cudagraph_captured": + 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + } + + with compilation_counter.expect(kwargs): model(inputs) diff --git a/vllm_ascend/compilation/piecewise_backend.py b/vllm_ascend/compilation/piecewise_backend.py index fc95983..95ce693 100644 --- a/vllm_ascend/compilation/piecewise_backend.py +++ b/vllm_ascend/compilation/piecewise_backend.py @@ -31,6 +31,8 @@ from vllm.config import VllmConfig from vllm.logger import logger from vllm.utils import weak_ref_tensors +from vllm_ascend.utils import vllm_version_is + @dataclasses.dataclass class ConcreteSizeEntry: @@ -205,7 +207,10 @@ class NPUPiecewiseBackend: entry.output = weak_ref_tensors(output) entry.aclgraph = aclgraph - compilation_counter.num_cudagraph_caputured += 1 + if vllm_version_is("0.9.0"): + compilation_counter.num_cudagraph_caputured += 1 + else: + compilation_counter.num_cudagraph_captured += 1 # important: we need to return the output, rather than # the weak ref of the output, so that pytorch can correctly