[Build] Move numba/quart to requirements and update DS baseline and sync graph typo fix (#1121)

### What this PR does / why we need it?
1. The dependency was introduced by
https://github.com/vllm-project/vllm-ascend/pull/874
- Move numba/quart from requirements-dev to requirements
- Align pyproject.toml with requirements

2. This patch also fixes the deepseek accuracy baseline, which
https://github.com/vllm-project/vllm-ascend/pull/1118 did not address.
According to https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite, the
gsm8k accuracy is about `41.1`

3. This also syncs the vLLM upstream changes:
eaa2e51088

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed
vllm ascend test (basic workflow)
vllm longterm test (spec decode)

Closes: https://github.com/vllm-project/vllm-ascend/issues/1120

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
Yikun Jiang
2025-06-08 22:33:37 +08:00
committed by GitHub
parent f1543d5e0d
commit 4976b48b98
6 changed files with 37 additions and 13 deletions

View File

@@ -16,5 +16,8 @@ requires = [
"torch>=2.5.1", "torch>=2.5.1",
"torchvision<0.21.0", "torchvision<0.21.0",
"wheel", "wheel",
"msgpack",
"quart",
"numba",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"

View File

@@ -9,6 +9,4 @@ ray
types-jsonschema types-jsonschema
xgrammar xgrammar
zmq zmq
numba
quart
types-psutil types-psutil

View File

@@ -18,3 +18,6 @@ wheel
# requirements for disaggregated prefill # requirements for disaggregated prefill
msgpack msgpack
quart quart
# Required for N-gram speculative decoding
numba

View File

@@ -34,8 +34,7 @@ FILTER = "exact_match,strict-match"
# 3% relative tolerance for numerical accuracy. # 3% relative tolerance for numerical accuracy.
RTOL = 0.03 RTOL = 0.03
# Baseline accuracy after VLLM optimization. # Baseline accuracy after VLLM optimization.
# FIXME: fix the accuracy issue EXPECTED_VALUE = 0.3843821076573162
EXPECTED_VALUE = 0.000758150113722517
def run_test(model_name, queue, more_args=None): def run_test(model_name, queue, more_args=None):

View File

@@ -14,6 +14,8 @@ from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
set_current_vllm_config) set_current_vllm_config)
from vllm.utils import direct_register_custom_op from vllm.utils import direct_register_custom_op
from vllm_ascend.utils import vllm_version_is
global_counter = 0 global_counter = 0
# create a library to hold the custom op # create a library to hold the custom op
@@ -92,14 +94,28 @@ def test_simple_piecewise_compile():
inputs = torch.randn(100).npu() inputs = torch.randn(100).npu()
with compilation_counter.expect( if vllm_version_is("0.9.0"):
num_graphs_seen=1, # one graph for the model kwargs = {
num_piecewise_graphs_seen=5, # 2 * num_layers + 1 "num_graphs_seen": 1, # one graph for the model
num_piecewise_capturable_graphs_seen=3, # 1 + num_layers "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
num_backend_compilations=3, # num_piecewise_capturable_graphs_seen "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
num_cudagraph_caputured= "num_backend_compilations":
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen 3, # num_piecewise_capturable_graphs_seen
): "num_cudagraph_caputured":
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
}
else:
kwargs = {
"num_graphs_seen": 1, # one graph for the model
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
"num_backend_compilations":
3, # num_piecewise_capturable_graphs_seen
"num_cudagraph_captured":
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
}
with compilation_counter.expect(kwargs):
model(inputs) model(inputs)

View File

@@ -31,6 +31,8 @@ from vllm.config import VllmConfig
from vllm.logger import logger from vllm.logger import logger
from vllm.utils import weak_ref_tensors from vllm.utils import weak_ref_tensors
from vllm_ascend.utils import vllm_version_is
@dataclasses.dataclass @dataclasses.dataclass
class ConcreteSizeEntry: class ConcreteSizeEntry:
@@ -205,7 +207,10 @@ class NPUPiecewiseBackend:
entry.output = weak_ref_tensors(output) entry.output = weak_ref_tensors(output)
entry.aclgraph = aclgraph entry.aclgraph = aclgraph
compilation_counter.num_cudagraph_caputured += 1 if vllm_version_is("0.9.0"):
compilation_counter.num_cudagraph_caputured += 1
else:
compilation_counter.num_cudagraph_captured += 1
# important: we need to return the output, rather than # important: we need to return the output, rather than
# the weak ref of the output, so that pytorch can correctly # the weak ref of the output, so that pytorch can correctly