[Build] Move numba/quart to requirements and update DS baseline and sync graph typo fix (#1121)
### What this PR does / why we need it?
1. The dependency was introduced by
https://github.com/vllm-project/vllm-ascend/pull/874
- Move numba/quart from requirements-dev to requirements
- Align pyproject.toml with requirements
2. This patch also fixes the DeepSeek accuracy baseline, which
https://github.com/vllm-project/vllm-ascend/pull/1118 did not address.
According to https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite the
gsm8k is about `41.1`
3. This also syncs the vLLM upstream changes:
eaa2e51088
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
CI passed
vllm ascend test (basic workflow)
vllm longterm test (spec decode)
Closes: https://github.com/vllm-project/vllm-ascend/issues/1120
---------
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
@@ -16,5 +16,8 @@ requires = [
|
|||||||
"torch>=2.5.1",
|
"torch>=2.5.1",
|
||||||
"torchvision<0.21.0",
|
"torchvision<0.21.0",
|
||||||
"wheel",
|
"wheel",
|
||||||
|
"msgpack",
|
||||||
|
"quart",
|
||||||
|
"numba",
|
||||||
]
|
]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|||||||
@@ -9,6 +9,4 @@ ray
|
|||||||
types-jsonschema
|
types-jsonschema
|
||||||
xgrammar
|
xgrammar
|
||||||
zmq
|
zmq
|
||||||
numba
|
|
||||||
quart
|
|
||||||
types-psutil
|
types-psutil
|
||||||
|
|||||||
@@ -18,3 +18,6 @@ wheel
|
|||||||
# requirements for disaggregated prefill
|
# requirements for disaggregated prefill
|
||||||
msgpack
|
msgpack
|
||||||
quart
|
quart
|
||||||
|
|
||||||
|
# Required for N-gram speculative decoding
|
||||||
|
numba
|
||||||
|
|||||||
@@ -34,8 +34,7 @@ FILTER = "exact_match,strict-match"
|
|||||||
# 3% relative tolerance for numerical accuracy.
|
# 3% relative tolerance for numerical accuracy.
|
||||||
RTOL = 0.03
|
RTOL = 0.03
|
||||||
# Baseline accuracy after VLLM optimization.
|
# Baseline accuracy after VLLM optimization.
|
||||||
# FIXME: fix the accuracy issue
|
EXPECTED_VALUE = 0.3843821076573162
|
||||||
EXPECTED_VALUE = 0.000758150113722517
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(model_name, queue, more_args=None):
|
def run_test(model_name, queue, more_args=None):
|
||||||
|
|||||||
@@ -14,6 +14,8 @@ from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
|
|||||||
set_current_vllm_config)
|
set_current_vllm_config)
|
||||||
from vllm.utils import direct_register_custom_op
|
from vllm.utils import direct_register_custom_op
|
||||||
|
|
||||||
|
from vllm_ascend.utils import vllm_version_is
|
||||||
|
|
||||||
global_counter = 0
|
global_counter = 0
|
||||||
|
|
||||||
# create a library to hold the custom op
|
# create a library to hold the custom op
|
||||||
@@ -92,14 +94,28 @@ def test_simple_piecewise_compile():
|
|||||||
|
|
||||||
inputs = torch.randn(100).npu()
|
inputs = torch.randn(100).npu()
|
||||||
|
|
||||||
with compilation_counter.expect(
|
if vllm_version_is("0.9.0"):
|
||||||
num_graphs_seen=1, # one graph for the model
|
kwargs = {
|
||||||
num_piecewise_graphs_seen=5, # 2 * num_layers + 1
|
"num_graphs_seen": 1, # one graph for the model
|
||||||
num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
|
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
|
||||||
num_backend_compilations=3, # num_piecewise_capturable_graphs_seen
|
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
|
||||||
num_cudagraph_caputured=
|
"num_backend_compilations":
|
||||||
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
3, # num_piecewise_capturable_graphs_seen
|
||||||
):
|
"num_cudagraph_caputured":
|
||||||
|
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
kwargs = {
|
||||||
|
"num_graphs_seen": 1, # one graph for the model
|
||||||
|
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
|
||||||
|
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
|
||||||
|
"num_backend_compilations":
|
||||||
|
3, # num_piecewise_capturable_graphs_seen
|
||||||
|
"num_cudagraph_captured":
|
||||||
|
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||||
|
}
|
||||||
|
|
||||||
|
with compilation_counter.expect(kwargs):
|
||||||
|
|
||||||
model(inputs)
|
model(inputs)
|
||||||
|
|
||||||
|
|||||||
@@ -31,6 +31,8 @@ from vllm.config import VllmConfig
|
|||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
from vllm.utils import weak_ref_tensors
|
from vllm.utils import weak_ref_tensors
|
||||||
|
|
||||||
|
from vllm_ascend.utils import vllm_version_is
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class ConcreteSizeEntry:
|
class ConcreteSizeEntry:
|
||||||
@@ -205,7 +207,10 @@ class NPUPiecewiseBackend:
|
|||||||
entry.output = weak_ref_tensors(output)
|
entry.output = weak_ref_tensors(output)
|
||||||
entry.aclgraph = aclgraph
|
entry.aclgraph = aclgraph
|
||||||
|
|
||||||
compilation_counter.num_cudagraph_caputured += 1
|
if vllm_version_is("0.9.0"):
|
||||||
|
compilation_counter.num_cudagraph_caputured += 1
|
||||||
|
else:
|
||||||
|
compilation_counter.num_cudagraph_captured += 1
|
||||||
|
|
||||||
# important: we need to return the output, rather than
|
# important: we need to return the output, rather than
|
||||||
# the weak ref of the output, so that pytorch can correctly
|
# the weak ref of the output, so that pytorch can correctly
|
||||||
|
|||||||
Reference in New Issue
Block a user