[Core] Make V1 work and enable V1 engine test (#389)

1. Make sure the version is string before parse in collect_env
2. Add basic V1 engine test

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-03-28 19:34:23 +08:00
committed by GitHub
parent 57a84bb7be
commit 31f29b9f30
8 changed files with 66 additions and 95 deletions

View File

@@ -126,11 +126,15 @@ jobs:
cd /code/pta/ cd /code/pta/
pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- name: Run vllm-project/vllm-ascend test - name: Run vllm-project/vllm-ascend test for V0 Engine
run: | run: |
VLLM_USE_V1=0 pytest -sv tests VLLM_USE_V1=0 pytest -sv tests
- name: Run vllm-project/vllm test - name: Run vllm-project/vllm-ascend test for V1 Engine
run: |
VLLM_USE_V1=1 VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -sv tests
- name: Run vllm-project/vllm test for V0 Engine
env: env:
VLLM_USE_V1: 0 VLLM_USE_V1: 0
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256 PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256

View File

@@ -61,7 +61,6 @@ DEFAULT_CONDA_PATTERNS = {
"soumith", "soumith",
"mkl", "mkl",
"magma", "magma",
"triton",
"optree", "optree",
"transformers", "transformers",
"zmq", "zmq",
@@ -73,7 +72,6 @@ DEFAULT_PIP_PATTERNS = {
"numpy", "numpy",
"mypy", "mypy",
"flake8", "flake8",
"triton",
"optree", "optree",
"onnx", "onnx",
"transformers", "transformers",
@@ -156,7 +154,7 @@ def get_cmake_version(run_lambda):
def _parse_version(version, version_tuple): def _parse_version(version, version_tuple):
version_str = version_tuple[-1] version_str = version_tuple[-1]
if version_str.startswith('g'): if isinstance(version_str, str) and version_str.startswith('g'):
if '.' in version_str: if '.' in version_str:
git_sha = version_str.split('.')[0][1:] git_sha = version_str.split('.')[0][1:]
date = version_str.split('.')[-1][1:] date = version_str.split('.')[-1][1:]

View File

@@ -100,3 +100,4 @@ def test_fused_experts(
e_map) e_map)
# TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem # TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem
torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1) torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1)
torch.npu.empty_cache()

View File

@@ -45,8 +45,6 @@ def test_models(
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
os.environ["VLLM_ATTENTION_BACKEND"] = "ASCEND"
# 5042 tokens for gemma2 # 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096 # gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window # we need a prompt with more than 4096 tokens to test the sliding window
@@ -60,3 +58,8 @@ def test_models(
enforce_eager=False, enforce_eager=False,
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens) vllm_model.generate_greedy(example_prompts, max_tokens)
if __name__ == "__main__":
import pytest
pytest.main([__file__])

View File

View File

@@ -21,13 +21,16 @@ from typing import TYPE_CHECKING, Optional, Tuple
import torch import torch
import torch_npu # noqa: F401 import torch_npu # noqa: F401
import vllm.envs as envs import vllm.envs as envs
from vllm.config import CompilationLevel, VllmConfig from vllm.config import CompilationLevel
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import Platform, PlatformEnum from vllm.platforms import Platform, PlatformEnum
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
else: else:
ModelConfig = None
VllmConfig = None
FlexibleArgumentParser = None FlexibleArgumentParser = None
os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1" os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1"
@@ -108,10 +111,10 @@ class NPUPlatform(Platform):
parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker" parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
if cache_config and cache_config.block_size is None: if cache_config:
if cache_config.block_size is None:
cache_config.block_size = 128 cache_config.block_size = 128
if envs.VLLM_USE_V1 and cache_config.enable_prefix_caching:
if envs.VLLM_USE_V1 and cache_config and cache_config.enable_prefix_caching:
logger.warning( logger.warning(
"Prefix caching is not supported for V1 now, disable prefix caching" "Prefix caching is not supported for V1 now, disable prefix caching"
) )
@@ -140,3 +143,10 @@ class NPUPlatform(Platform):
@classmethod @classmethod
def is_pin_memory_available(cls): def is_pin_memory_available(cls):
return True return True
@classmethod
def supports_v1(cls, model_config: ModelConfig) -> bool:
"""Returns whether the current platform can support v1 for the supplied
model configuration.
"""
return True

View File

@@ -42,9 +42,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheSpec) KVCacheSpec)
from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, ModelRunnerOutput
from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.utils import bind_kv_cache from vllm.v1.utils import bind_kv_cache
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -112,7 +110,7 @@ class NPUModelRunner:
encoder_compute_budget, encoder_cache_size = compute_encoder_budget( encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
model_config=model_config, model_config=model_config,
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
) mm_registry=self.mm_registry)
self.max_num_encoder_input_tokens = encoder_compute_budget self.max_num_encoder_input_tokens = encoder_compute_budget
self.encoder_cache_size = encoder_cache_size self.encoder_cache_size = encoder_cache_size
@@ -124,21 +122,6 @@ class NPUModelRunner:
# Set up speculative decoding. # Set up speculative decoding.
self.use_spec_decode = False self.use_spec_decode = False
if self.speculative_config:
self.use_spec_decode = True
self.rejection_sampler = RejectionSampler()
# TODO: find a better way to check if we are using ngram.
assert self.speculative_config.ngram_prompt_lookup_min, \
"Currently, only ngram spec decode is supported in V1."
if get_pp_group().is_last_rank:
self.drafter = NgramProposer()
# Trigger Numba JIT compilation for N-gram proposer.
# This usually takes less than 1 second.
self.drafter.propose(
np.zeros(1024, dtype=np.int32),
self.speculative_config.ngram_prompt_lookup_min,
self.speculative_config.num_speculative_tokens,
)
# Request states. # Request states.
self.requests: Dict[str, CachedRequestState] = {} self.requests: Dict[str, CachedRequestState] = {}
@@ -504,6 +487,9 @@ class NPUModelRunner:
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
) -> Union[ModelRunnerOutput, torch.Tensor]: ) -> Union[ModelRunnerOutput, torch.Tensor]:
self._update_states(scheduler_output) self._update_states(scheduler_output)
if not scheduler_output.total_num_scheduled_tokens:
# Return empty ModelRunnerOuptut if there's no work to do.
return EMPTY_MODEL_RUNNER_OUTPUT
hidden_states = self._process_reqs(scheduler_output, hidden_states = self._process_reqs(scheduler_output,
intermediate_tensors) intermediate_tensors)
logits = self.model.compute_logits(hidden_states, None) logits = self.model.compute_logits(hidden_states, None)
@@ -540,15 +526,6 @@ class NPUModelRunner:
if max_gen_len == 1: if max_gen_len == 1:
# No spec decode tokens. # No spec decode tokens.
valid_sampled_token_ids = sampled_token_ids.tolist() valid_sampled_token_ids = sampled_token_ids.tolist()
else:
# Includes spec decode tokens.
valid_mask = sampled_token_ids != INVALID_TOKEN_ID
gen_lens = valid_mask.sum(dim=1).tolist()
# TODO(woosuk): Optimize this.
valid_sampled_token_ids = [
seq.tolist()
for seq in sampled_token_ids[valid_mask].split(gen_lens)
]
model_runner_output = ModelRunnerOutput( model_runner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids, req_ids=self.input_batch.req_ids,
@@ -723,34 +700,6 @@ class NPUModelRunner:
self.encoder_cache.clear() self.encoder_cache.clear()
gc.collect() gc.collect()
def generate_draft_token_ids(
self,
sampled_token_ids: list[list[int]],
) -> list[list[int]]:
# TODO(woosuk): Optimize.
draft_token_ids: list[list[int]] = []
for i, sampled_ids in enumerate(sampled_token_ids):
num_sampled_ids = len(sampled_ids)
if not num_sampled_ids:
# Skip speculative decoding.
draft_token_ids.append([])
continue
# Add sampled_token_ids to token_ids_cpu.
start_idx = self.input_batch.num_tokens_no_spec[i]
end_idx = start_idx + num_sampled_ids
self.input_batch.token_ids_cpu[i, start_idx:end_idx] = sampled_ids
drafter_output = self.drafter.propose(
self.input_batch.token_ids_cpu[i, :end_idx],
self.speculative_config.ngram_prompt_lookup_min,
self.speculative_config.num_speculative_tokens,
)
if drafter_output is None or len(drafter_output) == 0:
draft_token_ids.append([])
else:
draft_token_ids.append(drafter_output.tolist())
return draft_token_ids
def load_model(self) -> None: def load_model(self) -> None:
logger.info("Starting to load model %s...", self.model_config.model) logger.info("Starting to load model %s...", self.model_config.model)
@@ -770,34 +719,40 @@ class NPUModelRunner:
kv_cache_config: Configuration for the KV cache, including the KV kv_cache_config: Configuration for the KV cache, including the KV
cache size of each layer cache size of each layer
""" """
if len(kv_cache_config.groups) > 1:
raise NotImplementedError(
"Hybrid models with more than one KV cache type are not "
"supported yet.")
kv_caches: Dict[str, torch.Tensor] = {} kv_caches: Dict[str, torch.Tensor] = {}
for kv_cache_group in kv_cache_config.kv_cache_groups:
for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): kv_cache_spec = kv_cache_group.kv_cache_spec
for layer_name in kv_cache_group.layer_names:
tensor_config = kv_cache_config.tensors[layer_name] tensor_config = kv_cache_config.tensors[layer_name]
assert tensor_config.size % layer_spec.page_size_bytes == 0 assert tensor_config.size % kv_cache_spec.page_size_bytes == 0
num_blocks = tensor_config.size // layer_spec.page_size_bytes num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes
if isinstance(layer_spec, FullAttentionSpec): # `num_blocks` is the number of blocks the model runner can use.
# `kv_cache_config.num_blocks` is the number of blocks that
# KVCacheManager may allocate.
# Since different GPUs may have different number of layers and
# different memory capacities, `num_blocks` can be different on
# different GPUs, and `kv_cache_config.num_blocks` is set to
# the min of all `num_blocks`. Verify it here.
assert num_blocks >= kv_cache_config.num_blocks
if isinstance(kv_cache_spec, FullAttentionSpec):
kv_cache_shape = AscendAttentionBackend.get_kv_cache_shape( kv_cache_shape = AscendAttentionBackend.get_kv_cache_shape(
num_blocks, layer_spec.block_size, layer_spec.num_kv_heads, num_blocks, kv_cache_spec.block_size,
layer_spec.head_size) kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
dtype = layer_spec.dtype dtype = kv_cache_spec.dtype
kv_caches[layer_name] = torch.zeros(kv_cache_shape, kv_caches[layer_name] = torch.zeros(kv_cache_shape,
dtype=dtype, dtype=dtype,
device=self.device) device=self.device)
else: else:
raise NotImplementedError # TODO: add new branches when introducing more types of
# KV cache specs.
raise ValueError("Unknown KV cache spec type.")
bind_kv_cache( bind_kv_cache(
kv_caches, kv_caches,
self.vllm_config.compilation_config.static_forward_context, self.vllm_config.compilation_config.static_forward_context,
self.kv_caches) self.kv_caches)
def get_kv_cache_spec(self) -> KVCacheSpec: def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
""" """
Generates the KVCacheSpec by parsing the kv cache format from each Generates the KVCacheSpec by parsing the kv cache format from each
Attention module in the static forward context. Attention module in the static forward context.
@@ -809,7 +764,7 @@ class NPUModelRunner:
forward_ctx = self.vllm_config.compilation_config.static_forward_context forward_ctx = self.vllm_config.compilation_config.static_forward_context
block_size = self.vllm_config.cache_config.block_size block_size = self.vllm_config.cache_config.block_size
use_mla = self.vllm_config.model_config.use_mla use_mla = self.vllm_config.model_config.use_mla
kv_cache_spec: KVCacheSpec = {} kv_cache_spec: dict[str, KVCacheSpec] = {}
for layer_name, attn_module in forward_ctx.items(): for layer_name, attn_module in forward_ctx.items():
if isinstance(attn_module, FusedMoE): if isinstance(attn_module, FusedMoE):
continue continue

View File

@@ -210,7 +210,7 @@ class NPUWorker(WorkerBase):
def get_model(self) -> nn.Module: def get_model(self) -> nn.Module:
return self.model_runner.get_model() return self.model_runner.get_model()
def get_kv_cache_spec(self) -> KVCacheSpec: def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
return self.model_runner.get_kv_cache_spec() return self.model_runner.get_kv_cache_spec()
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: