upgrade vLLM to main (#4608)
1. Fix https://github.com/vllm-project/vllm/pull/28542. The model structures our patches touch are:
   - Qwen2.5-VL (some patches still remain)
   - Qwen2-VL
   - Qwen2
   - DeepSeek series
   - Qwen-MoE series
2. Fix https://github.com/vllm-project/vllm/pull/29121. The sampled-token output type changed from NumPy arrays to `list[list[int]]` (see the first sketch below).
3. Fix https://github.com/vllm-project/vllm/pull/29262. The `xformers` backend for multimodal models has been deprecated.
4. Fix https://github.com/vllm-project/vllm/pull/29342.
5. Fix https://github.com/vllm-project/vllm/pull/28579.
6. Fix https://github.com/vllm-project/vllm/pull/28718.
7. Fix https://github.com/vllm-project/vllm/issues/28665.
8. Fix https://github.com/vllm-project/vllm/pull/26847. vLLM introduced the `optimization-level` setting; some default configs changed, and the `--enforce-eager` parameter has been deprecated.
9. Fix https://github.com/vllm-project/vllm/pull/29223. The rejection sampler's `parse_output` now returns a tuple (see the second sketch below).
10. Fix https://github.com/vllm-project/vllm/pull/29471. We remove the related patch to avoid this class of error.

- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
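The `list[list[int]]` migration from item 2 shows up in several hunks below (in `AsyncNPUModelRunnerOutput` and in the main sampling path). A minimal sketch of the new convention, with illustrative names and data, assuming only that sampled IDs arrive as a 2-D integer tensor:

```python
import torch

# Minimal sketch (illustrative, not the vllm-ascend code): sampled token IDs
# now travel as list[list[int]] instead of list[np.ndarray].
sampled_token_ids = torch.tensor([[101], [102], [103]])

# Old convention: [row for row in sampled_token_ids.cpu().numpy()]
# New convention: a plain nested Python list.
valid_sampled_token_ids: list[list[int]] = sampled_token_ids.tolist()

# Invalid requests are emptied in place rather than replaced with np.array([]).
invalid_req_indices = [1]  # hypothetical
for i in invalid_req_indices:
    valid_sampled_token_ids[i].clear()

assert valid_sampled_token_ids == [[101], [], [103]]
```

Note that `clear()` empties a row in place, where the old code reassigned `np.array([])`; emptiness checks accordingly become `if not sampled_ids:` as in the hunks below.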
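For item 9, the change is visible in the `@@ -2510` hunk: callers now unpack a pair from `parse_output` and discard the second element. A toy stand-in that mirrors only that calling convention (the function body and names here are invented for illustration; the real sampler differs):

```python
import torch

# Toy stand-in for the tuple-returning sampler API. Only the shape of the
# return value ("a tuple whose first element is the token lists") is taken
# from the diff below; everything else is illustrative.
def parse_output(sampled_token_ids: torch.Tensor,
                 vocab_size: int) -> tuple[list[list[int]], torch.Tensor]:
    token_lists = [[t for t in row if 0 <= t < vocab_size]
                   for row in sampled_token_ids.tolist()]
    num_valid = torch.tensor([len(row) for row in token_lists])
    return token_lists, num_valid

# Callers now unpack and may ignore the second element:
valid_sampled_token_ids, _ = parse_output(torch.tensor([[5, 9]]), vocab_size=8)
assert valid_sampled_token_ids == [[5]]
```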
@@ -39,9 +39,9 @@ import torch._dynamo.cache_size
 import torch.distributed as dist
 import torch.nn as nn
 from tqdm import tqdm  # type: ignore
-from vllm.attention import AttentionType, get_attn_backend
-from vllm.attention.backends.abstract import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend, AttentionType
 from vllm.attention.layer import Attention, MLAAttention
+from vllm.attention.selector import get_attn_backend
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
 from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
@@ -53,7 +53,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
 from vllm.distributed.parallel_state import (get_dcp_group, get_dp_group,
                                              get_pp_group, get_tp_group,
                                              is_global_first_rank)
-from vllm.forward_context import BatchDescriptor, get_forward_context
+from vllm.forward_context import get_forward_context
 from vllm.logger import logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.mamba.abstract import MambaBase
@@ -244,11 +244,9 @@ class AsyncNPUModelRunnerOutput(AsyncModelRunnerOutput):
         # Release the device tensor once the copy has completed
         del self._sampled_token_ids

-        valid_sampled_token_ids: list[np.ndarray] = [
-            row for row in self._sampled_token_ids_cpu.numpy()
-        ]
+        valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
         for i in self._invalid_req_indices:
-            valid_sampled_token_ids[i] = np.array([])
+            valid_sampled_token_ids[i].clear()

         output = self._model_runner_output
         output.sampled_token_ids = valid_sampled_token_ids
@@ -332,7 +330,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # Ascend-specific configurations
         self.ascend_config = get_ascend_config()
         if self.ascend_config.ascend_scheduler_config.enabled:
-            self.chunked_prefill_enabled = self.scheduler_config.chunked_prefill_enabled
+            self.chunked_prefill_enabled = self.scheduler_config.enable_chunked_prefill
         else:
             self.chunked_prefill_enabled = True
         self.weight_prefetch_method = WeightPrefetchMethod(
@@ -2130,7 +2128,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

     def propose_draft_token_ids(
         self,
-        valid_sampled_token_ids: Union[torch.Tensor, list[np.ndarray]],
+        valid_sampled_token_ids: torch.Tensor | list[list[int]],
         sampling_metadata: SamplingMetadata,
         scheduler_output: "SchedulerOutput",
         spec_decode_metadata: SpecDecodeMetadata,
@@ -2309,10 +2307,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
                 scheduler_output.total_num_scheduled_tokens
                 == self.input_batch.num_reqs * max_query_len)
-            batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
-                                               uniform_decode=uniform_decode)
+            has_lora = len(self.input_batch.lora_id_to_lora_request) > 0
             aclgraph_runtime_mode, batch_descriptor = \
-                self.aclgraph_dispatcher.dispatch(batch_descriptor)
+                self.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)

             # Run forward pass
             with ProfileExecuteDuration().capture_async("forward"):
@@ -2510,18 +2507,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             max_gen_len = sampled_token_ids.shape[-1]
             if max_gen_len == 1:
                 # No spec decode tokens. It's a tensor.
-                valid_sampled_token_ids: list[np.ndarray] = [
-                    row for row in sampled_token_ids.cpu().numpy()
-                ]
+                valid_sampled_token_ids = sampled_token_ids.tolist()
             else:
                 # Includes spec decode tokens. It's a numpy array
-                valid_sampled_token_ids = self.rejection_sampler.parse_output(
+                valid_sampled_token_ids, _ = self.rejection_sampler.parse_output(
                     sampled_token_ids,
                     self.input_batch.vocab_size,
                 )
             # Mask out the sampled tokens that should not be sampled.
             for i in discard_sampled_tokens_req_indices:
-                valid_sampled_token_ids[int(i)] = np.array([])
+                valid_sampled_token_ids[int(i)].clear()
         else:
             valid_sampled_token_ids = []
             invalid_req_indices = discard_sampled_tokens_req_indices.tolist(
@@ -2547,17 +2542,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             # the sampled tokens back, because there's no direct communication
             # between the first-stage worker and the last-stage worker.
             for req_idx in range(num_sampled_tokens):
-                sampled_ids: np.ndarray | None
                 if self.use_async_scheduling:
-                    sampled_ids = (np.array([-1]) if req_idx
-                                   not in invalid_req_indices_set else None)
+                    sampled_ids = [-1] * 1 if \
+                        req_idx not in invalid_req_indices_set else None
                 else:
                     sampled_ids = valid_sampled_token_ids[req_idx]
-                if sampled_ids is None or sampled_ids.shape[0] == 0:
+                if not sampled_ids:
                     continue

                 start_idx = self.input_batch.num_tokens_no_spec[req_idx]
-                end_idx = start_idx + sampled_ids.shape[0]
+                end_idx = start_idx + len(sampled_ids)
                 assert end_idx <= self.model_config.max_model_len, (
                     "Sampled token IDs exceed the max model length. "
                     f"Total number of tokens: {end_idx} > max_model_len: "
@@ -2571,7 +2565,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 self.input_batch.num_tokens[req_idx] = end_idx
                 req_id = self.input_batch.req_ids[req_idx]
                 req_state = self.requests[req_id]
-                req_state.output_token_ids.extend(sampled_ids.tolist())
+                req_state.output_token_ids.extend(sampled_ids)

         def propose_draft_token_ids(sampled_token_ids):
             assert self.spec_decode_common_attn_metadata is not None
@@ -2877,7 +2871,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         assert aclgraph_runtime_mode is None or aclgraph_runtime_mode in {
             CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
         }
-
         # In multi-DP scenarios, there may be situations where all DP groups are executing dummy runs.
         # If sequence parallelism is enabled, it is essential to ensure that num_tokens is divisible by tp_size.
         if self.use_aclgraph and enable_sp(self.vllm_config):
@@ -2971,19 +2964,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     k: v[:num_tokens]
                     for k, v in self.intermediate_tensors.items()
                 })

+        has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
         # filter out the valid batch descriptor
         _ag_mode, batch_descriptor = \
-            self.aclgraph_dispatcher.dispatch(
-                BatchDescriptor(num_tokens=num_tokens,
-                                uniform_decode=uniform_decode))
+            self.aclgraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
         if aclgraph_runtime_mode is not None:
             # we allow forcing NONE when the dispatcher disagrees to support
             # warm ups for aclgraph capture
-            assert aclgraph_runtime_mode == CUDAGraphMode.NONE or \
-                aclgraph_runtime_mode == _ag_mode, (
-                    f"Aclgraph runtime mode mismatch at dummy_run. "
-                    f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}.")
+            if aclgraph_runtime_mode != CUDAGraphMode.NONE and aclgraph_runtime_mode != _ag_mode:
+                raise ValueError(
+                    f"Aclgraph runtime mode mismatch at dummy_run. "
+                    f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}."
+                )
         else:
             aclgraph_runtime_mode = _ag_mode
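One design note on the hunk above: the dispatcher check moves from `assert` to an explicit `raise ValueError`. Since `assert` statements are stripped when Python runs with `-O`, the explicit raise keeps the mismatch check active in optimized runs while still permitting a forced `CUDAGraphMode.NONE` for warm-up. A minimal illustration with hypothetical names:

```python
# Minimal illustration (hypothetical names): an explicit raise survives
# `python -O`, while an `assert` would be compiled away.
def check_runtime_mode(expected_mode: str, forced_mode: str | None) -> None:
    # Forcing "NONE" stays allowed (e.g. warm-up runs before graph capture).
    if forced_mode is not None and forced_mode not in ("NONE", expected_mode):
        raise ValueError(f"Runtime mode mismatch: "
                         f"expected {expected_mode}, but got {forced_mode}.")

check_runtime_mode("FULL", "NONE")   # allowed: forcing NONE for warm-up
check_runtime_mode("FULL", "FULL")   # allowed: modes agree
```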
@@ -4466,18 +4458,3 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             self.input_ids_pcp_full_cpu[:total_num_scheduled_tokens_pcp_full],
             non_blocking=True,
         )
-
-    def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]:
-        # This is a short term mitigation for issue mentioned in
-        # https://github.com/vllm-project/vllm/issues/22754.
-        # `tolist` would trigger a cuda wise stream sync, which
-        # would block other copy ops from other cuda streams.
-        # A cuda event sync would avoid such a situation. Since
-        # this is in the critical path of every single model
-        # forward loop, this has caused perf issue for a disagg
-        # setup.
-        pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]]
-        pinned.copy_(sampled_token_ids, non_blocking=True)
-        self.transfer_event.record()
-        self.transfer_event.synchronize()
-        return [row for row in pinned.numpy()]
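The final hunk deletes the `_to_list` helper outright; with sampled tokens now flowing as `list[list[int]]`, the NumPy-returning mitigation is no longer needed in this form. For reference, the technique it used — copying into a pinned host buffer and synchronizing on a single event so the conversion does not trigger the stream-wide sync described in https://github.com/vllm-project/vllm/issues/22754 — looks roughly like this as a standalone sketch (hypothetical names; CUDA stands in for the NPU stream APIs):

```python
import torch

# Standalone sketch of the pinned-buffer + event-sync pattern the removed
# helper used (hypothetical names; falls back to a plain .tolist() on CPU).
def to_token_lists(sampled_token_ids: torch.Tensor) -> list[list[int]]:
    if sampled_token_ids.is_cuda:
        pinned = torch.empty(sampled_token_ids.shape,
                             dtype=sampled_token_ids.dtype,
                             pin_memory=True)
        pinned.copy_(sampled_token_ids, non_blocking=True)
        # Wait only for this copy via an event, instead of the stream-wide
        # sync that calling .tolist() on the device tensor would imply.
        done = torch.cuda.Event()
        done.record()
        done.synchronize()
        return pinned.tolist()
    return sampled_token_ids.tolist()

print(to_token_lists(torch.tensor([[1, 2], [3, 4]])))  # [[1, 2], [3, 4]]
```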