[CI] Upgrade vllm to newest commit (#3182)
### What this PR does / why we need it?
Upgrade vLLM to the newest commit:
- Fix the problem that aclgraph doesn't work, caused by 24fab45d96
- Fix the PoolerOutput import error, caused by 755ed7b05b
- Fix the aclgraph weight load error, keeping it consistent with the torchair fix: 4492e3a554

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
All tests should pass.

- vLLM version: v0.10.2
- vLLM main: 52d0cb8458

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
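Every hunk below follows the same compatibility pattern: branch on the pinned release with the `vllm_version_is` helper so the plugin works against both v0.10.2 and current vLLM main. A minimal sketch of the pattern, assuming the helper is importable from vllm-ascend's utils module:

```python
# Minimal sketch of the version-gating pattern used throughout this PR.
# Assumption: vllm_version_is is importable from vllm_ascend.utils and
# returns True when the installed vLLM matches the given release string.
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.10.2"):
    from vllm.sequence import PoolerOutput  # pinned release location
else:
    from vllm.v1.outputs import PoolerOutput  # moved on vLLM main
```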
```diff
@@ -64,11 +64,12 @@ from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
-from vllm.sequence import IntermediateTensors, PoolerOutput
+from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         LazyLoader, cdiv, get_dtype_size,
                         is_pin_memory_available)
+from vllm.utils.jsontree import json_map_leaves
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport, reorder_batch_to_split_decodes_and_prefills)
```
```diff
@@ -144,7 +145,9 @@ else:
 
 if not vllm_version_is("0.10.2"):
     from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
+    from vllm.v1.outputs import PoolerOutput
 else:
+    from vllm.sequence import PoolerOutput
     UniformTypeKVCacheSpecs = None
 
 
```
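An alternative way to absorb this kind of relocation, shown here only as a hedged sketch and not what the PR does, is to probe the new location first and fall back on ImportError, which avoids coupling the shim to an exact version string:

```python
# Hedged alternative to the version check above: try the new module first
# and fall back to the old one. Not used by this PR; illustrative only.
try:
    from vllm.v1.outputs import PoolerOutput  # vLLM main
except ImportError:
    from vllm.sequence import PoolerOutput  # older releases (<= v0.10.2)
```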
```diff
@@ -1806,18 +1809,30 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             device=hidden_states.device)
         seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]
 
-        # Pooling models D2H & synchronize occurs in pooler.py:build_output
-        raw_pooler_output = self.model.pooler(
-            hidden_states=hidden_states, pooling_metadata=pooling_metadata)
+        if vllm_version_is("0.10.2"):
+            # Pooling models D2H & synchronize occurs in pooler.py:build_output
+            raw_pooler_output = self.model.pooler(
+                hidden_states=hidden_states, pooling_metadata=pooling_metadata)
+        else:
+            model = cast(VllmModelForPooling, self.model)
+            raw_pooler_output = model.pooler(
+                hidden_states=hidden_states,
+                pooling_metadata=pooling_metadata,
+            )
+            raw_pooler_output = json_map_leaves(
+                lambda x: x.to("cpu", non_blocking=True),
+                raw_pooler_output,
+            )
+            torch.npu.synchronize()
 
         pooler_output: list[Optional[torch.Tensor]] = []
         for raw_output, seq_len, prompt_len in zip(
                 raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
-
-            if seq_len == prompt_len:
-                pooler_output.append(raw_output.data)
+            if vllm_version_is("0.10.2"):
+                output = raw_output.data if seq_len == prompt_len else None
             else:
-                pooler_output.append(None)
+                output = raw_output if seq_len == prompt_len else None
+            pooler_output.append(output)
 
         return ModelRunnerOutput(
             req_ids=self.input_batch.req_ids,
```
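The new branch moves the device-to-host copy out of the pooler: every leaf of the (possibly nested) pooler output is copied with `non_blocking=True`, and the copies are only guaranteed complete after `torch.npu.synchronize()`, which is why the explicit sync follows. A simplified, illustrative reimplementation of the leaf-mapping helper (not vLLM's actual `json_map_leaves`):

```python
from typing import Any, Callable

def map_leaves(fn: Callable[[Any], Any], tree: Any) -> Any:
    """Apply fn to every leaf of a nested dict/list/tuple structure."""
    if isinstance(tree, dict):
        return {k: map_leaves(fn, v) for k, v in tree.items()}
    if isinstance(tree, (list, tuple)):
        return type(tree)(map_leaves(fn, v) for v in tree)
    return fn(tree)  # leaf, e.g. a tensor

# Mirrors the hunk above: copy each leaf tensor to the host asynchronously,
# then synchronize once before any result is read.
# result = map_leaves(lambda x: x.to("cpu", non_blocking=True), raw_output)
# torch.npu.synchronize()
```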
```diff
@@ -2582,7 +2597,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for task in self.get_supported_pooling_tasks():
             # Run a full batch with each task to ensure none of them OOMs
             output = self._dummy_pooler_run_task(hidden_states, task)
-            output_size[task] = output.get_data_nbytes()
+            if vllm_version_is("0.10.2"):
+                output_size[task] = output.get_data_nbytes()
+            else:
+                output_size[task] = sum(o.nbytes for o in output)
             del output  # Allow GC
 
         max_task = max(output_size.items(), key=lambda x: x[1])[0]
```
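On vLLM main the dummy pooler run yields a sequence of per-request tensors rather than an object with `get_data_nbytes()`, so the memory footprint is summed tensor by tensor. A small illustration of the new accounting (shapes here are made up):

```python
import torch

# Illustrative only: fabricate a per-request pooler output and measure it
# the way the new branch does. Real shapes depend on the model's hidden size.
output = [torch.zeros(1024, dtype=torch.float32) for _ in range(8)]
total = sum(o.nbytes for o in output)
print(total)  # 8 requests * 1024 floats * 4 bytes = 32768
```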
```diff
@@ -116,6 +116,12 @@ class NPUWorker(WorkerBase):
         # Buffers saved before sleep
         self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
 
+        # FixMe: this is a patch to fix the issue caused by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170
+        from vllm.model_executor.layers.linear import \
+            WEIGHT_LOADER_V2_SUPPORTED
+        if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
+            WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
+
     def sleep(self, level: int = 1) -> None:
         if not sleep_mode_enabled():
             raise ValueError(
```
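The worker patch mutates vLLM's module-level `WEIGHT_LOADER_V2_SUPPORTED` list so that `UnquantizedLinearMethod` falls back to the v1 weight loader; the membership check keeps the patch idempotent if more than one worker is constructed in the same process. A standalone sketch of that guard-before-remove pattern (names below are generic, not vLLM's):

```python
# Generic guard-before-remove pattern, as used in the worker patch above.
# list.remove() raises ValueError on a missing element, so the membership
# test makes repeated application a no-op instead of an error.
SUPPORTED = ["MethodA", "UnquantizedLinearMethod", "MethodB"]

def disable(method: str, registry: list[str]) -> None:
    if method in registry:
        registry.remove(method)

disable("UnquantizedLinearMethod", SUPPORTED)
disable("UnquantizedLinearMethod", SUPPORTED)  # safe to call again
print(SUPPORTED)  # ['MethodA', 'MethodB']
```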