upgrade to vllm 0.11.2 (#4400)
Bump vLLM version to v0.11.2 What's broken and changed by vLLM: 1. structured_output is broken by https://github.com/vllm-project/vllm/pull/26866 2. get_mrope_input_positions is broken by https://github.com/vllm-project/vllm/pull/28399 3. graph mode is broken by https://github.com/vllm-project/vllm/pull/25110 we'll upgrade torch to 2.8 to fix the problem later 4. embedding is broken by https://github.com/vllm-project/vllm/pull/27583 5. `get_attn_backend_cls` and attention backend is broken are broken by https://github.com/vllm-project/vllm/pull/28534 6. spec decode is broken by https://github.com/vllm-project/vllm/pull/28771 7. sp feature is broken by https://github.com/vllm-project/vllm/pull/27126 8. mtp is broken by https://github.com/vllm-project/vllm/pull/27922 9. lora is broken by https://github.com/vllm-project/vllm/pull/21068 10. execute_model is broken by https://github.com/vllm-project/vllm/pull/26866 11. `VLLM_DISABLE_SHARED_EXPERTS_STREAM` env is broken by https://github.com/vllm-project/vllm/pull/28159 12. kv cahe is broken by https://github.com/vllm-project/vllm/pull/27753 13. dp is broken by https://github.com/vllm-project/vllm/pull/25110 What's broken and changed by ourself: 1. qwen vl is broken by https://github.com/vllm-project/vllm/pull/28455 We'll remove model files in the future to avoid this kind of error 2. Engine core is broken by https://github.com/vllm-project/vllm/pull/23691 We'll remove the patch file in the future. 3. Ascend scheduler is broken by https://github.com/vllm-project/vllm/pull/28733 We'll remove ascend scheudler later. 4. qwen3-next is broken by https://github.com/vllm-project/vllm/pull/28083 We'll remove model files in the future to avoid this kind of error 5. qwen vl is broken by https://github.com/vllm-project/vllm/pull/27764. We'll remove model files in the future Known issue: 1. ray doesn't work 2. the accuracy of qwen3-next is not correct 3. qwen3-vl is broken 4. prefix cache+ ascend scheduler + deepseek v2 lite is broken. Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: leo-pony <nengjunma@outlook.com> Co-authored-by: 22dimensions <waitingwind@foxmail.com> Co-authored-by: shen-shanshan <467638484@qq.com> - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: hfadzxy <starmoon_zhang@163.com> Signed-off-by: leo-pony <nengjunma@outlook.com> Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
@@ -152,7 +152,7 @@ if prefill_context_parallel_enable():
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import xgrammar as xgr # type: ignore[import-untyped]
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||
else:
|
||||
xgr = LazyLoader("xgr", globals(), "xgrammar")
|
||||
|
||||
@@ -243,15 +243,32 @@ class AsyncNPUModelRunnerOutput(AsyncModelRunnerOutput):
|
||||
# Release the device tensor once the copy has completed
|
||||
del self._sampled_token_ids
|
||||
|
||||
valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
|
||||
valid_sampled_token_ids: list[np.ndarray] = [
|
||||
row for row in self._sampled_token_ids_cpu.numpy()
|
||||
]
|
||||
for i in self._invalid_req_indices:
|
||||
valid_sampled_token_ids[i].clear()
|
||||
valid_sampled_token_ids[i] = np.array([])
|
||||
|
||||
output = self._model_runner_output
|
||||
output.sampled_token_ids = valid_sampled_token_ids
|
||||
return output
|
||||
|
||||
|
||||
class ExecuteModelState(NamedTuple):
|
||||
"""Ephemeral cached state transferred between execute_model() and
|
||||
sample_tokens(), after execute_model() returns None."""
|
||||
|
||||
scheduler_output: "SchedulerOutput"
|
||||
logits: torch.Tensor
|
||||
spec_decode_metadata: SpecDecodeMetadata | None
|
||||
hidden_states: torch.Tensor
|
||||
sample_hidden_states: torch.Tensor
|
||||
aux_hidden_states: list[torch.Tensor] | None
|
||||
kv_connector_output: KVConnectorOutput | None
|
||||
attn_metadata: dict[str, Any]
|
||||
positions: torch.Tensor
|
||||
|
||||
|
||||
class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig, device: torch.device):
|
||||
@@ -604,6 +621,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# TODO: EVS Support (Video tokens pruning) (see vllm#22980)
|
||||
self.is_multimodal_pruning_enabled = False
|
||||
|
||||
# Ephemeral state transferred between execute_model() and sample_tokens().
|
||||
self.execute_model_state: ExecuteModelState | None = None
|
||||
|
||||
self.transfer_event = torch.npu.Event()
|
||||
|
||||
def _set_up_drafter(self):
|
||||
# Set up speculative decoding.
|
||||
self.spec_attn_mask = None
|
||||
@@ -865,39 +887,12 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.input_batch.refresh_metadata()
|
||||
|
||||
def _init_mrope_positions(self, req_state: CachedRequestState):
|
||||
image_grid_thw = []
|
||||
video_grid_thw = []
|
||||
second_per_grid_ts = []
|
||||
audio_feature_lengths = []
|
||||
use_audio_in_video = False
|
||||
assert req_state.mm_features is not None
|
||||
for mm_feature in req_state.mm_features:
|
||||
mm_item = mm_feature.data
|
||||
if mm_item is None:
|
||||
continue
|
||||
mm_input = mm_item.get_data()
|
||||
if (t := mm_input.get("image_grid_thw")) is not None:
|
||||
image_grid_thw.append(t.tolist())
|
||||
if (t := mm_input.get("video_grid_thw")) is not None:
|
||||
video_grid_thw.append(t.tolist())
|
||||
if (t := mm_input.get("second_per_grid_ts")) is not None:
|
||||
second_per_grid_ts.append(t)
|
||||
if (t := mm_input.get("audio_feature_lengths")) is not None:
|
||||
audio_feature_lengths.append(t)
|
||||
if mm_input.get("use_audio_in_video") is True:
|
||||
use_audio_in_video = True
|
||||
|
||||
if supports_mrope(self.model):
|
||||
req_state.mrope_positions, req_state.mrope_position_delta = \
|
||||
self.model.get_mrope_input_positions(
|
||||
req_state.prompt_token_ids,
|
||||
hf_config=self.model_config.hf_config,
|
||||
image_grid_thw=image_grid_thw,
|
||||
video_grid_thw=video_grid_thw,
|
||||
second_per_grid_ts=second_per_grid_ts,
|
||||
audio_feature_lengths=audio_feature_lengths,
|
||||
use_audio_in_video=use_audio_in_video,
|
||||
)
|
||||
assert supports_mrope(self.model), "MROPE is not supported"
|
||||
req_state.mrope_positions, req_state.mrope_position_delta = \
|
||||
self.model.get_mrope_input_positions(
|
||||
req_state.prompt_token_ids,
|
||||
req_state.mm_features,
|
||||
)
|
||||
|
||||
def _sync_metadata_across_dp(
|
||||
self, num_tokens: int,
|
||||
@@ -1084,8 +1079,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# 2. A list or tuple (length: num_items) of tensors, each of shape
|
||||
# (feature_size, hidden_size) in case the feature size is dynamic
|
||||
# depending on the input multimodal items.
|
||||
curr_group_outputs = self.model.get_multimodal_embeddings(
|
||||
**mm_kwargs_group)
|
||||
curr_group_outputs = self.model.embed_multimodal(**mm_kwargs_group)
|
||||
|
||||
sanity_check_mm_encoder_outputs(
|
||||
curr_group_outputs,
|
||||
@@ -1636,7 +1630,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
mm_embeds, is_mm_embed = self._gather_mm_embeddings(
|
||||
scheduler_output)
|
||||
|
||||
inputs_embeds = self.model.get_input_embeddings(
|
||||
inputs_embeds = self.model.embed_input_ids(
|
||||
input_ids,
|
||||
multimodal_embeddings=mm_embeds,
|
||||
is_multimodal=is_mm_embed,
|
||||
@@ -1666,7 +1660,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# Some tokens ids may need to become embeds
|
||||
if token_ids_idx.numel() > 0:
|
||||
token_ids = self.input_ids[token_ids_idx]
|
||||
tokens_to_embeds = self.model.get_input_embeddings(
|
||||
tokens_to_embeds = self.model.embed_input_ids(
|
||||
input_ids=token_ids)
|
||||
self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds
|
||||
|
||||
@@ -2075,9 +2069,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
def apply_grammar_bitmask(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
grammar_output: "GrammarOutput",
|
||||
logits: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
grammar_bitmask = scheduler_output.grammar_bitmask
|
||||
grammar_bitmask = grammar_output.grammar_bitmask
|
||||
|
||||
# We receive the structured output bitmask from the scheduler,
|
||||
# compacted to contain bitmasks only for structured output requests.
|
||||
@@ -2096,7 +2091,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
logit_index = batch_index + cumulative_offset
|
||||
cumulative_offset += len(
|
||||
scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
|
||||
if req_id in scheduler_output.structured_output_request_ids:
|
||||
if req_id in grammar_output.structured_output_request_ids:
|
||||
struct_out_req_batch_indices[req_id] = logit_index
|
||||
|
||||
out_indices = []
|
||||
@@ -2106,7 +2101,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
shape=(logits.shape[0],
|
||||
grammar_bitmask.shape[1]))
|
||||
cumulative_index = 0
|
||||
for req_id in scheduler_output.structured_output_request_ids:
|
||||
for req_id in grammar_output.structured_output_request_ids:
|
||||
num_spec_tokens = len(
|
||||
scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
|
||||
if req_id in struct_out_req_batch_indices:
|
||||
@@ -2137,7 +2132,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
def propose_draft_token_ids(
|
||||
self,
|
||||
valid_sampled_token_ids: Union[torch.Tensor, list[list[int]]],
|
||||
valid_sampled_token_ids: Union[torch.Tensor, list[np.ndarray]],
|
||||
sampling_metadata: SamplingMetadata,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
spec_decode_metadata: SpecDecodeMetadata,
|
||||
@@ -2270,7 +2265,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||
) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]:
|
||||
) -> Union[ModelRunnerOutput, IntermediateTensors] | None:
|
||||
if self.execute_model_state is not None:
|
||||
raise RuntimeError("State error: sample_tokens() must be called "
|
||||
"after execute_model() returns None.")
|
||||
|
||||
with ProfileExecuteDuration().capture_async("prepare input"):
|
||||
self._update_states(scheduler_output)
|
||||
if not scheduler_output.total_num_scheduled_tokens:
|
||||
@@ -2399,8 +2398,46 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
logits = model_output_broadcast_data["logits"]
|
||||
|
||||
# Apply structured output bitmasks if present
|
||||
if scheduler_output.structured_output_request_ids:
|
||||
logits = self.apply_grammar_bitmask(scheduler_output, logits)
|
||||
self.execute_model_state = ExecuteModelState(
|
||||
scheduler_output,
|
||||
logits,
|
||||
spec_decode_metadata,
|
||||
hidden_states,
|
||||
sample_hidden_states,
|
||||
aux_hidden_states,
|
||||
kv_connector_output,
|
||||
attn_metadata,
|
||||
positions,
|
||||
)
|
||||
return None
|
||||
|
||||
@torch.inference_mode
|
||||
def sample_tokens(
|
||||
self, grammar_output: "GrammarOutput | None"
|
||||
) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors:
|
||||
if self.execute_model_state is None:
|
||||
# Nothing to do (PP non-final rank case), output isn't used.
|
||||
return None # noqa
|
||||
need_dump = self.dump_enable and self.debugger is not None
|
||||
# Unpack ephemeral state.
|
||||
(
|
||||
scheduler_output,
|
||||
logits,
|
||||
spec_decode_metadata,
|
||||
hidden_states,
|
||||
sample_hidden_states,
|
||||
aux_hidden_states,
|
||||
kv_connector_output,
|
||||
attn_metadata,
|
||||
positions,
|
||||
) = self.execute_model_state
|
||||
# Clear ephemeral state.
|
||||
self.execute_model_state = None
|
||||
|
||||
# Apply structured output bitmasks if present.
|
||||
if grammar_output is not None:
|
||||
logits = self.apply_grammar_bitmask(scheduler_output,
|
||||
grammar_output, logits)
|
||||
|
||||
with ProfileExecuteDuration().capture_async("Sample"):
|
||||
# Sample the next token and get logprobs if needed.
|
||||
@@ -2475,17 +2512,19 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# Get the valid generated tokens.
|
||||
max_gen_len = sampled_token_ids.shape[-1]
|
||||
if max_gen_len == 1:
|
||||
# No spec decode tokens.
|
||||
valid_sampled_token_ids = sampled_token_ids.tolist()
|
||||
# No spec decode tokens. It's a tensor.
|
||||
valid_sampled_token_ids: list[np.ndarray] = [
|
||||
row for row in sampled_token_ids.cpu().numpy()
|
||||
]
|
||||
else:
|
||||
# Includes spec decode tokens.
|
||||
# Includes spec decode tokens. It's a numpy array
|
||||
valid_sampled_token_ids = self.rejection_sampler.parse_output(
|
||||
sampled_token_ids,
|
||||
self.input_batch.vocab_size,
|
||||
)
|
||||
# Mask out the sampled tokens that should not be sampled.
|
||||
for i in discard_sampled_tokens_req_indices:
|
||||
valid_sampled_token_ids[int(i)].clear()
|
||||
valid_sampled_token_ids[int(i)] = np.array([])
|
||||
else:
|
||||
valid_sampled_token_ids = []
|
||||
invalid_req_indices = discard_sampled_tokens_req_indices.tolist(
|
||||
@@ -2511,16 +2550,17 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# the sampled tokens back, because there's no direct communication
|
||||
# between the first-stage worker and the last-stage worker.
|
||||
for req_idx in range(num_sampled_tokens):
|
||||
sampled_ids: np.ndarray | None
|
||||
if self.use_async_scheduling:
|
||||
sampled_ids = [-1] * 1 if \
|
||||
req_idx not in invalid_req_indices_set else None
|
||||
sampled_ids = (np.array([-1]) if req_idx
|
||||
not in invalid_req_indices_set else None)
|
||||
else:
|
||||
sampled_ids = valid_sampled_token_ids[req_idx]
|
||||
if not sampled_ids:
|
||||
if sampled_ids is None or sampled_ids.shape[0] == 0:
|
||||
continue
|
||||
|
||||
start_idx = self.input_batch.num_tokens_no_spec[req_idx]
|
||||
end_idx = start_idx + len(sampled_ids)
|
||||
end_idx = start_idx + sampled_ids.shape[0]
|
||||
assert end_idx <= self.model_config.max_model_len, (
|
||||
"Sampled token IDs exceed the max model length. "
|
||||
f"Total number of tokens: {end_idx} > max_model_len: "
|
||||
@@ -2534,7 +2574,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.input_batch.num_tokens[req_idx] = end_idx
|
||||
req_id = self.input_batch.req_ids[req_idx]
|
||||
req_state = self.requests[req_id]
|
||||
req_state.output_token_ids.extend(sampled_ids)
|
||||
req_state.output_token_ids.extend(sampled_ids.tolist())
|
||||
|
||||
def propose_draft_token_ids(sampled_token_ids):
|
||||
assert self.spec_decode_common_attn_metadata is not None
|
||||
@@ -2898,12 +2938,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
assert len(num_scheduled_tokens_list) == num_reqs
|
||||
num_scheduled_tokens = np.array(num_scheduled_tokens_list,
|
||||
dtype=np.int32)
|
||||
num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
|
||||
|
||||
if not self.in_profile_run and self.dynamic_eplb:
|
||||
self.eplb_updator.forward_before()
|
||||
|
||||
with self.maybe_dummy_run_with_lora(self.lora_config,
|
||||
num_scheduled_tokens):
|
||||
num_scheduled_tokens,
|
||||
num_sampled_tokens):
|
||||
if self.is_multimodal_model:
|
||||
input_ids = None
|
||||
inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
|
||||
@@ -3658,9 +3700,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
for k, v in attn_backend_layers.items()
|
||||
}
|
||||
|
||||
def create_attn_groups(
|
||||
attn_backends_map: dict[AttentionBackend, list[str]],
|
||||
) -> list[AttentionGroup]:
|
||||
def create_attn_groups(attn_backends_map: dict[AttentionBackend,
|
||||
list[str]],
|
||||
kv_cache_group_id: int) -> list[AttentionGroup]:
|
||||
attn_groups: list[AttentionGroup] = []
|
||||
for (attn_backend,
|
||||
kv_cache_spec), layer_names in attn_backends_map.items():
|
||||
@@ -3671,16 +3713,17 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.vllm_config,
|
||||
self.device,
|
||||
))
|
||||
attn_group = AttentionGroup(attn_backend,
|
||||
attn_metadata_builders,
|
||||
layer_names, kv_cache_spec)
|
||||
attn_group = AttentionGroup(attn_backend, layer_names,
|
||||
kv_cache_spec, kv_cache_group_id,
|
||||
attn_metadata_builders)
|
||||
attn_groups.append(attn_group)
|
||||
return attn_groups
|
||||
|
||||
for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
|
||||
for i, kv_cache_group_spec in enumerate(
|
||||
kv_cache_config.kv_cache_groups):
|
||||
attn_backends = get_attn_backends_for_group( # type: ignore
|
||||
kv_cache_group_spec)
|
||||
self.attn_groups.append(create_attn_groups(attn_backends))
|
||||
self.attn_groups.append(create_attn_groups(attn_backends, i))
|
||||
|
||||
# Calculate reorder batch threshold (if needed)
|
||||
self.calculate_reorder_batch_threshold()
|
||||
@@ -3823,8 +3866,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
graph_support = builder.aclgraph_support.value
|
||||
builder_aclgraph = builder.aclgraph_support
|
||||
else:
|
||||
graph_support = builder.cudagraph_support.value
|
||||
builder_aclgraph = builder.cudagraph_support
|
||||
graph_support = builder._cudagraph_support.value
|
||||
builder_aclgraph = builder._cudagraph_support
|
||||
if graph_support < min_ag_support.value:
|
||||
min_ag_support = builder_aclgraph
|
||||
min_ag_builder_name = builder.__class__.__name__
|
||||
@@ -4422,3 +4465,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.input_ids_pcp_full_cpu[:total_num_scheduled_tokens_pcp_full],
|
||||
non_blocking=True,
|
||||
)
|
||||
|
||||
def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]:
|
||||
# This is a short term mitigation for issue mentioned in
|
||||
# https://github.com/vllm-project/vllm/issues/22754.
|
||||
# `tolist` would trigger a cuda wise stream sync, which
|
||||
# would block other copy ops from other cuda streams.
|
||||
# A cuda event sync would avoid such a situation. Since
|
||||
# this is in the critical path of every single model
|
||||
# forward loop, this has caused perf issue for a disagg
|
||||
# setup.
|
||||
pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]]
|
||||
pinned.copy_(sampled_token_ids, non_blocking=True)
|
||||
self.transfer_event.record()
|
||||
self.transfer_event.synchronize()
|
||||
return [row for row in pinned.numpy()]
|
||||
|
||||
@@ -829,7 +829,7 @@ class InputBatch:
|
||||
non_blocking=True)
|
||||
|
||||
def make_lora_inputs(
|
||||
self, num_scheduled_tokens: np.ndarray
|
||||
self, num_scheduled_tokens: np.ndarray, num_sampled_tokens: np.ndarray
|
||||
) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]:
|
||||
"""
|
||||
Given the num_scheduled_tokens for each request in the batch, return
|
||||
|
||||
@@ -18,7 +18,8 @@
|
||||
#
|
||||
|
||||
import copy
|
||||
from typing import Optional, Union
|
||||
from types import NoneType
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -37,7 +38,7 @@ from vllm.sequence import IntermediateTensors
|
||||
from vllm.tasks import SupportedTask
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
|
||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
|
||||
DraftTokenIds, ModelRunnerOutput)
|
||||
@@ -206,6 +207,14 @@ class NPUWorker(WorkerBase):
|
||||
device = torch.device(f"npu:{self.local_rank}")
|
||||
NPUPlatform.set_device(device)
|
||||
NPUPlatform.empty_cache()
|
||||
|
||||
visible_device_count = (torch.npu.device_count()
|
||||
if torch.npu.is_available() else 0)
|
||||
assert self.parallel_config.local_world_size <= visible_device_count, (
|
||||
f"local_world_size ({self.parallel_config.local_world_size}) must be "
|
||||
f"less than or equal to the number of visible devices "
|
||||
f"({visible_device_count}).")
|
||||
|
||||
self.init_npu_memory = NPUPlatform.mem_get_info()[0]
|
||||
# Initialize the distributed environment.
|
||||
self._init_worker_distributed_environment()
|
||||
@@ -266,7 +275,7 @@ class NPUWorker(WorkerBase):
|
||||
def execute_model(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
|
||||
) -> ModelRunnerOutput | None:
|
||||
# enable msMonitor to monitor the performance of vllm-ascend
|
||||
if envs_ascend.MSMONITOR_USE_DAEMON:
|
||||
dp.step()
|
||||
@@ -280,7 +289,7 @@ class NPUWorker(WorkerBase):
|
||||
|
||||
output = self.model_runner.execute_model(scheduler_output,
|
||||
intermediate_tensors)
|
||||
if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)):
|
||||
if isinstance(output, (ModelRunnerOutput, NoneType)):
|
||||
return output
|
||||
|
||||
assert isinstance(output, IntermediateTensors)
|
||||
@@ -304,6 +313,12 @@ class NPUWorker(WorkerBase):
|
||||
output.kv_connector_output = kv_connector_output
|
||||
return output
|
||||
|
||||
@torch.inference_mode()
|
||||
def sample_tokens(
|
||||
self, grammar_output: "GrammarOutput"
|
||||
) -> ModelRunnerOutput | AsyncModelRunnerOutput:
|
||||
return self.model_runner.sample_tokens(grammar_output)
|
||||
|
||||
def load_model(self) -> None:
|
||||
if self.vllm_config.model_config.enable_sleep_mode:
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
|
||||
Reference in New Issue
Block a user