Drop 0.10.2 (#3284)
Drop vLLM v0.10.2 support; vLLM v0.11.0rc3 is now the supported version.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
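Every change in this diff follows the same pattern: a `vllm_version_is("0.10.2")` guard is deleted and only the vLLM 0.11 code path is kept. The sketch below is illustrative only; the guard helper and the builder accessor names are taken from the diff, but the stub class and return values are hypothetical stand-ins, not the real vllm-ascend objects.

# Illustrative stand-in for vllm_ascend.utils.vllm_version_is, which compares
# against the installed vLLM version. With 0.10.2 unsupported, the "0.10.2"
# check can never be true, so the guarded branch is dead code.
def vllm_version_is(version: str) -> bool:
    return False

class _Group:
    """Hypothetical stub standing in for an attention group; only the accessor matters."""
    def get_metadata_builder(self):
        return "builder"

attn_group = _Group()

# Before this commit: two code paths kept alive behind the version guard.
if vllm_version_is("0.10.2"):
    builder = attn_group.metadata_builder  # legacy 0.10.2 attribute, never reached here
else:
    builder = attn_group.get_metadata_builder()

# After this commit: the guard and the legacy branch are removed outright.
builder = attn_group.get_metadata_builder()

The same collapse is applied to the 0.10.2-only helpers (`_init_mrope_positions_0102`, `_batch_mm_kwargs_from_scheduler_0102`, `_compute_logits_wrapper`, and the v0102 attention-group builders), which are deleted in the hunks below.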
@@ -78,10 +78,12 @@ from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
 # yapf: disable
 from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
                                         KVCacheConfig, KVCacheGroupSpec,
-                                        KVCacheSpec, MambaSpec)
+                                        KVCacheSpec, MambaSpec,
+                                        UniformTypeKVCacheSpecs)
 # yapf: enable
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
-                             DraftTokenIds, LogprobsTensors, ModelRunnerOutput)
+                             DraftTokenIds, LogprobsTensors, ModelRunnerOutput,
+                             PoolerOutput)
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -121,7 +123,7 @@ from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                AscendSocVersion, ProfileExecuteDuration,
                                get_ascend_soc_version, is_310p,
-                               lmhead_tp_enable, vllm_version_is)
+                               lmhead_tp_enable)
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch

 if TYPE_CHECKING:
@@ -143,13 +145,6 @@ if is_310p():
 else:
     ACL_FORMAT = ACL_FORMAT_FRACTAL_ND

-if not vllm_version_is("0.10.2"):
-    from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
-    from vllm.v1.outputs import PoolerOutput
-else:
-    from vllm.sequence import PoolerOutput
-    UniformTypeKVCacheSpecs = None
-

 @dataclass
 class GraphCaptureContext:
@@ -308,23 +303,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             dtype=self.dtype,
             device=self.device)
         # Set up Attention
-        if vllm_version_is("0.10.2"):
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                self.model_config.is_attention_free,
-                use_mla=self.model_config.use_mla,
-                use_sfa=self.ascend_config.use_sfa)
-        else:
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sfa=self.ascend_config.use_sfa)
+        self.attn_backend = get_attn_backend(
+            0,
+            self.dtype,
+            None,
+            self.block_size,
+            use_mla=self.model_config.use_mla,
+            use_sfa=self.ascend_config.use_sfa)
         if torch.version.cann.startswith("8.3"):
             self.attn_mask_builder = AttentionMaskBuilder(
                 self.scheduler_config.max_num_batched_tokens, self.dtype,
@@ -602,12 +587,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 to_update.apply(pooling_params)

             backward_kwargs = {}
-            if vllm_version_is("0.10.2"):
-                backward_kwargs["mm_kwargs"] = new_req_data.mm_kwargs
-                backward_kwargs["mm_hashes"] = new_req_data.mm_hashes
-                backward_kwargs["mm_positions"] = new_req_data.mm_positions
-            else:
-                backward_kwargs["mm_features"] = new_req_data.mm_features
+            backward_kwargs["mm_features"] = new_req_data.mm_features

             self.requests[req_id] = CachedRequestState(
                 req_id=req_id,
@@ -624,10 +604,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             if self.uses_mrope:
-                if vllm_version_is("0.10.2"):
-                    self._init_mrope_positions_0102(self.requests[req_id])
-                else:
-                    self._init_mrope_positions(self.requests[req_id])
+                self._init_mrope_positions(self.requests[req_id])

             req_ids_to_add.append(req_id)

@@ -759,39 +736,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             use_audio_in_video=use_audio_in_video,
         )

-    def _init_mrope_positions_0102(self, req_state: CachedRequestState):
-        image_grid_thw = []
-        video_grid_thw = []
-        second_per_grid_ts = []
-        audio_feature_lengths = []
-        use_audio_in_video = False
-        assert req_state.mm_kwargs is not None
-        for mm_item in req_state.mm_kwargs:
-            mm_input = mm_item.get_data()
-            if mm_input.get("image_grid_thw") is not None:
-                image_grid_thw.append(mm_input["image_grid_thw"].tolist())
-            if mm_input.get("video_grid_thw") is not None:
-                video_grid_thw.append(mm_input["video_grid_thw"].tolist())
-            if mm_input.get("second_per_grid_ts") is not None:
-                second_per_grid_ts.append(mm_input["second_per_grid_ts"])
-            if mm_input.get("audio_feature_lengths") is not None:
-                audio_feature_lengths.append(mm_input["audio_feature_lengths"])
-            if mm_input.get("use_audio_in_video") is True:
-                use_audio_in_video = True
-
-        hf_config = self.model_config.hf_config
-
-        req_state.mrope_positions, req_state.mrope_position_delta = \
-            MRotaryEmbedding.get_input_positions_tensor(
-                req_state.prompt_token_ids,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
-            )
-
     def _sync_metadata_across_dp(
             self, num_tokens: int, with_prefill: bool, enable_dbo: bool
     ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
@@ -966,12 +910,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             return

         # Batch the multi-modal inputs.
-        if vllm_version_is("0.10.2"):
-            mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler_0102(
-                scheduler_output)
-        else:
-            mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
-                scheduler_output)
+        mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
+            scheduler_output)
         encoder_outputs = []

         for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
@@ -1003,31 +943,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 is_embed=pos_info.is_embed,
             )

-    # TODO: remove this once we drop support for vLLM 0.10.2
-    def _batch_mm_kwargs_from_scheduler_0102(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
-        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
-        if not scheduled_encoder_inputs:
-            return [], []
-        # Batch the multi-modal inputs.
-        mm_kwargs = list[MultiModalKwargsItem]()
-        # list of tuple (mm_hash, position_info)
-        mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
-        for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
-            req_state = self.requests[req_id]
-            assert req_state.mm_hashes is not None
-            assert req_state.mm_kwargs is not None
-            assert req_state.mm_positions is not None
-            for mm_input_id in encoder_input_ids:
-                mm_hash = req_state.mm_hashes[mm_input_id]
-                mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
-                mm_hashes_pos.append(
-                    (mm_hash, req_state.mm_positions[mm_input_id]))
-
-        return mm_kwargs, mm_hashes_pos
-
     def _batch_mm_kwargs_from_scheduler(
         self,
         scheduler_output: "SchedulerOutput",
@@ -1067,20 +982,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
     ) -> list[torch.Tensor]:

         def _iter_mm_features(req_state: CachedRequestState):
-            if vllm_version_is("0.10.2"):
-                # legacy path (to be removed later)
-                assert req_state.mm_hashes is not None
-                assert req_state.mm_positions is not None
-                for mm_hash, pos_info in zip(req_state.mm_hashes,
-                                             req_state.mm_positions):
-                    yield mm_hash, pos_info, getattr(pos_info, "is_embed",
-                                                     None)
-            else:
-                assert req_state.mm_features is not None
-                for mm_feature in req_state.mm_features:
-                    pos_info = mm_feature.mm_position
-                    yield mm_feature.identifier, pos_info, getattr(
-                        pos_info, "is_embed", None)
+            assert req_state.mm_features is not None
+            for mm_feature in req_state.mm_features:
+                pos_info = mm_feature.mm_position
+                yield mm_feature.identifier, pos_info, getattr(
+                    pos_info, "is_embed", None)

         mm_embeds: list[torch.Tensor] = []

@@ -1527,10 +1433,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for attn_group in self.attn_groups[kv_cache_group_id]:
             common_prefix_len = 0
             extra_attn_metadata_args = {}
-            if vllm_version_is("0.10.2"):
-                builder = attn_group.metadata_builder
-            else:
-                builder = attn_group.get_metadata_builder()
+            builder = attn_group.get_metadata_builder()
             if isinstance(builder, GDNAttentionMetadataBuilder):
                 if use_spec_decode:
                     extra_attn_metadata_args = dict(
@@ -1809,29 +1712,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             device=hidden_states.device)
         seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]

-        if vllm_version_is("0.10.2"):
-            # Pooling models D2H & synchronize occurs in pooler.py:build_output
-            raw_pooler_output = self.model.pooler(
-                hidden_states=hidden_states, pooling_metadata=pooling_metadata)
-        else:
-            model = cast(VllmModelForPooling, self.model)
-            raw_pooler_output = model.pooler(
-                hidden_states=hidden_states,
-                pooling_metadata=pooling_metadata,
-            )
-            raw_pooler_output = json_map_leaves(
-                lambda x: x.to("cpu", non_blocking=True),
-                raw_pooler_output,
-            )
-            torch.npu.synchronize()
+        model = cast(VllmModelForPooling, self.model)
+        raw_pooler_output = model.pooler(
+            hidden_states=hidden_states,
+            pooling_metadata=pooling_metadata,
+        )
+        raw_pooler_output = json_map_leaves(
+            lambda x: x.to("cpu", non_blocking=True),
+            raw_pooler_output,
+        )
+        torch.npu.synchronize()

         pooler_output: list[Optional[torch.Tensor]] = []
         for raw_output, seq_len, prompt_len in zip(
                 raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
-            if vllm_version_is("0.10.2"):
-                output = raw_output.data if seq_len == prompt_len else None
-            else:
-                output = raw_output if seq_len == prompt_len else None
+            output = raw_output if seq_len == prompt_len else None
             pooler_output.append(output)

         return ModelRunnerOutput(
@@ -2006,8 +1901,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 num_scheduled_tokens_np, finished_sending,
                 finished_recving, kv_connector_output)
         sample_hidden_states = hidden_states[logits_indices]
-        logits = self._compute_logits_wrapper(sample_hidden_states,
-                                              None)
+        logits = self.model.compute_logits(sample_hidden_states)
         if broadcast_pp_output:
             model_output_broadcast_data = {
                 "logits": logits.contiguous(),
@@ -2302,10 +2196,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             )

             for attn_group in self.attn_groups[kv_cache_group_id]:
-                if vllm_version_is("0.10.2"):
-                    builder = attn_group.metadata_builder
-                else:
-                    builder = attn_group.get_metadata_builder()
+                builder = attn_group.get_metadata_builder()
                 attn_metadata_i = builder.build_for_graph_capture(
                     common_attn_metadata)
                 for layer_name in kv_cache_group_spec.layer_names:
@@ -2463,8 +2354,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                      dtype=torch.int32)

         def dummy_compute_logits(hidden_states):
-            return self._compute_logits_wrapper(
-                hidden_states[dummy_indices], None)
+            return self.model.compute_logits(
+                hidden_states[dummy_indices])

         with set_ascend_forward_context(
                 attn_metadata,
@@ -2542,18 +2433,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         logit_indices = np.cumsum(num_scheduled_tokens) - 1
         # TODO: need to rum a dummy sampler for generate task
         hidden_states = hidden_states[logit_indices]
-        output = self._compute_logits_wrapper(hidden_states, None)
+        output = self.model.compute_logits(hidden_states)

         NPUPlatform.synchronize()
         del hidden_states, output
         self.encoder_cache.clear()
         gc.collect()

-    def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
-        if vllm_version_is("0.10.2"):
-            return self.model.compute_logits(hidden_states, sampling_metadata)
-        return self.model.compute_logits(hidden_states)
-
     def _dummy_pooler_run_task(
         self,
         hidden_states: torch.Tensor,
@@ -2615,10 +2501,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for task in self.get_supported_pooling_tasks():
             # Run a full batch with each task to ensure none of them OOMs
             output = self._dummy_pooler_run_task(hidden_states, task)
-            if vllm_version_is("0.10.2"):
-                output_size[task] = output.get_data_nbytes()
-            else:
-                output_size[task] = sum(o.nbytes for o in output)
+            output_size[task] = sum(o.nbytes for o in output)
             del output  # Allow GC

         max_task = max(output_size.items(), key=lambda x: x[1])[0]
@@ -2657,16 +2540,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     self.model.get_eagle3_aux_hidden_state_layers())

             if self.lora_config:
-                if vllm_version_is("0.10.2"):
-                    self.model = self.load_lora_model(self.model,
-                                                      self.model_config,
-                                                      self.scheduler_config,
-                                                      self.lora_config,
-                                                      self.device)
-                else:
-                    self.model = self.load_lora_model(self.model,
-                                                      self.vllm_config,
-                                                      self.device)
+                self.model = self.load_lora_model(self.model, self.vllm_config,
+                                                  self.device)
             logger.info("Loading model weights took %.4f GB",
                         m.consumed_memory / float(2**30))

@@ -2694,17 +2569,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.initialize_attn_backend(kv_cache_config)
         self.use_hybrid_blocks = (len(self.attn_groups) > 1)
         # NOTE: Currently, we determine whether we need `num_accepted_tokens` through `MambaSpec`.
-        if vllm_version_is("0.10.2"):
-            self.need_accepted_tokens = any([
-                isinstance(
-                    self.kv_cache_config.kv_cache_groups[0].kv_cache_spec,
-                    MambaSpec) for attn_group in self.attn_groups
-            ])
-        else:
-            self.need_accepted_tokens = any([
-                isinstance(attn_group[0].kv_cache_spec, MambaSpec)
-                for attn_group in self.attn_groups
-            ])
+        self.need_accepted_tokens = any([
+            isinstance(attn_group[0].kv_cache_spec, MambaSpec)
+            for attn_group in self.attn_groups
+        ])

         self.may_reinitialize_input_batch(kv_cache_config)

@@ -2737,11 +2605,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size

         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -2846,11 +2711,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size

         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -2996,11 +2858,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         )), "Some layers are not correctly initialized"

         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -3211,50 +3070,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 for k, v in attn_backend_layers.items()
             }

-        def get_attn_backends_for_layers(
-                layer_names: list[str]
-        ) -> dict[type[AttentionBackend], list[str]]:
-            """Get attention_backend for all attention layers
-            TODO: Only used in v0.10.2, drop me when 0.10.2 is dropped
-            """
-            layers = get_layers_from_vllm_config(self.vllm_config,
-                                                 AttentionLayerBase,
-                                                 layer_names)
-            attn_backends = {}
-            attn_backend_layers = defaultdict(list)
-            # Dedupe based on full class name; this is a bit safer than
-            # using the class itself as the key because when we create dynamic
-            # attention backend subclasses (e.g. ChunkedLocalAttention) unless
-            # they are cached correctly, there will be different objects per
-            # layer.
-            for layer_name in layer_names:
-                attn_backend = layers[layer_name].get_attn_backend()
-                key = attn_backend.full_cls_name()
-                attn_backends[key] = attn_backend
-                attn_backend_layers[key].append(layer_name)
-            return {
-                attn_backends[k]: v
-                for k, v in attn_backend_layers.items()
-            }
-
-        def create_attn_groups_v0102(
-            attn_backends_map: dict[AttentionBackend, list[str]],
-            kv_cache_spec: KVCacheSpec,
-        ) -> list[AttentionGroup]:
-            attn_groups: list[AttentionGroup] = []
-            for attn_backend, layer_names in attn_backends_map.items():
-                attn_metadata_builder_i = attn_backend.get_builder_cls()(
-                    kv_cache_spec,
-                    layer_names,
-                    self.vllm_config,
-                    self.device,
-                )
-                attn_group = AttentionGroup(attn_backend,
-                                            attn_metadata_builder_i,
-                                            layer_names)
-                attn_groups.append(attn_group)
-            return attn_groups
-
         def create_attn_groups(
             attn_backends_map: dict[AttentionBackend, list[str]],
         ) -> list[AttentionGroup]:
@@ -3274,18 +3089,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 attn_groups.append(attn_group)
             return attn_groups

-        if vllm_version_is("0.10.2"):
-            for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
-                kv_cache_spec = kv_cache_group_spec.kv_cache_spec
-                attn_backends = get_attn_backends_for_layers(
-                    kv_cache_group_spec.layer_names)
-                self.attn_groups.append(
-                    create_attn_groups_v0102(attn_backends, kv_cache_spec))
-        else:
-            for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
-                attn_backends = get_attn_backends_for_group(  # type: ignore
-                    kv_cache_group_spec)
-                self.attn_groups.append(create_attn_groups(attn_backends))
+        for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
+            attn_backends = get_attn_backends_for_group(  # type: ignore
+                kv_cache_group_spec)
+            self.attn_groups.append(create_attn_groups(attn_backends))

         # Calculate reorder batch threshold (if needed)
         self.calculate_reorder_batch_threshold()
@@ -3299,31 +3106,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for attn_groups in self.attn_groups:
             yield from attn_groups

-    def _kv_cache_spec_attn_group_iterator_v0102(
-            self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]:
-        if not self.kv_cache_config.kv_cache_groups:
-            return
-        for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups):
-            for attn_group in attn_groups:
-                yield self.kv_cache_config.kv_cache_groups[
-                    kv_cache_spec_id].kv_cache_spec, attn_group
-
-    def _kv_cache_spec_attn_group_iterator_dispatcher(self):
-        if vllm_version_is("0.10.2"):
-            return self._kv_cache_spec_attn_group_iterator_v0102()
-        else:
-            return self._kv_cache_spec_attn_group_iterator()
-
     def calculate_reorder_batch_threshold(self) -> None:
         """
         Check that if any backends reorder batches; that the reordering
         is compatible (e.g., decode threshold is the same)
         """
         for group in self._attn_group_iterator():
-            if vllm_version_is("0.10.2"):
-                attn_metadata_builder_i = group.metadata_builder
-            else:
-                attn_metadata_builder_i = group.get_metadata_builder()
+            attn_metadata_builder_i = group.get_metadata_builder()
             if hasattr(attn_metadata_builder_i, "reorder_batch_threshold"):
                 # check that if any backends reorder batches; that the reordering
                 # is compatible (e.g., decode threshold is the same)
@@ -3427,10 +3216,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         min_ag_builder_name = None

         for attn_group in self._attn_group_iterator():
-            if vllm_version_is("0.10.2"):
-                builder = attn_group.metadata_builder
-            else:
-                builder = attn_group.get_metadata_builder()
+            builder = attn_group.get_metadata_builder()
             if builder.aclgraph_support.value < min_ag_support.value:
                 min_ag_support = builder.aclgraph_support
                 min_ag_builder_name = builder.__class__.__name__
@@ -3674,7 +3460,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             req_idx = self.input_batch.req_id_to_index[req_id]
             offset = self.query_start_loc_np[req_idx].item()
             prompt_hidden_states = hidden_states[offset:offset + num_logits]
-            logits = self._compute_logits_wrapper(prompt_hidden_states, None)
+            logits = self.model.compute_logits(prompt_hidden_states)

             # Get the "target" tokens for each index. For prompt at index i,
             # the token at prompt index i+1 is the "sampled" token we want

@@ -39,7 +39,6 @@ from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
 from vllm.v1.utils import copy_slice

-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.block_table import MultiGroupBlockTable

@@ -79,12 +78,6 @@ class CachedRequestState:
     @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be "
                 "removed in v0.13. Please use `mm_kwargs` instead.")
    def mm_inputs(self) -> list[MultiModalKwargsItems]:
-        if vllm_version_is("0.10.2"):
-            assert self.mm_kwargs is not None
-            return [
-                MultiModalKwargsItems.from_seq([item])
-                for item in self.mm_kwargs
-            ]
         assert self.mm_features is not None
         return [
             MultiModalKwargsItems.from_seq([f.data]) for f in self.mm_features