diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index dc92136..668c802 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -27,11 +27,11 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.forward_context import ForwardContext, get_forward_context from vllm.utils import direct_register_custom_op from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.worker.gpu_input_batch import InputBatch from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, nd_to_nz_2d, nd_to_nz_spec) +from vllm_ascend.worker.npu_input_batch import InputBatch class AscendAttentionBackend(AttentionBackend): diff --git a/vllm_ascend/attention/attention_v1_torchair.py b/vllm_ascend/attention/attention_v1_torchair.py index 3b4c7a9..48437b4 100644 --- a/vllm_ascend/attention/attention_v1_torchair.py +++ b/vllm_ascend/attention/attention_v1_torchair.py @@ -25,11 +25,11 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, AttentionType) from vllm.attention.backends.utils import PAD_SLOT_ID, CommonAttentionState from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.worker.gpu_input_batch import InputBatch from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, nd_to_nz_2d) +from vllm_ascend.worker.npu_input_batch import InputBatch class AscendAttentionTorchairBackend(AttentionBackend): diff --git a/vllm_ascend/device_allocator/camem.py b/vllm_ascend/device_allocator/camem.py index 23dd77c..1bd97ab 100644 --- a/vllm_ascend/device_allocator/camem.py +++ b/vllm_ascend/device_allocator/camem.py @@ -24,7 +24,8 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union import torch from acl.rt import memcpy # type: ignore # noqa: F401 from vllm.logger import logger -from vllm.utils import is_pin_memory_available + +from vllm_ascend.platform import NPUPlatform def find_loaded_library(lib_name) -> Optional[str]: @@ -199,7 +200,7 @@ class CaMemAllocator: size_in_bytes, dtype=torch.uint8, device='cpu', - pin_memory=is_pin_memory_available()) + pin_memory=NPUPlatform.is_pin_memory_available()) cpu_ptr = cpu_backup_tensor.data_ptr() ACL_MEMCPY_DEVICE_TO_HOST = 2 dest_max = cpu_ptr + size_in_bytes * 2 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 8ea680c..4c008b4 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -44,7 +44,6 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.distributed.parallel_state import (get_dp_group, get_pp_group, get_tp_group) from vllm.forward_context import get_forward_context -from vllm.inputs import INPUT_REGISTRY from vllm.logger import logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding @@ -52,7 +51,6 @@ from vllm.model_executor.model_loader import get_model from vllm.model_executor.models.interfaces import supports_transcription from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, is_pooling_model, is_text_generation_model) -from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.pooling_params import PoolingParams @@ -60,7 +58,6 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, LazyLoader, cdiv) -from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, @@ -169,13 +166,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): self.device = device self.dtype = self.model_config.dtype self.sampler = Sampler() - # Multi-modal data support - self.input_registry = INPUT_REGISTRY - self.mm_registry = MULTIMODAL_REGISTRY - self.max_num_encoder_input_tokens, self.encoder_cache_size = compute_encoder_budget( - model_config=self.model_config, - scheduler_config=self.scheduler_config, - mm_registry=self.mm_registry) # Lazy initialization, these will be set after __init__ self.kv_caches: List[torch.Tensor] = []