Drop vLLM 0.13.0 support (#6069)
### What this PR does / why we need it?
Drop vLLM 0.13.0 support and upgrade to vLLM 0.14.0, removing the `vllm_version_is('0.13.0')` compatibility branches from the NPU model runner and worker.
- vLLM version: v0.13.0
- vLLM main: d68209402d
---------
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
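For context, every deletion below removes an instance of the same gate, sketched here as a minimal, illustrative helper; the real `vllm_version_is` lives in `vllm_ascend/utils.py` and its implementation may differ in detail.

```python
# Minimal sketch of the version gate this PR removes (illustrative only;
# the actual helper in vllm_ascend/utils.py may be implemented differently).
from importlib.metadata import version


def vllm_version_is(target: str) -> bool:
    # True when the installed vLLM distribution matches the pinned
    # legacy version, e.g. vllm_version_is('0.13.0').
    return version("vllm") == target
```

With 0.13.0 support dropped, every `if vllm_version_is('0.13.0'): ... else: ...` block collapses to its `else` arm.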
```diff
@@ -50,8 +50,10 @@ from vllm.sequence import IntermediateTensors
 from vllm.utils.import_utils import LazyLoader
 from vllm.utils.math_utils import cdiv
 from vllm.utils.mem_utils import DeviceMemoryProfiler
+from vllm.v1.attention.backend import AttentionBackend, AttentionType  # type: ignore
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.selector import get_attn_backend  # type: ignore
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import (AttentionSpec,
                                         EncoderOnlyAttentionSpec,
```
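In `vllm_ascend/worker/model_runner_v1.py`, the two added imports hoist the vLLM 0.14 attention symbols (`AttentionBackend`, `AttentionType`, `get_attn_backend`) to module scope; the version-gated import block they replace is deleted two hunks below.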
```diff
@@ -102,7 +104,7 @@ from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
                                enable_sp, get_ascend_device_type, is_moe_model,
                                lmhead_tp_enable, maybe_trans_nz,
-                               set_weight_prefetch_method, vllm_version_is)
+                               set_weight_prefetch_method)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 from vllm_ascend.worker.pcp_utils import PCPManager
 
```
```diff
@@ -115,15 +117,6 @@ if TYPE_CHECKING:
 else:
     xgr = LazyLoader("xgr", globals(), "xgrammar")
 
-# isort: off
-if vllm_version_is('0.13.0'):
-    from vllm.attention.backends.abstract import (  # type: ignore
-        AttentionBackend, AttentionType)
-    from vllm.attention.selector import get_attn_backend  # type: ignore
-else:
-    from vllm.v1.attention.selector import get_attn_backend  # type: ignore
-    from vllm.v1.attention.backend import AttentionBackend, AttentionType  # type: ignore
-# isort: on
 import torch_npu
 
 # if true, allow tensor initialization and casting with internal format (e.g., NZ)
```
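A side note on the surviving context line: `xgr = LazyLoader("xgr", globals(), "xgrammar")` defers the heavy `xgrammar` import until first attribute access. As a hedged sketch of the idea (not vLLM's actual `vllm.utils.import_utils.LazyLoader` implementation):

```python
# Rough sketch of a lazy module loader in the spirit of
# vllm.utils.import_utils.LazyLoader; the real class may differ in detail.
import importlib
import types


class LazyLoader(types.ModuleType):
    def __init__(self, local_name: str, parent_globals: dict, name: str):
        self._local_name = local_name
        self._parent_globals = parent_globals
        super().__init__(name)

    def _load(self) -> types.ModuleType:
        # Import for real, then replace the placeholder binding so
        # subsequent lookups hit the actual module directly.
        module = importlib.import_module(self.__name__)
        self._parent_globals[self._local_name] = module
        self.__dict__.update(module.__dict__)
        return module

    def __getattr__(self, item: str):
        return getattr(self._load(), item)
```

With this pattern, module startup stays cheap and the import cost is only paid when `xgr` is first touched.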
```diff
@@ -746,10 +739,7 @@ class NPUModelRunner(GPUModelRunner):
 
         # _prepare_inputs may reorder the batch, so we must gather
         # multi-modal outputs after that to ensure the correct order
-        if vllm_version_is('0.13.0'):
-            model_kwargs = self._init_model_kwargs(num_input_tokens)
-        else:
-            model_kwargs = self._init_model_kwargs()
+        model_kwargs = self._init_model_kwargs()
         if self.is_multimodal_model and not self.model_config.is_encoder_decoder:
             self.multimodal_cpu_fields = ["grid_thw"]
             self._prepare_multimodal_fields()
```
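This and the following runner hunks are the same mechanical cleanup: delete the 0.13.0 branch and keep the 0.14 call unconditionally. Here, `_init_model_kwargs()` is now always called without the `num_input_tokens` argument that the 0.13.0 path required.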
```diff
@@ -1575,16 +1565,10 @@ class NPUModelRunner(GPUModelRunner):
             logits = None
         else:
             if self.input_batch.pooling_params:
-                if vllm_version_is('0.13.0'):
-                    pool_output = self._pool(
-                        hidden_states,
-                        scheduler_output.total_num_scheduled_tokens,
-                        num_scheduled_tokens_np)
-                else:
-                    pool_output = self._pool(
-                        hidden_states,
-                        scheduler_output.total_num_scheduled_tokens,
-                        num_scheduled_tokens_np, kv_connector_output)
+                pool_output = self._pool(
+                    hidden_states,
+                    scheduler_output.total_num_scheduled_tokens,
+                    num_scheduled_tokens_np, kv_connector_output)
                 if self.debugger is not None:
                     self.debugger.stop()
                     self.debugger.step()
```
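Likewise for pooling: `_pool(...)` now always receives `kv_connector_output`, which only the 0.14 signature accepted.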
```diff
@@ -1675,8 +1659,7 @@ class NPUModelRunner(GPUModelRunner):
                 attn_metadata,
                 aux_hidden_states,
             )
-            if not vllm_version_is('0.13.0'):
-                self._copy_draft_token_ids_to_cpu(scheduler_output)
+            self._copy_draft_token_ids_to_cpu(scheduler_output)
 
         (
             logprobs_lists,
```
```diff
@@ -1826,20 +1809,12 @@ class NPUModelRunner(GPUModelRunner):
                     valid_sampled_token_ids[int(i)].clear()
             else:
                 # Includes spec decode tokens.
-                if vllm_version_is('0.13.0'):
-                    valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
-                        sampled_token_ids,
-                        self.input_batch.vocab_size,
-                        discard_sampled_tokens_req_indices,
-                        return_cu_num_tokens=logprobs_tensors is not None,
-                    )
-                else:
-                    valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
-                        sampled_token_ids,
-                        self.input_batch.vocab_size,
-                        discard_sampled_tokens_req_indices,
-                        logprobs_tensors=logprobs_tensors,
-                    )
+                valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
+                    sampled_token_ids,
+                    self.input_batch.vocab_size,
+                    discard_sampled_tokens_req_indices,
+                    logprobs_tensors=logprobs_tensors,
+                )
         else:
             valid_sampled_token_ids = []
             invalid_req_indices = discard_sampled_tokens_req_indices.tolist()
```
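One API detail worth calling out: on vLLM 0.14, `RejectionSampler.parse_output` accepts the `logprobs_tensors` object directly, whereas 0.13.0 took a boolean `return_cu_num_tokens=logprobs_tensors is not None`, so the runner no longer derives that flag itself.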
```diff
@@ -58,16 +58,13 @@ from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
 from vllm_ascend.ops.triton.triton_utils import init_device_properties_triton
 from vllm_ascend.utils import (AscendDeviceType, check_ascend_device_type,
                                enable_sp, get_ascend_device_type,
-                               register_ascend_customop, vllm_version_is)
+                               register_ascend_customop)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 torch._dynamo.trace_rules.clear_lru_cache()  # noqa: E402
 from torch._dynamo.variables import TorchInGraphFunctionVariable  # noqa: E402
 
-if vllm_version_is("0.13.0"):
-    from vllm.model_executor.utils import set_random_seed
-else:
-    from vllm.utils.torch_utils import set_random_seed
+from vllm.utils.torch_utils import set_random_seed
 
 torch_non_c_binding_in_graph_functions_npu = dict.fromkeys(
     ["torch.npu.current_stream"],
```
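In the worker, `set_random_seed` now comes from `vllm.utils.torch_utils` rather than `vllm.model_executor.utils`. It is the usual seed-everything helper; a hedged sketch of the idea (not vLLM's exact implementation, which should also seed the active accelerator platform, NPU here):

```python
# Hedged sketch in the spirit of vllm.utils.torch_utils.set_random_seed;
# the real helper may also seed the active platform's RNG (e.g. torch.npu).
import random

import numpy as np
import torch


def set_random_seed(seed: int) -> None:
    random.seed(seed)        # Python stdlib RNG
    np.random.seed(seed)     # NumPy RNG
    torch.manual_seed(seed)  # torch CPU/default-device RNG
```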
```diff
@@ -121,13 +118,6 @@ class NPUWorker(WorkerBase):
             self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
                 self.cache_config.cache_dtype]
 
-        if vllm_version_is('0.13.0'):
-            if self.model_config.trust_remote_code:
-                # note: lazy import to avoid importing torch before initializing
-                from vllm.utils.import_utils import init_cached_hf_modules
-
-                init_cached_hf_modules()
-
         self.profiler = self._init_profiler()
         if vllm_config.model_config and vllm_config.model_config.enable_sleep_mode:
             # Buffers saved before sleep
```
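Finally, `NPUWorker.__init__` drops the 0.13.0-only `init_cached_hf_modules()` setup for `trust_remote_code` models; on 0.14 that initialization is presumably handled inside vLLM itself, so the worker proceeds straight to profiler setup.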