[CI] upgrade vllm to 0.8.5 (#715)
1. Upgrade vllm to 0.8.5 2. Drop 0.8.4 support 3. Keep doc to 0.8.4rc2 until we release 0.8.5 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -28,7 +28,6 @@ from vllm.worker.model_runner_base import (ModelRunnerBase,
|
||||
ModelRunnerWrapperBase)
|
||||
|
||||
from vllm_ascend.attention.attention import AscendMetadata
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
# A flag to enable debug prints for the updated input tensors
|
||||
# before each step.
|
||||
@@ -287,17 +286,11 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
|
||||
if not self.is_driver_worker:
|
||||
return []
|
||||
# Sample the next token.
|
||||
if vllm_version_is("0.8.4"):
|
||||
output = self.model.sample(
|
||||
logits=logits,
|
||||
sampling_metadata=model_input.sampling_metadata,
|
||||
)
|
||||
else:
|
||||
assert self.model_runner.sampler is not None
|
||||
output = self.model_runner.sampler(
|
||||
logits=logits,
|
||||
sampling_metadata=model_input.sampling_metadata,
|
||||
)
|
||||
assert self.model_runner.sampler is not None
|
||||
output = self.model_runner.sampler(
|
||||
logits=logits,
|
||||
sampling_metadata=model_input.sampling_metadata,
|
||||
)
|
||||
outputs.append(output)
|
||||
|
||||
if model_input.attn_metadata.num_prefills == 0 \
|
||||
|
||||
@@ -34,6 +34,7 @@ from vllm.attention.backends.utils import CommonAttentionState
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.core.scheduler import SchedulerOutputs
|
||||
from vllm.distributed import get_pp_group
|
||||
from vllm.distributed.kv_transfer import get_kv_transfer_group
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.inputs import INPUT_REGISTRY, InputRegistry
|
||||
from vllm.logger import logger
|
||||
@@ -42,7 +43,7 @@ from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
|
||||
from vllm.model_executor import SamplingMetadata, SamplingMetadataCache
|
||||
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
from vllm.model_executor.models import supports_lora, supports_multimodal
|
||||
@@ -63,13 +64,6 @@ from vllm.worker.model_runner_base import (
|
||||
_init_attn_metadata_from_tensor_dict,
|
||||
_init_sampling_metadata_from_tensor_dict)
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.8.4"):
|
||||
from vllm.distributed import get_kv_transfer_group
|
||||
else:
|
||||
from vllm.distributed.kv_transfer import get_kv_transfer_group
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.attention.backends.abstract import AttentionBackend
|
||||
|
||||
@@ -935,12 +929,7 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]):
|
||||
self.sampling_metadata_cache: SamplingMetadataCache = \
|
||||
SamplingMetadataCache() \
|
||||
if self.parallel_config.pipeline_parallel_size == 1 else None
|
||||
|
||||
if vllm_version_is("0.8.4"):
|
||||
self.sampler = None
|
||||
else:
|
||||
from vllm.model_executor.layers.sampler import get_sampler
|
||||
self.sampler = get_sampler()
|
||||
self.sampler = get_sampler()
|
||||
|
||||
def get_model(self) -> nn.Module:
|
||||
return self.model
|
||||
@@ -1409,17 +1398,10 @@ class NPUModelRunner(NPUModelRunnerBase[ModelInputForNPUWithSamplingMetadata]):
|
||||
model_input.async_callback()
|
||||
|
||||
# Sample the next token.
|
||||
if vllm_version_is("0.8.4"):
|
||||
output = self.model.sample(
|
||||
logits=logits,
|
||||
sampling_metadata=model_input.sampling_metadata,
|
||||
)
|
||||
else:
|
||||
assert self.sampler is not None
|
||||
output = self.sampler(
|
||||
logits=logits,
|
||||
sampling_metadata=model_input.sampling_metadata,
|
||||
)
|
||||
output = self.sampler(
|
||||
logits=logits,
|
||||
sampling_metadata=model_input.sampling_metadata,
|
||||
)
|
||||
if (self.observability_config is not None
|
||||
and self.observability_config.collect_model_forward_time
|
||||
and output is not None):
|
||||
|
||||
@@ -47,13 +47,13 @@ from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheSpec)
|
||||
from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, ModelRunnerOutput
|
||||
from vllm.v1.sample.sampler import Sampler
|
||||
from vllm.v1.utils import bind_kv_cache
|
||||
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
from vllm_ascend.attention.attention import AttentionMaskBuilder
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import xgrammar as xgr # type: ignore[import-untyped]
|
||||
@@ -291,11 +291,7 @@ class NPUModelRunner:
|
||||
self.attn_mask_builder = AttentionMaskBuilder.initialize_from_len(
|
||||
self.attn_mask_len, self.dtype)
|
||||
|
||||
if vllm_version_is("0.8.4"):
|
||||
self.sampler = None
|
||||
else:
|
||||
from vllm.v1.sample.sampler import Sampler
|
||||
self.sampler = Sampler()
|
||||
self.sampler = Sampler()
|
||||
|
||||
def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
|
||||
"""Update the cached states and the persistent batch with the scheduler
|
||||
@@ -346,34 +342,19 @@ class NPUModelRunner:
|
||||
generator.manual_seed(sampling_params.seed)
|
||||
else:
|
||||
generator = None
|
||||
if vllm_version_is("0.8.4"):
|
||||
self.requests[req_id] = CachedRequestState(
|
||||
req_id=req_id,
|
||||
prompt_token_ids=new_req_data.prompt_token_ids,
|
||||
prompt=new_req_data.prompt,
|
||||
mm_inputs=new_req_data.mm_inputs,
|
||||
mm_positions=new_req_data.mm_positions,
|
||||
sampling_params=sampling_params,
|
||||
generator=generator,
|
||||
block_ids=new_req_data.block_ids,
|
||||
num_computed_tokens=new_req_data.num_computed_tokens,
|
||||
output_token_ids=[],
|
||||
lora_request=new_req_data.lora_request,
|
||||
)
|
||||
else:
|
||||
# the prompt removed by: https://github.com/vllm-project/vllm/pull/17214
|
||||
self.requests[req_id] = CachedRequestState(
|
||||
req_id=req_id,
|
||||
prompt_token_ids=new_req_data.prompt_token_ids,
|
||||
mm_inputs=new_req_data.mm_inputs,
|
||||
mm_positions=new_req_data.mm_positions,
|
||||
sampling_params=sampling_params,
|
||||
generator=generator,
|
||||
block_ids=new_req_data.block_ids,
|
||||
num_computed_tokens=new_req_data.num_computed_tokens,
|
||||
output_token_ids=[],
|
||||
lora_request=new_req_data.lora_request,
|
||||
)
|
||||
|
||||
self.requests[req_id] = CachedRequestState(
|
||||
req_id=req_id,
|
||||
prompt_token_ids=new_req_data.prompt_token_ids,
|
||||
mm_inputs=new_req_data.mm_inputs,
|
||||
mm_positions=new_req_data.mm_positions,
|
||||
sampling_params=sampling_params,
|
||||
generator=generator,
|
||||
block_ids=new_req_data.block_ids,
|
||||
num_computed_tokens=new_req_data.num_computed_tokens,
|
||||
output_token_ids=[],
|
||||
lora_request=new_req_data.lora_request,
|
||||
)
|
||||
|
||||
req_ids_to_add.append(req_id)
|
||||
|
||||
@@ -666,17 +647,10 @@ class NPUModelRunner:
|
||||
|
||||
# Sample the next token and get logprobs if needed.
|
||||
sampling_metadata = self.input_batch.sampling_metadata
|
||||
if vllm_version_is("0.8.4"):
|
||||
sampler_output = self.model.sample(
|
||||
logits=logits,
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
else:
|
||||
assert self.sampler is not None
|
||||
sampler_output = self.sampler(
|
||||
logits=logits,
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
sampler_output = self.sampler(
|
||||
logits=logits,
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
|
||||
# TODO(woosuk): The following loop can be slow since it iterates over
|
||||
# the requests one by one. Optimize.
|
||||
|
||||
@@ -23,7 +23,6 @@ from vllm.worker.multi_step_model_runner import (ModelOutput,
|
||||
PythonizationCache,
|
||||
StatefulModelInput)
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
from vllm_ascend.worker.model_runner import (
|
||||
ModelInputForNPUWithSamplingMetadata, NPUModelRunnerBase)
|
||||
|
||||
@@ -318,13 +317,7 @@ class MultiStepModelNPURunner(NPUModelRunnerBase[StatefulModelInputForNPU]):
|
||||
dtype=torch.long,
|
||||
device="cpu",
|
||||
pin_memory=True)
|
||||
|
||||
if vllm_version_is("0.8.4"):
|
||||
self._base_model_runner.model.sampler.include_gpu_probs_tensor = (
|
||||
True)
|
||||
else:
|
||||
assert self._base_model_runner.sampler is not None
|
||||
self._base_model_runner.sampler.include_gpu_probs_tensor = True
|
||||
self._base_model_runner.sampler.include_gpu_probs_tensor = True
|
||||
if frozen_model_input.sampling_metadata:
|
||||
frozen_model_input.sampling_metadata.skip_sampler_cpu_output = (
|
||||
True)
|
||||
|
||||
@@ -28,6 +28,7 @@ from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed import (ensure_model_parallel_initialized,
|
||||
init_distributed_environment,
|
||||
set_custom_all_reduce)
|
||||
from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
|
||||
from vllm.logger import logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor import set_random_seed
|
||||
@@ -46,15 +47,10 @@ from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
|
||||
from vllm_ascend.device_allocator.camem import CaMemAllocator
|
||||
from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
from vllm_ascend.utils import try_register_lib, vllm_version_is
|
||||
from vllm_ascend.utils import try_register_lib
|
||||
from vllm_ascend.worker.model_runner import NPUModelRunner
|
||||
from vllm_ascend.worker.pooling_model_runner import NPUPoolingModelRunner
|
||||
|
||||
if vllm_version_is("0.8.4"):
|
||||
from vllm.distributed import ensure_kv_transfer_initialized
|
||||
else:
|
||||
from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
|
||||
|
||||
|
||||
class NPUWorker(LocalOrDistributedWorkerBase):
|
||||
"""A worker class that executes (a partition of) the model on a NPU.
|
||||
|
||||
@@ -29,6 +29,7 @@ from vllm.config import VllmConfig
|
||||
from vllm.distributed import (ensure_model_parallel_initialized,
|
||||
init_distributed_environment,
|
||||
set_custom_all_reduce)
|
||||
from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
|
||||
from vllm.logger import logger
|
||||
from vllm.model_executor import set_random_seed
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
@@ -41,14 +42,9 @@ from vllm.v1.worker.worker_base import WorkerBase
|
||||
|
||||
from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
from vllm_ascend.utils import try_register_lib, vllm_version_is
|
||||
from vllm_ascend.utils import try_register_lib
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
|
||||
if vllm_version_is("0.8.4"):
|
||||
from vllm.distributed import ensure_kv_transfer_initialized
|
||||
else:
|
||||
from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
|
||||
|
||||
|
||||
class NPUWorker(WorkerBase):
|
||||
|
||||
|
||||
Reference in New Issue
Block a user