[Main2Main] Upgrade vllm commit to 0113 (#5839)
### What this PR does / why we need it?
Upgrade vllm commit to 0113 (11b6af5280d6d6dfb8953af16e67b25f819b3be9)
- Modify import paths due to the refactors
https://github.com/vllm-project/vllm/pull/31916
https://github.com/vllm-project/vllm/pull/32054
- Fix `TypeError: NPUOffloadingSpec.__init__() takes 2 positional
arguments but 3 were given` due to
https://github.com/vllm-project/vllm/pull/24498
- Skip the async-scheduling tests in
`tests/e2e/multicard/4-cards/long_sequence/test_mtp.py`, which are never
verified
https://github.com/vllm-project/vllm/pull/31998
- Skip some pooling tests, which are caused by
https://github.com/vllm-project/vllm/pull/32148
where vllm is also failed
https://buildkite.com/vllm/ci/builds/46705/steps/canvas?jid=019bb329-3834-4685-862b-1613b8e0f5d4
We will reopen those tests when main2main reachs
https://github.com/vllm-project/vllm/pull/32243
- Skip some cases in
`tests/e2e/multicard/4-cards/long_sequence/test_mtp.py`, which are
broken by
https://github.com/vllm-project/vllm/pull/32118
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef
Signed-off-by: wjunLu <wjunlu217@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -30,9 +30,7 @@ import numpy as np
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn as nn
|
||||
from vllm.attention.backends.abstract import AttentionBackend, AttentionType
|
||||
from vllm.attention.layer import Attention, MLAAttention
|
||||
from vllm.attention.selector import get_attn_backend
|
||||
from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
|
||||
get_layers_from_vllm_config)
|
||||
from vllm.distributed import (get_tensor_model_parallel_world_size,
|
||||
@@ -119,6 +117,15 @@ if TYPE_CHECKING:
|
||||
else:
|
||||
xgr = LazyLoader("xgr", globals(), "xgrammar")
|
||||
|
||||
# isort: off
|
||||
if vllm_version_is('0.13.0'):
|
||||
from vllm.attention.backends.abstract import ( # type: ignore
|
||||
AttentionBackend, AttentionType)
|
||||
from vllm.attention.selector import get_attn_backend # type: ignore
|
||||
else:
|
||||
from vllm.v1.attention.selector import get_attn_backend # type: ignore
|
||||
from vllm.v1.attention.backend import AttentionBackend, AttentionType # type: ignore
|
||||
# isort: on
|
||||
import torch_npu
|
||||
|
||||
# if true, allow tensor initialization and casting with internal format (e.g., NZ)
|
||||
@@ -1817,12 +1824,20 @@ class NPUModelRunner(GPUModelRunner):
|
||||
valid_sampled_token_ids[int(i)].clear()
|
||||
else:
|
||||
# Includes spec decode tokens.
|
||||
valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
|
||||
sampled_token_ids,
|
||||
self.input_batch.vocab_size,
|
||||
discard_sampled_tokens_req_indices,
|
||||
return_cu_num_tokens=logprobs_tensors is not None,
|
||||
)
|
||||
if vllm_version_is('0.13.0'):
|
||||
valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
|
||||
sampled_token_ids,
|
||||
self.input_batch.vocab_size,
|
||||
discard_sampled_tokens_req_indices,
|
||||
return_cu_num_tokens=logprobs_tensors is not None,
|
||||
)
|
||||
else:
|
||||
valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
|
||||
sampled_token_ids,
|
||||
self.input_batch.vocab_size,
|
||||
discard_sampled_tokens_req_indices,
|
||||
logprobs_tensors=logprobs_tensors,
|
||||
)
|
||||
else:
|
||||
valid_sampled_token_ids = []
|
||||
invalid_req_indices = discard_sampled_tokens_req_indices.tolist()
|
||||
|
||||
Reference in New Issue
Block a user