[Main2Main] Upgrade vLLM to 0226 (#6813)
### What this PR does / why we need it?
Breaking:
1. https://github.com/vllm-project/vllm/pull/33452
2. https://github.com/vllm-project/vllm/pull/33451
3. https://github.com/vllm-project/vllm/pull/32567
4. https://github.com/vllm-project/vllm/pull/32344
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main:
83b47f67b1
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
@@ -138,12 +138,8 @@ if TYPE_CHECKING:
|
||||
else:
|
||||
xgr = LazyLoader("xgr", globals(), "xgrammar")
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("v0.15.0"):
|
||||
from vllm.attention.layer import Attention, MLAAttention # type: ignore
|
||||
else:
|
||||
from vllm.model_executor.layers.attention import Attention, MLAAttention
|
||||
from vllm.model_executor.layers.attention import Attention, MLAAttention
|
||||
|
||||
# if true, allow tensor initialization and casting with internal format (e.g., NZ)
|
||||
torch.npu.config.allow_internal_format = True
|
||||
|
||||
@@ -531,6 +531,9 @@ class NPUWorker(WorkerBase):
|
||||
def pin_lora(self, lora_id: int) -> bool:
|
||||
return self.model_runner.pin_lora(lora_id)
|
||||
|
||||
def reset_encoder_cache(self) -> None:
|
||||
self.model_runner.reset_encoder_cache()
|
||||
|
||||
def execute_dummy_batch(self) -> None:
|
||||
self.model_runner._dummy_run(num_tokens=self.model_runner.decode_token_per_req, uniform_decode=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user