1. fix https://github.com/vllm-project/vllm/pull/28542 The model structures we modified are: - Qwen2.5-VL (some patches still exist) - Qwen2-VL - Qwen2 - DeepSeek series - Qwen-moe series 2. fix https://github.com/vllm-project/vllm/pull/29121 the output token type has now changed from np to `list[list[int]]` 3. fix https://github.com/vllm-project/vllm/pull/29262 the `xformers` backend for multimodal has now been deprecated 4. fix https://github.com/vllm-project/vllm/pull/29342 5. fix https://github.com/vllm-project/vllm/pull/28579 6. fix https://github.com/vllm-project/vllm/pull/28718 7. fix https://github.com/vllm-project/vllm/issues/28665 8. fix https://github.com/vllm-project/vllm/pull/26847 vllm introduced the `optimization-level`, some default configs have been changed, and the param `--enforce-eager` has been deprecated 9. fix http://github.com/vllm-project/vllm/pull/29223 it now returns a tuple for the sampler. 10. fix https://github.com/vllm-project/vllm/pull/29471 we'll remove the related patch to avoid this kind of error. Co-authored-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: wangli <wangli858794774@gmail.com> - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com>
75 lines
2.6 KiB
Python
75 lines
2.6 KiB
Python
# Standard
|
|
from enum import Enum
|
|
|
|
import torch
|
|
from vllm.config import ParallelConfig
|
|
from vllm.logger import logger
|
|
|
|
from vllm_ascend.distributed.kvpool.backend.backend import Backend
|
|
|
|
|
|
class MmcDirect(Enum):
    """Copy-direction codes handed to the memcache batch transfer calls.

    The integer ``.value`` of a member is what gets passed to the store.
    NOTE(review): the letters presumably stand for L = local, G = global
    pool and H = host -- confirm against the memcache documentation.
    """

    # Members receive the values 0, 1, 2, 3 in the order listed.
    COPY_L2G, COPY_G2L, COPY_G2H, COPY_H2G = range(4)
|
|
|
|
|
|
class MemcacheBackend(Backend):
    """KV-pool backend that delegates to memcache's ``DistributedObjectStore``.

    ``get`` and ``put`` are best-effort: failures are logged and never
    propagated, so callers cannot detect them from the return value
    (both methods return ``None``).
    """

    def __init__(self, parallel_config: ParallelConfig):
        """Create the object store and initialise it for this rank.

        Args:
            parallel_config: vLLM parallel configuration; only its ``rank``
                attribute is read here.

        Raises:
            ImportError: if the ``memcache`` package is not installed.
            RuntimeError: if ``DistributedObjectStore.init`` returns a
                non-zero code.
            Exception: any other error raised during store setup is logged
                and re-raised unchanged.
        """
        # Imported lazily so this module can be imported without memcache.
        try:
            from memcache import DistributedObjectStore  # type: ignore
        except ImportError as e:
            raise ImportError(
                "Please install memcache by following the instructions at "
                "https://gitee.com/ascend/memfabric_hybrid "  # noqa: E501
                "to run vLLM with MemcacheConnector.") from e
        try:
            self.rank = parallel_config.rank
            self.store = DistributedObjectStore()
            res = self.store.init(self.rank)
            # An explicit raise instead of ``assert``: asserts are removed
            # under ``python -O`` and a failed init would go unnoticed.
            if res != 0:
                raise RuntimeError(
                    "Memcache store initialization failed with return "
                    f"code {res}.")
        except ValueError as e:
            logger.error("Configuration loading failed: %s", e)
            raise
        except Exception as exc:
            logger.error(
                "An error occurred while loading the configuration: %s", exc)
            raise

    def set_device(self):
        """Select ``npu:<rank>`` as the current device via ``torch.npu``."""
        device = torch.device(f"npu:{self.rank}")
        torch.npu.set_device(device)

    def register_buffer(self, ptrs: list[int], sizes: list[int]):
        """Register each ``(ptr, size)`` pair with the store.

        Args:
            ptrs: buffer start addresses (presumably device pointers --
                confirm against the caller).
            sizes: size of each buffer, paired with ``ptrs`` by index.

        Raises:
            ValueError: if ``ptrs`` and ``sizes`` differ in length.
            RuntimeError: if the store returns a non-zero code for any pair.
        """
        # ``zip`` would silently drop the unmatched tail, leaving some
        # buffers unregistered without any error.
        if len(ptrs) != len(sizes):
            raise ValueError(
                f"ptrs and sizes must have the same length, got "
                f"{len(ptrs)} and {len(sizes)}.")
        for ptr, size in zip(ptrs, sizes):
            ret_value = self.store.register_buffer(ptr, size)
            if ret_value != 0:
                raise RuntimeError("Memcache memory registration failed.")

    def exists(self, keys: list[str]) -> list[int]:
        """Return the store's ``batch_is_exist`` result for ``keys``."""
        return self.store.batch_is_exist(keys)

    def get(self, key: list[str], addr: list[list[int]],
            size: list[list[int]]):
        """Fetch a batch of keys from the store into the given buffers.

        Args:
            key: keys to fetch.
            addr: destination addresses (presumably one list per key --
                confirm against the memcache API).
            size: sizes matching ``addr``.

        Any non-zero status in the result, or any exception, is logged and
        swallowed; nothing is returned.
        """
        try:
            res = self.store.batch_get_into_layers(
                key, addr, size, MmcDirect.COPY_G2L.value)
            # The message already carries the whole batch, so log it once
            # rather than once per failing entry.
            if any(value != 0 for value in res):
                logger.error("Failed to get key %s,res:%s", key, res)
        except Exception as e:
            logger.error("Failed to get key %s. %s", key, e)

    def put(self, key: list[str], addr: list[list[int]],
            size: list[list[int]]):
        """Store a batch of keys from the given buffers into the store.

        Args:
            key: keys to store.
            addr: source addresses (presumably one list per key -- confirm
                against the memcache API).
            size: sizes matching ``addr``.

        Any non-zero status in the result, or any exception, is logged and
        swallowed; nothing is returned.
        """
        try:
            res = self.store.batch_put_from_layers(
                key, addr, size, MmcDirect.COPY_L2G.value)
            # Log once per batch; the message names the operation as "put"
            # (it previously said "get", copied from the method above).
            if any(value != 0 for value in res):
                logger.error("Failed to put key %s,res:%s", key, res)
        except Exception as e:
            logger.error("Failed to put key %s,error:%s", key, e)
|