[Core] Cherry pick from 0.7.1 to keep the main code newest (#127)
Cherry pick from 0.7.1 to keep the main code newest Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -16,7 +16,7 @@
|
||||
#
|
||||
|
||||
import os
|
||||
from typing import Optional, Tuple
|
||||
from typing import TYPE_CHECKING, Optional, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
@@ -28,6 +28,11 @@ except ImportError:
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.platforms import Platform, PlatformEnum
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = None
|
||||
|
||||
os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1"
|
||||
|
||||
|
||||
@@ -53,6 +58,15 @@ class NPUPlatform(Platform):
|
||||
ray_device_key: str = "NPU"
|
||||
device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
|
||||
|
||||
supported_quantization: list[str] = ["ascend"]
|
||||
|
||||
@classmethod
|
||||
def pre_register_and_update(cls,
|
||||
parser: Optional[FlexibleArgumentParser] = None
|
||||
) -> None:
|
||||
from vllm_ascend.quantization.quant_config import \
|
||||
AscendQuantConfig # noqa: F401
|
||||
|
||||
@classmethod
|
||||
def get_device_capability(cls, device_id: int = 0):
|
||||
return None
|
||||
@@ -96,11 +110,14 @@ class NPUPlatform(Platform):
|
||||
parallel_config.worker_cls = "vllm_ascend.worker.NPUWorker"
|
||||
cache_config = vllm_config.cache_config
|
||||
if cache_config and cache_config.block_size is None:
|
||||
cache_config.block_size = 128
|
||||
# TODO: Set block_size to 128 will lead unexpected accuracy issue in mla case. Please set block_size to 128 back once the problem is fixed.
|
||||
cache_config.block_size = 16
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
|
||||
kv_cache_dtype, block_size, use_v1, use_mla):
|
||||
if use_mla:
|
||||
return "vllm_ascend.attention.AscendMLAAttentionBackend"
|
||||
return "vllm_ascend.attention.AscendAttentionBackend"
|
||||
|
||||
@classmethod
|
||||
|
||||
Reference in New Issue
Block a user