adopt rope in vllm-ascend (#530)
### What this PR does / why we need it? Adopt the custom rotary-embedding kernel in actual model inference. The customized rotary_embedding generates contiguous query and key tensors on the C++ side, which removes the overhead of the two `contiguous` calls and the `index_select` required by the torch_npu rotary_embedding. For now, the custom rotary_embedding only supports the `is_neox = true` scenario; support for the non-neox rope variant will be added in a future update. --------- Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
This commit is contained in:
@@ -23,7 +23,9 @@ import torch
|
||||
import torch_npu # noqa: F401
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import logger
|
||||
from vllm.platforms import Platform, PlatformEnum
|
||||
|
||||
CUSTOM_OP_ENABLED = False
|
||||
try:
|
||||
# register custom ops into torch_library here
|
||||
import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401
|
||||
@@ -35,8 +37,8 @@ except ImportError as e:
|
||||
logging.warning(
|
||||
"Warning: Failed to register custom ops, all custom ops will be disabled"
|
||||
)
|
||||
|
||||
from vllm.platforms import Platform, PlatformEnum
|
||||
else:
|
||||
CUSTOM_OP_ENABLED = True
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
|
||||
Reference in New Issue
Block a user