[CI][XPU] Enable SGLang CI on Intel XPU (#9493)
Co-authored-by: huaiyuzh <huaiyu.zheng@intel.com>
Co-authored-by: Ma Mingfei <mingfei.ma@intel.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
This commit is contained in:
@@ -17,6 +17,7 @@ from sglang.srt.utils import (
|
||||
is_cuda,
|
||||
is_hip,
|
||||
is_npu,
|
||||
is_xpu,
|
||||
)
|
||||
|
||||
_is_cuda = is_cuda()
|
||||
@@ -25,6 +26,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
|
||||
_is_npu = is_npu()
|
||||
_is_cpu_amx_available = cpu_has_amx_support()
|
||||
_is_cpu = is_cpu()
|
||||
_is_xpu = is_xpu()
|
||||
|
||||
if _is_cuda:
|
||||
from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
|
||||
@@ -109,8 +111,10 @@ class RotaryEmbedding(CustomOp):
|
||||
cache = cache.to(dtype)
|
||||
|
||||
if (
|
||||
not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
|
||||
) and not (_is_cpu and _is_cpu_amx_available):
|
||||
(not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
|
||||
and not (_is_cpu and _is_cpu_amx_available)
|
||||
and not _is_xpu
|
||||
):
|
||||
from vllm._custom_ops import rotary_embedding
|
||||
|
||||
self.vllm_rotary_embedding = rotary_embedding
|
||||
@@ -284,6 +288,16 @@ class RotaryEmbedding(CustomOp):
|
||||
s += f", base={self.base}, is_neox_style={self.is_neox_style}"
|
||||
return s
|
||||
|
||||
def forward_xpu(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    offsets: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """XPU entry point for this op; currently delegates to ``forward_native``.

    All arguments are passed through unchanged, and whatever
    ``forward_native`` returns is handed back to the caller as-is.
    """
    # TODO: swap in a wrapper around a dedicated XPU kernel once one exists.
    native_result = self.forward_native(positions, query, key, offsets)
    return native_result
|
||||
|
||||
|
||||
class LinearScalingRotaryEmbedding(RotaryEmbedding):
|
||||
"""RotaryEmbedding extended with linear scaling.
|
||||
|
||||
@@ -75,6 +75,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
|
||||
DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
|
||||
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
|
||||
|
||||
# INT4 models
|
||||
DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
|
||||
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
|
||||
)
|
||||
|
||||
# EAGLE
|
||||
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
|
||||
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
|
||||
|
||||
Reference in New Issue
Block a user