[CI][XPU]enable sglang CI on Intel XPU (#9493)

Co-authored-by: huaiyuzh <huaiyu.zheng@intel.com> Co-authored-by: Ma Mingfei <mingfei.ma@intel.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
2025-10-16 08:13:19 +08:00
parent baf277a9bf
commit 4c03dbaaef
6 changed files with 266 additions and 2 deletions
--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -17,6 +17,7 @@ from sglang.srt.utils import (
    is_cuda,
    is_hip,
    is_npu,
+    is_xpu,
 )

 _is_cuda = is_cuda()
@@ -25,6 +26,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
+_is_xpu = is_xpu()

 if _is_cuda:
    from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
@@ -109,8 +111,10 @@ class RotaryEmbedding(CustomOp):
            cache = cache.to(dtype)

        if (
-            not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
-        ) and not (_is_cpu and _is_cpu_amx_available):
+            (not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
+            and not (_is_cpu and _is_cpu_amx_available)
+            and not _is_xpu
+        ):
            from vllm._custom_ops import rotary_embedding

            self.vllm_rotary_embedding = rotary_embedding
@@ -284,6 +288,16 @@ class RotaryEmbedding(CustomOp):
        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
        return s

+    def forward_xpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # TODO: thin wrapper over forward_native until XPU implements this kernel.
+        return self.forward_native(positions, query, key, offsets)
+

 class LinearScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with linear scaling.
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -75,6 +75,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"

+# INT4 models
+DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
+)
+
 # EAGLE
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"