From 744719587ed11d2b1e3624cb21f042816a105bb7 Mon Sep 17 00:00:00 2001 From: Li Wei <52344829+liwei109@users.noreply.github.com> Date: Thu, 12 Feb 2026 15:40:42 +0800 Subject: [PATCH] [Feature] Support glmx (#194) Signed-off-by: Li Wei Co-authored-by: tangshiwen Co-authored-by: Xinyu Dong --- .../tutorials/multi_xpu_GLM-5-W8A8-INT8.md | 92 +++++++++++++++++++ vllm_kunlun/__init__.py | 10 ++ vllm_kunlun/config/__init__.py | 0 vllm_kunlun/config/model.py | 22 +++++ vllm_kunlun/models/__init__.py | 4 + vllm_kunlun/models/deepseek_v2.py | 4 + vllm_kunlun/ops/_kunlun_ops.py | 7 -- vllm_kunlun/transformer_utils/__init__.py | 0 vllm_kunlun/transformer_utils/config.py | 27 ++++++ 9 files changed, 159 insertions(+), 7 deletions(-) create mode 100644 docs/source/tutorials/multi_xpu_GLM-5-W8A8-INT8.md create mode 100644 vllm_kunlun/config/__init__.py create mode 100644 vllm_kunlun/config/model.py create mode 100644 vllm_kunlun/transformer_utils/__init__.py create mode 100644 vllm_kunlun/transformer_utils/config.py diff --git a/docs/source/tutorials/multi_xpu_GLM-5-W8A8-INT8.md b/docs/source/tutorials/multi_xpu_GLM-5-W8A8-INT8.md new file mode 100644 index 0000000..5d44173 --- /dev/null +++ b/docs/source/tutorials/multi_xpu_GLM-5-W8A8-INT8.md @@ -0,0 +1,92 @@ +# Multi XPU (GLM-5-W8A8-INT8) + +## Run vllm-kunlun on Multi XPU + +Setup environment using container: + +Please follow the [installation.md](../installation.md) document to set up the environment first. 
+
+Create a container
+```bash
+#!/bin/bash
+# rundocker.sh
+XPU_NUM=8
+DOCKER_DEVICE_CONFIG=""
+if [ $XPU_NUM -gt 0 ]; then
+    for idx in $(seq 0 $((XPU_NUM-1))); do
+        DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpu${idx}:/dev/xpu${idx}"
+    done
+    DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl"
+fi
+
+export build_image="xxx"
+
+docker run -itd ${DOCKER_DEVICE_CONFIG} \
+    --net=host \
+    --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+    --tmpfs /dev/shm:rw,nosuid,nodev,exec,size=32g \
+    --cap-add=SYS_PTRACE \
+    -v /home/users/vllm-kunlun:/home/vllm-kunlun \
+    -v /usr/local/bin/xpu-smi:/usr/local/bin/xpu-smi \
+    --name "$1" \
+    -w /workspace \
+    "$build_image" /bin/bash
+```
+
+### Prepare the Weights
+
+- Pull GLM-5-W8A8-INT8 weights
+  ```
+  wget -O GLM-5-W8A8-INT8-Dynamic.tar.gz https://aihc-private-hcd.bj.bcebos.com/LLM/AICapX-Quant-Models/GLM-5-W8A8-INT8-Dynamic.tar.gz
+  ```
+
+### Online Serving on Multi XPU
+
+Start the vLLM server on multi XPU:
+
+```bash
+unset XPU_DUMMY_EVENT && \
+export XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && \
+export XMLIR_CUDNN_ENABLED=1 && \
+export XPU_USE_DEFAULT_CTX=1 && \
+export XMLIR_FORCE_USE_XPU_GRAPH=1 && \
+export XMLIR_ENABLE_FAST_FC=1 && \
+export XPU_USE_FAST_SWIGLU=1 && \
+export CUDA_GRAPH_OPTIMIZE_STREAM=1 && \
+export XMLIR_ENABLE_MOCK_TORCH_COMPILE=false && \
+export XPU_USE_MOE_SORTED_THRES=1 && \
+export USE_ORI_ROPE=1 && \
+export VLLM_USE_V1=1
+
+python -m vllm.entrypoints.openai.api_server \
+    --host 0.0.0.0 \
+    --port 8806 \
+    --model GLM-5-W8A8-INT8-Dynamic \
+    --gpu-memory-utilization 0.97 \
+    --trust-remote-code \
+    --max-model-len 32768 \
+    --tensor-parallel-size 8 \
+    --dtype bfloat16 \
+    --max_num_seqs 8 \
+    --max_num_batched_tokens 8192 \
+    --block-size 64 \
+    --no-enable-chunked-prefill \
+    --distributed-executor-backend mp \
+    --disable-log-requests \
+    --no-enable-prefix-caching \
+    --kv-cache-dtype bfloat16 \
+    --compilation-config '{
+
"splitting_ops":[ + "vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.unified_attention_with_output_kunlun", + "vllm.mamba_mixer2", + "vllm.mamba_mixer", + "vllm.short_conv", + "vllm.linear_attention", + "vllm.plamo2_mamba_mixer", + "vllm.gdn_attention", + "vllm.sparse_attn_indexer", + "vllm.sparse_attn_indexer_vllm_kunlun" + ]}' +``` diff --git a/vllm_kunlun/__init__.py b/vllm_kunlun/__init__.py index 97b9d7c..2042243 100644 --- a/vllm_kunlun/__init__.py +++ b/vllm_kunlun/__init__.py @@ -47,6 +47,16 @@ def register(): """Register the Kunlun platform""" from .utils import redirect_output from .vllm_utils_wrapper import direct_register_custom_op, patch_annotations_for_schema + + # Change for GLM5 + if "vllm.transformers_utils.config" in sys.modules: + from .transformer_utils.config import _XPU_CONFIG_REGISTRY + sys.modules["vllm.transformers_utils.config"]._CONFIG_REGISTRY = _XPU_CONFIG_REGISTRY + + import vllm.config.model as model_module + from .config.model import is_deepseek_mla + model_module.ModelConfig.is_deepseek_mla = property(is_deepseek_mla) + import_hook() return "vllm_kunlun.platforms.kunlun.KunlunPlatform" diff --git a/vllm_kunlun/config/__init__.py b/vllm_kunlun/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_kunlun/config/model.py b/vllm_kunlun/config/model.py new file mode 100644 index 0000000..37374e3 --- /dev/null +++ b/vllm_kunlun/config/model.py @@ -0,0 +1,22 @@ +def is_deepseek_mla(self) -> bool: + if not hasattr(self.hf_text_config, "model_type"): + return False + elif self.hf_text_config.model_type in ( + "deepseek_v2", + "deepseek_v3", + "deepseek_v32", + "deepseek_mtp", + "kimi_k2", + "longcat_flash", + "glm_moe_dsa", + ): + return self.hf_text_config.kv_lora_rank is not None + elif self.hf_text_config.model_type == "eagle": + # if the model is an EAGLE module, check for the + # underlying architecture + return ( + self.hf_text_config.model.model_type + in ("deepseek_v2", 
"deepseek_v3", "deepseek_v32") + and self.hf_text_config.kv_lora_rank is not None + ) + return False diff --git a/vllm_kunlun/models/__init__.py b/vllm_kunlun/models/__init__.py index 48a4fe1..bceb5f5 100644 --- a/vllm_kunlun/models/__init__.py +++ b/vllm_kunlun/models/__init__.py @@ -89,5 +89,9 @@ def register_model(): "DeepSeekMTPModel", "vllm_kunlun.models.deepseek_mtp:DeepSeekMTP") + ModelRegistry.register_model( + "GlmMoeDsaForCausalLM", + "vllm_kunlun.models.deepseek_v2:GlmMoeDsaForCausalLM") + def register_quant_method(): """to do""" diff --git a/vllm_kunlun/models/deepseek_v2.py b/vllm_kunlun/models/deepseek_v2.py index d65ab3e..7ec23d2 100644 --- a/vllm_kunlun/models/deepseek_v2.py +++ b/vllm_kunlun/models/deepseek_v2.py @@ -1339,6 +1339,10 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM): pass +class GlmMoeDsaForCausalLM(DeepseekV2ForCausalLM): + pass + + # Compatibility with # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py def get_spec_layer_idx_from_weight_name(config: Union[DeepseekV2Config, diff --git a/vllm_kunlun/ops/_kunlun_ops.py b/vllm_kunlun/ops/_kunlun_ops.py index 94a875e..1d86690 100644 --- a/vllm_kunlun/ops/_kunlun_ops.py +++ b/vllm_kunlun/ops/_kunlun_ops.py @@ -195,10 +195,6 @@ class KunlunOps: query_x = query.contiguous() key_x = key.contiguous() - num_tokens = query_x.shape[0] - num_heads = query_x.shape[1] // head_size - num_kv_heads = key_x.shape[1] // head_size - torch.ops._C.rotary_embedding( positions, query_x, @@ -207,9 +203,6 @@ class KunlunOps: cos_sin_cache, is_neox_style) - query_x = query_x.view(num_tokens, num_heads * head_size) - key_x = key_x.view(num_tokens, num_kv_heads * head_size) - return query_x, key_x # Rotary embedding diff --git a/vllm_kunlun/transformer_utils/__init__.py b/vllm_kunlun/transformer_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_kunlun/transformer_utils/config.py b/vllm_kunlun/transformer_utils/config.py new file mode 
100644 index 0000000..638ebf4 --- /dev/null +++ b/vllm_kunlun/transformer_utils/config.py @@ -0,0 +1,27 @@ +from transformers import PretrainedConfig +from vllm.transformers_utils.config import LazyConfigDict, _CONFIG_REGISTRY + +_XPU_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( + chatglm="ChatGLMConfig", + deepseek_vl_v2="DeepseekVLV2Config", + deepseek_v3="DeepseekV3Config", + deepseek_v32="DeepseekV3Config", + glm_moe_dsa="DeepseekV3Config", + kimi_vl="KimiVLConfig", + Llama_Nemotron_Nano_VL="Nemotron_Nano_VL_Config", + RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct) + RefinedWebModel="RWConfig", # For tiiuae/falcon-7b(-instruct) + jais="JAISConfig", + mlp_speculator="MLPSpeculatorConfig", + medusa="MedusaConfig", + midashenglm="MiDashengLMConfig", + eagle="EAGLEConfig", + speculators="SpeculatorsConfig", + nemotron="NemotronConfig", + olmo3="Olmo3Config", + ovis="OvisConfig", + ultravox="UltravoxConfig", + step3_vl="Step3VLConfig", + step3_text="Step3TextConfig", + qwen3_next="Qwen3NextConfig", +)