[Doc] Optimize the document (#136)

2026-01-22 14:12:44 +08:00
parent 58f570ddea
commit 9e13f23661
6 changed files with 100 additions and 40 deletions
--- a/docs/source/tutorials/multi_xpu_GLM-4.5.md
+++ b/docs/source/tutorials/multi_xpu_GLM-4.5.md
@@ -113,7 +113,16 @@ python -m vllm.entrypoints.openai.api_server \
      --no-enable-chunked-prefill \
      --distributed-executor-backend mp \
      --served-model-name GLM-4.5 \
-      --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", "vllm.unified_attention", "vllm.unified_attention_with_output", "vllm.mamba_mixer2"]}'  > log_glm_plugin.txt 2>&1 &
+      --compilation-config '{"splitting_ops": ["vllm.unified_attention",
+                                                "vllm.unified_attention_with_output",
+                                                "vllm.unified_attention_with_output_kunlun",
+                                                "vllm.mamba_mixer2",
+                                                "vllm.mamba_mixer",
+                                                "vllm.short_conv",
+                                                "vllm.linear_attention",
+                                                "vllm.plamo2_mamba_mixer",
+                                                "vllm.gdn_attention",
+                                                "vllm.sparse_attn_indexer"]}'  > log_glm_plugin.txt 2>&1 &
 ```

 If your service starts successfully, you will see output similar to the following:
--- a/docs/source/tutorials/multi_xpu_Qwen3-Coder-480B-A35B(W8A8).md
+++ b/docs/source/tutorials/multi_xpu_Qwen3-Coder-480B-A35B(W8A8).md
@@ -16,7 +16,8 @@ if [ $XPU_NUM -gt 0 ]; then
    DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl"
 fi

-export build_image="xxxxxxxxxxxxxxxxx" 
+export build_image="iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.32"
+

 docker run -itd ${DOCKER_DEVICE_CONFIG} \
    --net=host \
@@ -32,14 +33,12 @@ docker run -itd ${DOCKER_DEVICE_CONFIG} \

 ### Prepare the Weights

-* Pull Qwen3-Coder-480B-A35B-Instruct bf16 weights
-* Modify the weights configuration.json file and add the fields quantization_config and compression_config.
+- Pull Qwen3-Coder-480B-A35B-Instruct bf16 weights
+- Modify the weights configuration.json file and add the fields quantization_config and compression_config.

 ```json
 {
-  "architectures": [
-    "Qwen3MoeForCausalLM"
-  ],
+  "architectures": ["Qwen3MoeForCausalLM"],
  "attention_dropout": 0.0,
  "decoder_sparse_step": 1,
  "eos_token_id": 151645,
@@ -61,7 +60,7 @@ docker run -itd ${DOCKER_DEVICE_CONFIG} \
  "num_key_value_heads": 8,
  "output_router_logits": false,
  "qkv_bias": false,
-  "rms_norm_eps": 1e-06,
+  "rms_norm_eps": 1e-6,
  "rope_scaling": null,
  "rope_theta": 10000000,
  "router_aux_loss_coef": 0.0,
@@ -104,7 +103,6 @@ docker run -itd ${DOCKER_DEVICE_CONFIG} \
    "sparsity_config": null
  }
 }
-
 ```

 ### Online Serving on Multi XPU
@@ -128,5 +126,15 @@ python3 -m vllm.entrypoints.openai.api_server \
 --enable-chunked-prefill=False \
 --no-enable-prefix-caching \
 --disable-log-requests \
- --gpu-memory-utilization 0.85
-```
+ --gpu-memory-utilization 0.9 \
+ --compilation-config '{"splitting_ops": ["vllm.unified_attention",
+                                                "vllm.unified_attention_with_output",
+                                                "vllm.unified_attention_with_output_kunlun",
+                                                "vllm.mamba_mixer2",
+                                                "vllm.mamba_mixer",
+                                                "vllm.short_conv",
+                                                "vllm.linear_attention",
+                                                "vllm.plamo2_mamba_mixer",
+                                                "vllm.gdn_attention",
+                                                "vllm.sparse_attn_indexer"]}' 2>&1 | tee output_p800.log
+```
--- a/docs/source/tutorials/single_xpu_Qwen3-8B.md
+++ b/docs/source/tutorials/single_xpu_Qwen3-8B.md
@@ -128,9 +128,16 @@ python -m vllm.entrypoints.openai.api_server \
      --no-enable-chunked-prefill \
      --distributed-executor-backend mp \
      --served-model-name Qwen3-8B \
-      --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun",
-            "vllm.unified_attention", "vllm.unified_attention_with_output",
-            "vllm.mamba_mixer2"]}' \
+      --compilation-config '{"splitting_ops": ["vllm.unified_attention",
+                                                "vllm.unified_attention_with_output",
+                                                "vllm.unified_attention_with_output_kunlun",
+                                                "vllm.mamba_mixer2",
+                                                "vllm.mamba_mixer",
+                                                "vllm.short_conv",
+                                                "vllm.linear_attention",
+                                                "vllm.plamo2_mamba_mixer",
+                                                "vllm.gdn_attention",
+                                                "vllm.sparse_attn_indexer"]}' \
 ```

 If your service starts successfully, you will see output similar to the following: