[Bugs] Fix Docs Build Problem (#97)
* [Bugs] Docs fixed * Update contributing.md * Update index.md * fix lua to text * fix title size
This commit is contained in:
87
docs/Dockerfile.xpu
Normal file
87
docs/Dockerfile.xpu
Normal file
@@ -0,0 +1,87 @@
|
||||
#
|
||||
# Copyright (c) 2025 Baidu Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-kunlun project.
|
||||
#
|
||||
# This file is mainly Adapted from vllm-project/vllm/vllm/envs.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
FROM iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.37_base

# Use bash with pipefail so a failing command early in a pipeline fails the
# whole RUN, and use an absolute path to the shell.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Build-time only: suppress interactive apt prompts during the build without
# leaking the setting into the runtime environment (it is deliberately NOT
# repeated in the ENV block below).
ARG DEBIAN_FRONTEND=noninteractive

# Base OS packages: certificates/timezone data, basic editors and network
# tools, plus the compiler toolchain needed to build vllm-kunlun.
# `update`, `install` and list cleanup stay in one layer so the apt cache is
# never persisted in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates tzdata vim net-tools \
        gcc g++ cmake libnuma-dev \
        wget tmux curl \
        software-properties-common && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Dedicated conda environment for vllm-kunlun.
# The original `conda run -n <env> &&` carried no command (a no-op at best);
# activating the environment directly is sufficient.
RUN conda init && conda create --name vllm_kunlun_0.10.1.1 python=3.10.15 -y && \
    source activate vllm_kunlun_0.10.1.1 && \
    conda clean -afy

# PyTorch stack pinned to the cu118 builds; purge pip caches in the same
# layer so they do not bloat the image.
RUN source activate vllm_kunlun_0.10.1.1 && \
    pip install torch==2.5.1+cu118 torchvision==0.20.1+cu118 torchaudio==2.5.1+cu118 --index-url https://download.pytorch.org/whl/cu118 && \
    pip cache purge && rm -rf /root/.cache/pip

# Python tooling and test/runtime dependencies (mostly version-pinned for
# reproducibility), installed from the internal mirror.
# Duplicate entries from the original list (pytest-timeout, py-cpuinfo) were
# removed; the resolved set is unchanged.
RUN source activate vllm_kunlun_0.10.1.1 && \
    pip install setuptools==80.9.0 cuda_mock==1.1.1 hyperparameter==0.5.6 black==23.3.0 lark==1.2.2 \
        networkx wheel PyYAML==6.0.2 ipython h5py regex==2024.9.11 colorama==0.4.6 pynvml==11.5.3 \
        nvidia-cuda-runtime-cu11 tabulate==0.9.0 openpyxl==3.1.5 pandas prettytable \
        pytest==8.1.0 pytest-repeat==0.9.3 pytest-timeout==2.3.1 py==1.11.0 datasets==2.16.0 \
        pydantic==2.9.2 psutil==6.1.0 einops==0.8.0 \
        pytest-html==4.1.1 py-cpuinfo termcolor jsonlines==4.0.0 tiktoken \
        qwen_vl_utils filetype fastapi==0.112.1 fire transformers==4.57.1 \
        gradio==4.0.0 sse-starlette trl==0.8.6 uvicorn accelerate==0.30.1 --index-url https://pip.baidu-int.com/simple/ && \
    pip cache purge && rm -rf /root/.cache/pip

# Runtime environment. DEBIAN_FRONTEND is intentionally absent here: it is a
# build-time knob (see the ARG above) and exporting it would change apt
# behaviour for interactive users of the running container.
ENV LANG=en_US.UTF-8 \
    LANGUAGE=en_US:en \
    TERM=xterm-256color \
    PATH=/root/miniconda/envs/vllm_kunlun_0.10.1.1/bin/:$PATH

WORKDIR /workspace

# Mellanox OFED user-space stack (required by the xccl transport).
# Download, install and clean up in a SINGLE layer so neither the large
# archive nor the apt lists persist in any image layer.
# NOTE: `apt-get update` must be rerun here — the package lists were removed
# in an earlier layer, so a bare `apt-get install` would fail.
RUN wget https://su.bcebos.com/v1/klx-sdk-release-public/xccl/resource/MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu20.04-x86_64.tgz && \
    tar -xf MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu20.04-x86_64.tgz && \
    apt-get update && \
    apt-get install -y -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' \
        flex swig tk debhelper libltdl-dev libusb-1.0-0 tcl chrpath pkg-config graphviz bison && \
    cd MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu20.04-x86_64 && \
    ./mlnxofedinstall --user-space-only --skip-distro-check --without-fw-update --force && \
    cd /workspace && \
    rm -rf /workspace/MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu20.04-x86_64* && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Build and install vllm-kunlun from the copied source tree; drop the build
# artefacts in the same layer to keep the image small.
COPY vllm-kunlun/ /workspace/vllm-kunlun/
RUN cd /workspace/vllm-kunlun && \
    bash dockerfile/install.sh && \
    rm -rf /workspace/vllm-kunlun/build/ /workspace/vllm-kunlun/vllm_kunlun.egg-info/

# xpu-smi tools: install the XRE binaries and shared objects system-wide,
# then remove the staging copy in the same layer.
COPY xre-Linux-x86_64-5.2.0.0/ /workspace/xre-Linux-x86_64-5.2.0.0/
RUN mv /workspace/xre-Linux-x86_64-5.2.0.0/bin/* /usr/local/bin/ && \
    mv /workspace/xre-Linux-x86_64-5.2.0.0/so/* /lib/x86_64-linux-gnu/ && \
    rm -rf /workspace/xre-Linux-x86_64-5.2.0.0/

# Final cache cleanup.
# NOTE(review): removing files in a separate layer only hides them via
# whiteouts; it does not shrink the earlier layers that created them.
RUN rm -rf \
    /root/.cache \
    /root/.conda \
    /tmp/*

CMD ["/bin/bash"]
|
||||
@@ -5,52 +5,53 @@
|
||||
uv venv myenv --python 3.12 --seed
|
||||
source myenv/bin/activate
|
||||
|
||||
# 步骤1:进入docs目录
|
||||
|
||||
# Step 1: Enter the docs directory
|
||||
cd docs
|
||||
|
||||
# 步骤2:安装依赖(使用uv)
|
||||
# Step 2: Install dependencies (using uv)
|
||||
uv pip install -r requirements-docs.txt
|
||||
|
||||
# 安装 sphinx-autobuild(如果没在 requirements 文件里)
|
||||
# Install sphinx-autobuild (if not in requirements file)
|
||||
uv pip install sphinx-autobuild
|
||||
|
||||
# 从 docs 目录运行:
|
||||
# Run from the docs directory:
|
||||
sphinx-autobuild ./source ./_build/html --port 8000
|
||||
|
||||
# 步骤1:清理旧文件
|
||||
# Step 1: Clean up old files
|
||||
make clean
|
||||
|
||||
# 步骤2:构建HTML
|
||||
# Step 2: Build HTML
|
||||
make html
|
||||
|
||||
# 步骤3:本地预览
|
||||
# Step 3: Local preview
|
||||
python -m http.server -d _build/html/
|
||||
|
||||
浏览器访问:http://localhost:8000
|
||||
Browser access: http://localhost:8000
|
||||
|
||||
🌍 Internationalization
|
||||
国际化翻译流程(以中文为例)
|
||||
Internationalization translation process (taking Chinese as an example)
|
||||
|
||||
# 步骤1:提取可翻译文本(生成 .pot)
|
||||
# Step 1: Extract translatable text (generate .pot)
|
||||
sphinx-build -b gettext source _build/gettext
|
||||
|
||||
# 步骤2:生成/更新中文 .po 文件
|
||||
# Step 2: Generate/update Chinese .po file
|
||||
sphinx-intl update -p _build/gettext -l zh_CN
|
||||
|
||||
# 步骤3:人工翻译 .po 文件
|
||||
# 用文本编辑器打开 source/locale/zh_CN/LC_MESSAGES/*.po
|
||||
# 在 msgstr "" 里填入中文翻译
|
||||
# Step 3: Manually translate .po file
|
||||
# Use a text editor to open source/locale/zh_CN/LC_MESSAGES/*.po
|
||||
# Fill in the Chinese translation in msgstr ""
|
||||
|
||||
# 步骤4:编译并构建中文文档
|
||||
# Step 4: Compile and build Chinese documentation
|
||||
make intl
|
||||
|
||||
# 步骤5:查看效果
|
||||
# Step 5: View the effect
|
||||
python -m http.server -d _build/html
|
||||
|
||||
|
||||
浏览器访问:
|
||||
Browser access:
|
||||
|
||||
英文版: http://localhost:8000
|
||||
中文版: http://localhost:8000/zh-cn
|
||||
English version: http://localhost:8000
|
||||
Chinese version: http://localhost:8000/zh-cn
|
||||
|
||||
```
|
||||
|
||||
14
docs/envs.py
14
docs/envs.py
@@ -47,18 +47,15 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
||||
# The C compiler used for compiling the package. If not set, the default
|
||||
# value is None, which means the system default C compiler will be used.
|
||||
"C_COMPILER": lambda: os.getenv("C_COMPILER", None),
|
||||
# The version of the Kunlun chip. If not set, the default value is
|
||||
# KUNLUN910B1(Available for A2 and A3 series). It's used for package building.
|
||||
# Please make sure that the version is correct.
|
||||
"SOC_VERSION": lambda: os.getenv("SOC_VERSION", "KUNLUN910B1"),
|
||||
|
||||
"SOC_VERSION": lambda: os.getenv("SOC_VERSION", "KUNLUNP800"),
|
||||
# If set, vllm-kunlun will print verbose logs during compilation
|
||||
"VERBOSE": lambda: bool(int(os.getenv("VERBOSE", "0"))),
|
||||
# The home path for CANN toolkit. If not set, the default value is
|
||||
# /usr/local/Kunlun/kunlun-toolkit/latest
|
||||
"KUNLUN_HOME_PATH": lambda: os.getenv("KUNLUN_HOME_PATH", None),
|
||||
# The path for HCCL library, it's used by pyhccl communicator backend. If
|
||||
# not set, the default value is libhccl.so。
|
||||
"HCCL_SO_PATH": lambda: os.environ.get("HCCL_SO_PATH", None),
|
||||
# The path for XCCL library, it's used by pyxccl communicator backend. If
|
||||
# not set, the default value is libxccl.so。
|
||||
"XCCL_SO_PATH": lambda: os.environ.get("XCCL_SO_PATH", None),
|
||||
# The version of vllm is installed. This value is used for developers who
|
||||
# installed vllm from source locally. In this case, the version of vllm is
|
||||
# usually changed. For example, if the version of vllm is "0.9.0", but when
|
||||
@@ -119,7 +116,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
||||
# and the mla_pa will be the default path of deepseek decode path.
|
||||
"VLLM_KUNLUN_MLA_PA": lambda: int(os.getenv("VLLM_KUNLUN_MLA_PA", 0)),
|
||||
# Whether to enable MatmulAllReduce fusion kernel when tensor parallel is enabled.
|
||||
# this feature is supported in A2, and eager mode will get better performance.
|
||||
"VLLM_KUNLUN_ENABLE_MATMUL_ALLREDUCE": lambda: bool(
|
||||
int(os.getenv("VLLM_KUNLUN_ENABLE_MATMUL_ALLREDUCE", "0"))
|
||||
),
|
||||
|
||||
@@ -35,4 +35,5 @@
|
||||
| Yijin Qiao |
|
||||
| Chenchao Hu |
|
||||
| Weijie Hong |
|
||||
| Song Jiang |
|
||||
| Song Jiang |
|
||||
| Hongwei Ma |
|
||||
@@ -4,7 +4,7 @@
|
||||
It's recommended to set up a local development environment to build vllm-kunlun and run tests
|
||||
before you submit a PR.
|
||||
|
||||
#### Run models locally
|
||||
### Run models locally
|
||||
|
||||
After completing the Run lint setup shown in the quickstart, you can run your changes locally:
|
||||
|
||||
@@ -40,7 +40,7 @@ python -m vllm.entrypoints.openai.api_server \
|
||||
```
|
||||
Please save a screenshot of your service running successfully, and attach an accuracy report.
|
||||
|
||||
#### Submit the commit
|
||||
### Submit the commit
|
||||
|
||||
```bash
|
||||
# Commit changed files using `-s`
|
||||
@@ -74,10 +74,3 @@ If the PR spans more than one category, please include all relevant prefixes.
|
||||
## Others
|
||||
|
||||
If you find any problem when contributing, you can join our slack group to talk with us and then feel free to submit a PR to improve the doc to help other developers.
|
||||
|
||||
:::{toctree}
|
||||
:caption: Index
|
||||
:maxdepth: 1
|
||||
testing
|
||||
multi_node_test
|
||||
:::
|
||||
@@ -1,5 +1,69 @@
|
||||
# Contributing
|
||||
|
||||
## Building and Testing
|
||||
It's recommended to set up a local development environment to build vllm-kunlun and run tests
|
||||
before you submit a PR.
|
||||
|
||||
Coming soon...
|
||||
### Run models locally
|
||||
|
||||
After completing the Run lint setup shown in the quickstart, you can run your changes locally:
|
||||
|
||||
```{code-block} bash
|
||||
:substitutions:
|
||||
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--host 0.0.0.0 \
|
||||
--port 8356 \
|
||||
--model /your_modified_models\
|
||||
--trust-remote-code \
|
||||
--tensor-parallel-size 1 \
|
||||
--no-enable-prefix-caching \
|
||||
--no-enable-chunked-prefill \
|
||||
--distributed-executor-backend mp \
|
||||
--served-model-name your_modified_models \
|
||||
--compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun",
|
||||
"vllm.unified_attention", "vllm.unified_attention_with_output",
|
||||
"vllm.mamba_mixer2"]}' \
|
||||
```
|
||||
Please save a screenshot of your service running successfully, and attach an accuracy report.
|
||||
|
||||
### Submit the commit
|
||||
|
||||
```bash
|
||||
# Commit changed files using `-s`
|
||||
git commit -sm "your commit info"
|
||||
```
|
||||
|
||||
🎉 Congratulations! You have completed the development environment setup.
|
||||
|
||||
|
||||
## PR Title and Classification
|
||||
|
||||
Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:
|
||||
|
||||
- `[Attention]` for new features or optimization in attention.
|
||||
- `[Communicator]` for new features or optimization in communicators.
|
||||
- `[ModelRunner]` for new features or optimization in model runner.
|
||||
- `[Platform]` for new features or optimization in platform.
|
||||
- `[Worker]` for new features or optimization in worker.
|
||||
- `[Core]` for new features or optimization in the core vllm-kunlun logic (such as platform, attention, communicators, model runner)
|
||||
- `[Kernel]` for changes affecting compute kernels and ops.
|
||||
- `[Bugfix]` for bug fixes.
|
||||
- `[Doc]` for documentation fixes and improvements.
|
||||
- `[Test]` for tests (such as unit tests).
|
||||
- `[CI]` for build or continuous integration improvements.
|
||||
- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly.
|
||||
|
||||
:::{note}
|
||||
If the PR spans more than one category, please include all relevant prefixes.
|
||||
:::
|
||||
|
||||
## Others
|
||||
|
||||
If you find any problem when contributing, you can join our slack group to talk with us and then feel free to submit a PR to improve the doc to help other developers.
|
||||
|
||||
:::{toctree}
|
||||
:caption: Index
|
||||
:maxdepth: 1
|
||||
contributing
|
||||
:::
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
## Operator accuracy test
|
||||
# Operator accuracy test
|
||||
|
||||
### torch_xray
|
||||
## torch_xray
|
||||
|
||||
torch_xray is an operator precision analysis tool that can dump module-level input-output precision comparisons and automatically construct operator unit tests.
|
||||
|
||||
#### 1.Download and install
|
||||
### 1.Download and install
|
||||
|
||||
***\*python3.10:\****
|
||||
|
||||
@@ -20,9 +20,9 @@ bos:/klx-sdk-release-public/xpytorch/dev_kl3/torch_xray/latest/torch_xray-999.9.
|
||||
|
||||
Note that the same installation package must be used when using it in different environments.
|
||||
|
||||
#### 2.Use
|
||||
### 2.Use
|
||||
|
||||
##### Dump module-level inputs and outputs and compare their precision.
|
||||
#### Dump module-level inputs and outputs and compare their precision.
|
||||
|
||||
Below is a sample code snippet used to dump the input and output of the vision module and compare the errors in the vllm framework.
|
||||
|
||||
@@ -50,7 +50,7 @@ The results directory will generate an h5 file and a csv file.
|
||||
-rw-r--r-- 1 root root 71 Oct 31 13:11 globalrank-0_localrank-0_summary.csv
|
||||
```
|
||||
|
||||
##### Data processing
|
||||
#### Data processing
|
||||
|
||||
```bash
|
||||
summary xxx.h5 sum.txt
|
||||
@@ -91,7 +91,7 @@ The generated h5 file is processed using the summary command to generate a txt f
|
||||
+-------+------+------+-----------------------------------------------------------+-------------+-------------+--------------+-------------+
|
||||
```
|
||||
|
||||
##### Accuracy Comparison
|
||||
#### Accuracy Comparison
|
||||
|
||||
```bash
|
||||
# The results are stored in result.csv
|
||||
@@ -103,7 +103,7 @@ The `compare` command is used to process the H5 files generated on the GPU and X
|
||||
If you encounter a "no matched keys" problem, please refer to the instructions at the end of this article for a solution.
|
||||
|
||||
|
||||
##### Example of results
|
||||
#### Example of results
|
||||
|
||||
```bash
|
||||
+-------+--------+-----------------------------------------------------------+--------+-----------+-------------+-------------+--------+
|
||||
@@ -141,11 +141,11 @@ If you encounter a "no matched keys" problem, please refer to the instructions a
|
||||
|
||||
Generally, the main focus is on Min Err/Max Err.
|
||||
|
||||
##### Indicator Explanation
|
||||
#### Indicator Explanation
|
||||
|
||||
To be improved...
|
||||
|
||||
#### The dump operator is tested and run.
|
||||
### The dump operator is tested and run.
|
||||
|
||||
```bash
|
||||
X_DEBUG=0x102 # trace operator name、arguments shape、dtype、data_range
|
||||
@@ -199,13 +199,13 @@ This is the file directory.
|
||||
│ ├── dump.json # Information needed to generate unit tests, such as input/output size and dtype.
|
||||
```
|
||||
|
||||
##### Generate unit test
|
||||
#### Generate unit test
|
||||
|
||||
jprof --cpu_init --blacklist --factory=load dump.json
|
||||
|
||||
Create a pytests directory in the current directory to store unit tests.
|
||||
|
||||
##### Run unit test
|
||||
#### Run unit test
|
||||
|
||||
The GPU only needs to copy the XPU's pytests directory and execute it.
|
||||
|
||||
@@ -216,14 +216,14 @@ Since the unit test program defaults to finding the actual dumped tensors using
|
||||
pytest --detail_compare_path=./xxx.csv proc_xxx/pytests/ --seed 42
|
||||
```
|
||||
|
||||
##### Results Comparison
|
||||
#### Results Comparison
|
||||
|
||||
```bash
|
||||
# After obtaining two result CSV files, compare them and generate result.csv.
|
||||
summary_diff_check ./xpu.csv ./gpu.csv ./result.csv
|
||||
```
|
||||
|
||||
##### Example of results
|
||||
#### Example of results
|
||||
|
||||
```bash
|
||||
+------------+-----------------------+-------------+-------------+-----------+----------+---------+---------+----------+
|
||||
@@ -242,9 +242,9 @@ summary_diff_check ./xpu.csv ./gpu.csv ./result.csv
|
||||
|
||||
The main focus is on the values of gpu_1e-1, xpu_1e-1, etc., which represent the number of elements whose error between the gpu/xpu result and the cpu result exceeds the order of 1e-n. This serves as the primary basis for determining whether there is a problem with the operator's precision.
|
||||
|
||||
#### Replenish
|
||||
### Replenish
|
||||
|
||||
##### Bypassing the issue of differing naming conventions between Kunlun Card and GPU modules, which prevents diff calculation.
|
||||
#### Bypassing the issue of differing naming conventions between Kunlun Card and GPU modules, which prevents diff calculation.
|
||||
|
||||
```bash
|
||||
#
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
## Overall accuracy test
|
||||
# Overall accuracy test
|
||||
|
||||
### EvalScope
|
||||
## EvalScope
|
||||
|
||||
#### 1.Download and install
|
||||
### 1.Download and install
|
||||
|
||||
EvalScope supports use in Python environments. Users can install EvalScope via pip or from source code. Here are examples of both installation methods:
|
||||
|
||||
@@ -15,7 +15,7 @@ cd evalscope
|
||||
pip install -e '.[perf]'
|
||||
```
|
||||
|
||||
#### 2.Dataset preparation script
|
||||
### 2.Dataset preparation script
|
||||
|
||||
```python
|
||||
from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
|
||||
@@ -88,20 +88,24 @@ if not os.path.exists(output_dir): # Step 4: Check if the directory exists
|
||||
# dump the mixed data to a jsonl file
|
||||
dump_jsonl_data(mixed_data, output_path) # Step 6: Securely write to the file
|
||||
```
|
||||
|
||||
Dataset composition visualization:
|
||||
|
||||
```
|
||||
┌───────────────────────────────────────┐
|
||||
│ VL-Test (1000 samples) │
|
||||
├─────────────────┬─────────────────────┤
|
||||
│ PureText │ Vision │
|
||||
│ (333 样本) │ (667 样本) │
|
||||
│ (333 samples) │ (667 samples) │
|
||||
├─────────────────┼─────────────────────┤
|
||||
│ • mmlu_pro │ • math_vista │
|
||||
│ • ifeval │ • mmmu_pro │
|
||||
│ • gsm8k │ │
|
||||
└─────────────────┴─────────────────────┘
|
||||
```
|
||||
#### 3.Test
|
||||
|
||||
### 3.Test
|
||||
|
||||
```python
|
||||
from dotenv import dotenv_values
|
||||
|
||||
@@ -134,13 +138,14 @@ task_cfg = TaskConfig(
|
||||
|
||||
run_task(task_cfg=task_cfg)
|
||||
```
|
||||
|
||||
Parameter Tuning Guide:
|
||||
|
||||
| Parameter | Current value | Effect | Adjustment suggestions |
|
||||
| ----------------- | ------ | --------------- | ----------------------- |
|
||||
| `temperature` | 0.6 | Control output diversity | Math problems ↓ 0.3 / Creative writing ↑ 0.9 |
|
||||
| `top_p` | 0.95 | Filtering low-probability tokens | Reduce "nonsense" |
|
||||
| `eval_batch_size` | 5 | Number of requests processed in parallel | With sufficient video memory, it can be increased to 10. |
|
||||
| Parameter | Current value | Effect | Adjustment suggestions |
|
||||
| ----------------- | ------------- | ---------------------------------------- | -------------------------------------------------------- |
|
||||
| `temperature` | 0.6 | Control output diversity | Math problems ↓ 0.3 / Creative writing ↑ 0.9 |
|
||||
| `top_p` | 0.95 | Filtering low-probability tokens | Reduce "nonsense" |
|
||||
| `eval_batch_size` | 5 | Number of requests processed in parallel | With sufficient video memory, it can be increased to 10. |
|
||||
|
||||
Run the test:
|
||||
|
||||
@@ -167,20 +172,22 @@ python accuracy.py 2>&1 | tee "$LOG_FILE"
|
||||
# ========================================
|
||||
EXIT_CODE=${PIPESTATUS[0]}
|
||||
if [ $EXIT_CODE -eq 0 ]; then
|
||||
echo "✅ 评测完成! 日志已保存到: $LOG_FILE"
|
||||
echo "✅ Evaluation completed! Log saved to: $LOG_FILE"
|
||||
else
|
||||
echo "❌ 评测失败! 退出码: $EXIT_CODE 请查看日志: $LOG_FILE"
|
||||
echo "❌ Evaluation failed! Exit code: $EXIT_CODE Please check the log: $LOG_FILE"
|
||||
fi
|
||||
```
|
||||
#### 4.Common problem fixes
|
||||
|
||||
##### 4.1 NLTK resource missing fix
|
||||
### 4.Common problem fixes
|
||||
|
||||
#### 4.1 NLTK resource missing fix
|
||||
|
||||
```bash
|
||||
Resource punkt_tab not found.
|
||||
```
|
||||
|
||||
Solution:
|
||||
|
||||
```python
|
||||
import nltk
|
||||
import os
|
||||
@@ -193,13 +200,13 @@ os.makedirs(download_dir, exist_ok=True)
|
||||
nltk.data.path.append(download_dir)
|
||||
|
||||
# Step 3: Download necessary resources
|
||||
print("🔽 开始下载punkt_tab资源...")
|
||||
print("🔽 Start downloading punkt_tab resource...")
|
||||
try:
|
||||
nltk.download("punkt_tab", download_dir=download_dir)
|
||||
print("✅ 下载成功!")
|
||||
print("✅ Download successful!")
|
||||
except Exception as e:
|
||||
print(f"❌ 下载失败: {e}")
|
||||
print("💡 备选方案:手动从GitHub下载")
|
||||
print(f"❌ Download failed: {e}")
|
||||
print("💡 Alternative: Download manually from GitHub")
|
||||
print(
|
||||
" URL: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip"
|
||||
)
|
||||
@@ -218,7 +225,7 @@ python fix_nltk.py
|
||||
bash run_accuracy_test.sh
|
||||
```
|
||||
|
||||
#### 5.Results Display
|
||||
### 5.Results Display
|
||||
|
||||
```bash
|
||||
+-------------+---------------------+--------------+---------------+-------+
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
## Operator performance
|
||||
# Operator performance
|
||||
|
||||
### XProfiler
|
||||
## XProfiler
|
||||
|
||||
#### 1.Download and install
|
||||
### 1.Download and install
|
||||
|
||||
- The download link for the x86_64 platform installation package xre-Linux-x86_64 is:
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
After downloading and extracting, you can directly execute `xpu-installer` and `install_rt.sh` to install.
|
||||
|
||||
#### 2.Start using
|
||||
### 2.Start using
|
||||
|
||||
XProfiler supports three modes: 1) fork mode; 2) time mode; and 3) daemon mode. After execution, XProfiler will generate two types of JSON files:
|
||||
|
||||
@@ -26,7 +26,7 @@ XProfiler supports three modes: 1) fork mode; 2) time mode; and 3) daemon mode.
|
||||
|
||||
The specific modes will be introduced below.
|
||||
|
||||
##### fork mode
|
||||
#### fork mode
|
||||
|
||||
The fork pattern is used to track the entire time period from the start to the end of a user program. This pattern is suitable for most inference tasks and is the simplest to use. An example is shown below:
|
||||
|
||||
@@ -34,13 +34,13 @@ The fork pattern is used to track the entire time period from the start to the e
|
||||
/xxxx/xxxx/xprofiler -r500 --xpu=0 python test.py
|
||||
```
|
||||
|
||||
* --r: Sets the trace time resolution in nanoseconds (ns). The default is 100. If an "out of space error" occurs, try increasing the -r value to 500.
|
||||
- --r: Sets the trace time resolution in nanoseconds (ns). The default is 100. If an "out of space error" occurs, try increasing the -r value to 500.
|
||||
|
||||
* --xpu: Specifies the acquisition device ID, supporting multi-card configuration. --xpu=all enables all cards; the default is card 0.
|
||||
- --xpu: Specifies the acquisition device ID, supporting multi-card configuration. --xpu=all enables all cards; the default is card 0.
|
||||
|
||||
More parameters can be found in the command-line parameters section later.
|
||||
|
||||
##### time mode
|
||||
#### time mode
|
||||
|
||||
The time mode is used to track user programs for a period of time. This method is suitable for tasks that need to run for a long time.
|
||||
|
||||
@@ -58,12 +58,12 @@ A temporary .sock file will be generated in the execution directory. The path ne
|
||||
|
||||
```bash
|
||||
export XPU_ENABLE_PROFILER_TRACING=1
|
||||
export XPU_TRACING_OUTPUT_NAME=<xprofiler 执行目录>/xprofiler.sock
|
||||
export XPU_TRACING_OUTPUT_NAME=<xprofiler execution directory>/xprofiler.sock
|
||||
# Start your own program
|
||||
python xxx.py
|
||||
```
|
||||
|
||||
##### daemon mode
|
||||
#### daemon mode
|
||||
|
||||
The daemon mode is used to track the event timeline of a specified code segment, eliminating interference from redundant information. The startup command is the same as in fork mode.
|
||||
|
||||
@@ -99,49 +99,49 @@ xprofiler.sock
|
||||
```python
|
||||
export XPU_ENABLE_PROFILER_TRACING=1
|
||||
# Here, the path to the .sock file from step 2 is used for assignment.
|
||||
export XPU_TRACING_OUTPUT_NAME=<xprofiler 执行目录>/xprofiler.sock
|
||||
export XPU_TRACING_OUTPUT_NAME=<xprofiler execution directory>/xprofiler.sock
|
||||
# Start your own program
|
||||
python xxx.py
|
||||
```
|
||||
|
||||
Note: If you want to specify a particular card to run on, you must import the XPU_VISIBLE_DEVICES environment variable in the terminal in steps 2 and 3; otherwise, you will not be able to capture the data.
|
||||
|
||||
##### More parameters
|
||||
#### More parameters
|
||||
|
||||
| parameters | Example | default value | describe |
|
||||
| -------------------------- | --------------------------------------- | ------ | ------------------------------------------------------------ |
|
||||
| -b or --buffer-size | -b=512 | 256 | Specifies the size of the trace buffer in MB. This is generally not required. However, if there are many trace signals, the buffer size can be increased appropriately to avoid OOS (Out of Size). |
|
||||
| -x or --xpu | -x=0--xpu=0 | 0 | Set the card number to be tracked; multiple cards or all cards can be set. |
|
||||
| -t or --time | -t=10 | off | Enable time mode, in seconds, to capture information over a specified period. |
|
||||
| -d or --deamonize | -r500 | 0 | Enable daemon mode to retrieve events in the background. |
|
||||
| -r or --export-profile | -e ./trace_output-e ./output/trace.json | ./ | Record the trace results to a document or folder. If this parameter is not specified, a default xprofiler.trace.json file will be generated in the execution directory. |
|
||||
| -S or --settings | -S xprofiler.trace.json | off | xprofiler reads a JSON file containing the events that need to be traced. If this parameter is not configured, xprofiler enables `--profile-api-trace` and `--sse-trace` by default. |
|
||||
| -A or --profiler-api-trace | -A | on | Get driver events. |
|
||||
| -s or --sse-trace | -s | on | Get all SSE events. |
|
||||
| -C or --cluster-trace | -C | off | Retrieve all cluster events. |
|
||||
| -n or --sdnn-trace | -n | off | Get all SDNN events. |
|
||||
| -c or --sdnn-cluster-trace | -c | off | Retrieve all SDNN cluster events. |
|
||||
| -E or --cache-trace | -E | off | Get bandwidth statistics events. |
|
||||
| -u or --debug | -u44:open log,debug level-u0:close log | 33 | Debug the interface and enable driver event/device event logging.。 |
|
||||
| parameters | Example | default value | describe |
|
||||
| -------------------------- | --------------------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| -b or --buffer-size | -b=512 | 256 | Specifies the size of the trace buffer in MB. This is generally not required. However, if there are many trace signals, the buffer size can be increased appropriately to avoid OOS (Out of Size). |
|
||||
| -x or --xpu | -x=0--xpu=0 | 0 | Set the card number to be tracked; multiple cards or all cards can be set. |
|
||||
| -t or --time | -t=10 | off | Enable time mode, in seconds, to capture information over a specified period. |
|
||||
| -d or --deamonize | -r500 | 0 | Enable daemon mode to retrieve events in the background. |
|
||||
| -r or --export-profile | -e ./trace_output-e ./output/trace.json | ./ | Record the trace results to a document or folder. If this parameter is not specified, a default xprofiler.trace.json file will be generated in the execution directory. |
|
||||
| -S or --settings | -S xprofiler.trace.json | off | xprofiler reads a JSON file containing the events that need to be traced. If this parameter is not configured, xprofiler enables `--profile-api-trace` and `--sse-trace` by default. |
|
||||
| -A or --profiler-api-trace | -A | on | Get driver events. |
|
||||
| -s or --sse-trace | -s | on | Get all SSE events. |
|
||||
| -C or --cluster-trace | -C | off | Retrieve all cluster events. |
|
||||
| -n or --sdnn-trace | -n | off | Get all SDNN events. |
|
||||
| -c or --sdnn-cluster-trace | -c | off | Retrieve all SDNN cluster events. |
|
||||
| -E or --cache-trace | -E | off | Get bandwidth statistics events. |
|
||||
| -u or --debug | -u44:open log,debug level-u0:close log | 33 | Debug the interface and enable driver event/device event logging.。 |
|
||||
|
||||
#### 3.View Results
|
||||
### 3.View Results
|
||||
|
||||
The generated xprofiler.trace.json file can be viewed and analyzed using a visual interface. Two tools are introduced here.
|
||||
|
||||
##### Chrome browser
|
||||
#### Chrome browser
|
||||
|
||||
Enter chrome://tracing/ in your browser (you may need to enable developer tools the first time you access this site), and click "load" in the top left corner to import the file. Interface display.
|
||||
|
||||

|
||||
|
||||
##### Perfetto UI
|
||||
#### Perfetto UI
|
||||
|
||||
Search directly, or visit [Perfetto UI](https://ui.perfetto.dev/#!/viewer?local_cache_key). The interface is as follows.
|
||||
|
||||

|
||||
|
||||
#### 4.Performance Analysis
|
||||
### 4.Performance Analysis
|
||||
|
||||
With various performance data available, analysis and optimization can then be performed based on the results.
|
||||
|
||||
(Further details to be added later)
|
||||
(Further details to be added later)
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
## vLLM server performance
|
||||
# vLLM server performance
|
||||
|
||||
### vLLM benchmark CLI
|
||||
## vLLM benchmark CLI
|
||||
|
||||
You can directly use vLLM's CLI benchmark. For more details, please refer to [vLLM Developer Guide Benchmark Suites](https://docs.vllm.ai/en/stable/contributing/benchmarks.html).
|
||||
|
||||
#### 1.Online testing
|
||||
### 1.Online testing
|
||||
|
||||
##### 1.1Start the vLLM server
|
||||
#### 1.1Start the vLLM server
|
||||
|
||||
Server startup script reference
|
||||
|
||||
@@ -37,7 +37,7 @@ python -m vllm.entrypoints.openai.api_server \
|
||||
|
||||
```
|
||||
|
||||
##### 1.2Execute test
|
||||
#### 1.2Execute test
|
||||
|
||||
To run the test script, you can refer to the code below.
|
||||
|
||||
@@ -57,7 +57,7 @@ python -m vllm.entrypoints.cli.main bench serve \
|
||||
--ignore-eos 2>&1 | tee benchmark.log
|
||||
```
|
||||
|
||||
##### 1.3Result
|
||||
#### 1.3Result
|
||||
|
||||
The following content will be displayed after the process is complete.
|
||||
|
||||
@@ -96,15 +96,15 @@ Key Parameter Explanation:
|
||||
| ***\*P99 TPOT\**** | 99% of requests' time per token generation | ↓ The lower the better |
|
||||
| ***\*ITL\**** | Delay between adjacent output tokens | ↓ The lower the better |
|
||||
|
||||
#### 2.Offline testing
|
||||
### 2.Offline testing
|
||||
|
||||
Coming soon...
|
||||
|
||||
### EvalScope
|
||||
## EvalScope
|
||||
|
||||
EvalScope is a comprehensive model testing tool that can test not only model accuracy but also performance. For more information, please visit [EvalScope](https://evalscope.readthedocs.io/en/latest/index.html). A brief introduction follows.
|
||||
|
||||
#### 1.Download and install
|
||||
### 1.Download and install
|
||||
|
||||
EvalScope supports use in Python environments. Users can install EvalScope via pip or from source code. Here are examples of both installation methods:
|
||||
|
||||
@@ -119,11 +119,11 @@ pip install -e '.[perf]'
|
||||
|
||||
After downloading, some modules may be missing, causing the program to fail to run. Just follow the prompts to install them.
|
||||
|
||||
#### 2.Start using
|
||||
### 2.Start using
|
||||
|
||||
The following demonstrates the performance test of the Qwen3-8B in a single-card scenario.
|
||||
|
||||
##### 2.1Start the server
|
||||
#### 2.1Start the server
|
||||
|
||||
The first step is to start the server. The example script is shown below.
|
||||
|
||||
@@ -154,7 +154,7 @@ python -m vllm.entrypoints.openai.api_server \
|
||||
|
||||
```
|
||||
|
||||
##### 2.2 Start EvalScope
|
||||
#### 2.2 Start EvalScope
|
||||
|
||||
Start EvalScope to begin performance testing.
|
||||
|
||||
@@ -175,7 +175,7 @@ evalscope perf \
|
||||
--extra-args '{"ignore_eos": true}'
|
||||
```
|
||||
|
||||
##### 2.3Results Analysis
|
||||
#### 2.3Results Analysis
|
||||
|
||||
The following figure shows the results. You can view other data from a single test through the logs. For the specific meaning of the parameters, please refer to the parameter interpretation in the vLLM benchmark test.
|
||||
|
||||
|
||||
@@ -7,4 +7,5 @@ This document details the performance testing methods for vllm-kunlun and the an
|
||||
:maxdepth: 1
|
||||
benchmark_server
|
||||
benchmark_kernel
|
||||
profiling
|
||||
:::
|
||||
@@ -0,0 +1,418 @@
|
||||
# Profiling
|
||||
|
||||
|
||||
|
||||
## 🔧 Action Plan(Three Phases)
|
||||
### Phase 1️⃣: Multi-Device Log Redirection Configuration
|
||||
#### Background
|
||||
By default, kernel logs from all 8 XPU devices are interleaved and emitted to `stdout`, resulting in:
|
||||
- It becomes impossible to distinguish which log originates from which device.
|
||||
- Timestamps become interleaved, making it difficult to analyze the temporal relationships.
|
||||
- Single-device bottlenecks are masked by global aggregation.
|
||||
|
||||
#### Solution
|
||||
During model initialization, create separate log files for each device.
|
||||
#### Code Explanation (embedded in qwen2.py)
|
||||
```python
|
||||
import os # ← Ensure this is imported at the top of the file
|
||||
from vllm.distributed import get_tensor_model_parallel_rank # ← Import function to get the tensor model parallel rank
|
||||
|
||||
class Qwen2Model(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer):
|
||||
super().__init__()
|
||||
|
||||
# ========== [Expert Solution] Kunlun XPU Multi-Device Log Redirection ==========
|
||||
try:
|
||||
# Step 1: Get the current XPU device's rank (0~7)
|
||||
rank = get_tensor_model_parallel_rank()
|
||||
|
||||
# Step 2: Create log directory (works with your get_kernel_time_ex.py)
|
||||
log_dir = "./xpu_logs"
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
|
||||
# Step 3: Generate a separate log file for each device
|
||||
log_file = os.path.join(log_dir, f"rank_{rank}.log")
|
||||
|
||||
# Step 4: Core operation – redirect file descriptors
|
||||
# os.O_TRUNC: Clear previous logs on each run to avoid mixing outputs
|
||||
fd = os.open(log_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o664)
|
||||
os.dup2(fd, 1) # Redirect stdout → rank_X.log
|
||||
os.dup2(fd, 2) # Redirect stderr → rank_X.log
|
||||
os.close(fd) # Close original file descriptor; redirection persists
|
||||
|
||||
# Optional: print a confirmation message (will go into rank_X.log)
|
||||
print(f"[Qwen2Model Init] Rank {rank} log redirected to {log_file}")
|
||||
|
||||
except Exception as e:
|
||||
# Fallback mechanism: failure to redirect logs does not affect model loading
|
||||
print(f"[WARNING] Failed to redirect log for rank: {e}", flush=True)
|
||||
# ========== End of log redirection code ==========
|
||||
|
||||
```
|
||||
#### ⚠️ Common Issues
|
||||
**Q1**:Why not use Python's `logging` module?
|
||||
**A**:The XPU runtime kernel logs are emitted from the C++ layer and cannot be captured by Python’s `logging` module. Redirection via low-level file descriptors is required.
|
||||
**Q2**: Will logs be lost if the model fails to load?
|
||||
**A**:The `try-except` block ensures that if log redirection fails, it falls back to the default behavior without affecting model startup.
|
||||
|
||||
### Phase 2️⃣: Profiling Environment Activation
|
||||
#### 🚀 vLLM Launch
|
||||
```bash
|
||||
# ---- XPU runtime configuration ----
# Remove any stale dummy-event setting so real kernel events are recorded.
unset XPU_DUMMY_EVENT
# Expose all 8 XPU devices to the server process.
export XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export XPU_USE_MOE_SORTED_THRES=1
export XFT_USE_FAST_SWIGLU=1
export XMLIR_CUDNN_ENABLED=1
export XPU_USE_DEFAULT_CTX=1
# Force XPU Graph capture mode.
export XMLIR_FORCE_USE_XPU_GRAPH=1
export XPU_USE_FAST_SWIGLU=1
# Host IP used for distributed communication; echoed below for sanity.
export VLLM_HOST_IP=$(hostname -i)
echo "VLLM_HOST_IP: $VLLM_HOST_IP"

# Use the real torch.compile flow rather than the mock one.
export XMLIR_ENABLE_MOCK_TORCH_COMPILE=false

export XPUAPI_DEBUG=0x1 # Enable kernel performance logging
export XPURT_DISPATCH_MODE=PROFILING # Activate profiling mode

# ---- Launch the OpenAI-compatible vLLM server (tensor-parallel over 8 XPUs) ----
USE_ORI_ROPE=1 VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server \
    --host 0.0.0.0 \
    --port 8000 \
    --model /models/Qwen2.5-72B-Instruct \
    --gpu-memory-utilization 0.9 \
    --trust-remote-code \
    --max-model-len 32768 \
    --tensor-parallel-size 8 \
    --dtype float16 \
    --max_num_seqs 512 \
    --max_num_batched_tokens 32768 \
    --max-seq-len-to-capture 32768 \
    --block-size 128 \
    --no-enable-prefix-caching \
    --no-enable-chunked-prefill \
    --distributed-executor-backend mp \
    --served-model-name Qwen2.5-72B-Instruct \
    --compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun",
                          "vllm.unified_attention", "vllm.unified_attention_with_output",
                          "vllm.mamba_mixer2"]}' 2>&1 | tee output_p800.log
|
||||
|
||||
```
|
||||
|
||||
|
||||
#### 🚀 Client Load Testing
|
||||
```bash
|
||||
#!/bin/bash
# Drive the vLLM bench-serve client over a set of load configurations and
# collect per-run logs plus a CSV summary of latency/throughput metrics.

# Define test combinations array (concurrency x input length x output length)
TEST_COMBINATIONS=(
    "8x1024x1024" # Medium-low concurrency
)

# Create result directory (timestamped so repeated runs never collide)
RESULT_DIR="bench_$(date +%Y%m%d_%H%M)"
mkdir -p $RESULT_DIR

# Summary results file (CSV header written once, rows appended per run)
SUMMARY_FILE="$RESULT_DIR/summary_results.csv"
echo "num_prompts,input_len,output_len,throughput,latency_mean,latency_p50,latency_p90,latency_p99" >$SUMMARY_FILE

# Progress counter
TOTAL_TESTS=${#TEST_COMBINATIONS[@]}
CURRENT_TEST=0

# Loop through different test combinations
for COMBINATION in "${TEST_COMBINATIONS[@]}"; do
    # Parse combination parameters ("AxBxC" -> concurrency, input, output)
    NUM_PROMPTS=$(echo $COMBINATION | cut -d'x' -f1)
    INPUT_LEN=$(echo $COMBINATION | cut -d'x' -f2)
    OUTPUT_LEN=$(echo $COMBINATION | cut -d'x' -f3)

    # Update progress
    CURRENT_TEST=$((CURRENT_TEST + 1))

    echo "=========================================================="
    echo "Test progress: $CURRENT_TEST/$TOTAL_TESTS ($(printf "%.1f" $(echo "$CURRENT_TEST/$TOTAL_TESTS*100" | bc -l))%)"
    echo "Current test configuration: concurrency=$NUM_PROMPTS, input length=$INPUT_LEN, output length=$OUTPUT_LEN"
    echo "=========================================================="

    OUTPUT_FILE="$RESULT_DIR/p800_${NUM_PROMPTS}_${INPUT_LEN}_${OUTPUT_LEN}.log"

    # Run benchmark (server is assumed to already be listening on :8000)
    python3 -m vllm.entrypoints.cli.main bench serve \
        --host 127.0.0.1 \
        --port 8000 \
        --backend vllm \
        --model Qwen2.5-72B-Instruct \
        --dataset-name random \
        --num-prompts $NUM_PROMPTS \
        --random-input-len $INPUT_LEN \
        --random-output-len $OUTPUT_LEN \
        --tokenizer /ssd1/models/Qwen2.5-72B-Instruct \
        --ignore-eos 2>&1 | tee $OUTPUT_FILE

    # Wait 15 seconds to let the service recover
    echo "Waiting 15 seconds before the next round..."
    sleep 15

    # Extract key performance metrics from output and append to summary file
    # NOTE(review): these greps assume the bench output contains lines labelled
    # "Throughput" / "Mean latency" / "p50 latency" etc. — verify against the
    # actual vllm bench serve output format; unmatched greps yield empty fields.
    THROUGHPUT=$(grep "Throughput" $OUTPUT_FILE | awk '{print $2}')
    LATENCY_MEAN=$(grep "Mean latency" $OUTPUT_FILE | awk '{print $3}')
    LATENCY_P50=$(grep "p50 latency" $OUTPUT_FILE | awk '{print $3}')
    LATENCY_P90=$(grep "p90 latency" $OUTPUT_FILE | awk '{print $3}')
    LATENCY_P99=$(grep "p99 latency" $OUTPUT_FILE | awk '{print $3}')

    echo "$NUM_PROMPTS,$INPUT_LEN,$OUTPUT_LEN,$THROUGHPUT,$LATENCY_MEAN,$LATENCY_P50,$LATENCY_P90,$LATENCY_P99" >>$SUMMARY_FILE
done

# Output summary report
echo "=========================================================="
echo "Benchmark completed! Results saved in: $RESULT_DIR"
echo "=========================================================="
|
||||
|
||||
|
||||
```
|
||||
|
||||
### Phase 3️⃣: Log Analysis and Bottleneck Identification
|
||||
```text
|
||||
xpu_logs/
|
||||
├─ rank_0.log
|
||||
├─ rank_1.log
|
||||
├─ rank_2.log
|
||||
├─ rank_3.log
|
||||
├─ rank_4.log
|
||||
├─ rank_5.log
|
||||
├─ rank_6.log
|
||||
└─ rank_7.log
|
||||
|
||||
```
|
||||
#### 🔍 Script Workflow (op_log.py)
|
||||
**Input**:Raw Kernel Logs (Sample Format)
|
||||
```
|
||||
[XPURT_PROF] void xblas_xpu3::fc_cdnn_infer<float16,...> 123456 ns
|
||||
[XPURT_PROF] void kl3_all_reduce<float16> 987654 ns
|
||||
```
|
||||
**Processing logic**
|
||||
:::::{tab-set}
|
||||
::::{tab-item} op_log.py
|
||||
|
||||
|
||||
```python
|
||||
"""
|
||||
A better version of 'get_op_time.py', get more level dump and support kl3.
|
||||
|
||||
Usage: python3 get_kernel_time_ex.py --help
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
|
||||
# Per-generation clock scaling factors used when converting raw ns counts;
# index 0 -> kunlun1, 1 -> kunlun2, 2 -> kunlun3.
unit_factors = [0.9, 1.3, 1.45]  # kunlun1, kunlun2, kunlun3

# Regexes for the two [XPURT_PROF] log layouts. Raw strings avoid the
# invalid-escape warning that plain "\[" triggers on modern Python.
# patterns[0] (kunlun1): kernel name in group 1, time in group 2 (3rd field).
# patterns[1] (kunlun2/3): first field in group 1, second field in group 2.
patterns = [r"\[XPURT_PROF\] (\S+)\s+\S+\s+(\S+) ns", r"\[XPURT_PROF\] (\S+)\s+(\S+)\s+\S+ ns"]

# Number of leading spaces that represent one indentation level in the dump.
tab_space_num = 4
|
||||
|
||||
def get_total_time(res):
    """Return the sum of all accumulated per-op times stored in *res*.

    Args:
        res: mapping of op name -> accumulated time (numeric).

    Returns:
        The total as a float; 0.0 for an empty mapping.
    """
    # A float start value keeps the result type identical to the
    # original explicit-accumulator implementation.
    return sum(res.values(), 0.0)
|
||||
|
||||
def print_info_op(res, cnt, unit, op):
    """Print a one-line summary for *op*: "<op> <total_time/unit> <count>".

    Args:
        res: mapping of op name -> accumulated time (ns).
        cnt: mapping of op name -> invocation count.
        unit: divisor converting accumulated ns into ms (chip-scaled).
        op: label printed in front of the totals.

    Only entries whose name is accepted by the external ``c++filt``
    demangler contribute to the printed count.
    """
    total_time = get_total_time(res)
    total_cnt = 0
    # Iterate most-expensive-first, mirroring the detailed dump order.
    ordered = sorted(res.items(), key=lambda item: item[1], reverse=True)
    if sys.version_info.major == 2:
        import commands
        for name, _elapsed in ordered:
            status, demangled = commands.getstatusoutput("c++filt {}".format(name))
            if status == 0:
                # Demangled base name; unused here, kept for parity with
                # print_info_kernel's processing.
                formt_type = demangled.split('(')[0]
                total_cnt += cnt[name]
    elif sys.version_info.major == 3:
        import subprocess
        for name, _elapsed in ordered:
            status, demangled = subprocess.getstatusoutput("c++filt {}".format(name))
            if status == 0:
                formt_type = demangled.split('(')[0]
                total_cnt += cnt[name]
    print(f"{op} {total_time / unit} {total_cnt}")
|
||||
|
||||
def print_info_kernel(res, cnt, unit):
    """Print a whole-file kernel time table, most expensive ops first.

    Args:
        res: mapping of op (mangled kernel name) -> accumulated time (ns).
        cnt: mapping of op -> invocation count.
        unit: divisor converting accumulated ns into ms (chip-scaled).

    Emits a header, one aligned row per op (demangled name, count,
    time in ms, percentage of total), and a final total count.
    Rows whose name the external ``c++filt`` tool rejects are skipped.
    """
    total_time = get_total_time(res)
    total_cnt = 0
    print("Total time(ms) is {}".format(total_time / unit))
    # print detailed op time, sorted by accumulated time, descending
    lis=sorted(res.items(), key=lambda d:d[1], reverse=True)
    if sys.version_info.major == 2:
        # Python 2 path: narrower columns, legacy `commands` module.
        print("{:<90}{:<10}{:<15}{:<15}".format("Op type", "count", "time(ms)", "%"))
        import commands
        for i in range(len(lis)):
            # Demangle the C++ kernel symbol; skip the row on failure.
            (status, cmd_output) = commands.getstatusoutput("c++filt {}".format(lis[i][0]))
            if status == 0:
                # Keep only the function name, dropping the argument list.
                formt_type = (cmd_output.split('('))[0]
                print("{:<90}{:<10}{:<15}{:<15.5}".format(formt_type, cnt[lis[i][0]], lis[i][1] / unit, \
                    lis[i][1] / total_time * 100))
                total_cnt += cnt[lis[i][0]]
    elif sys.version_info.major == 3:
        # Python 3 path: wider columns to fit long templated names.
        print("{:<90}{:<10}{:<20}{:<20}".format("Op type", "count", "time(ms)", "%"))
        import subprocess
        for i in range(len(lis)):
            (status, cmd_output) = subprocess.getstatusoutput("c++filt {}".format(lis[i][0]))
            if status == 0:
                formt_type = (cmd_output.split('('))[0]
                # NOTE(review): row width (150) differs from the header (90);
                # columns will not line up for long names — confirm intended.
                print("{:<150}{:<10}{:<25}{:<20.5}".format(formt_type, cnt[lis[i][0]], lis[i][1] / unit, \
                    lis[i][1] / total_time * 100))
                total_cnt += cnt[lis[i][0]]

    print("Total count is {}".format(total_cnt))
|
||||
|
||||
def count_head_spaces(s: str) -> int:
    """Return the number of leading space characters in *s*.

    Only ' ' counts; any other character (including tabs) terminates
    the run, exactly like a character-by-character scan would.
    """
    return len(s) - len(s.lstrip(' '))
|
||||
|
||||
def process_line(lines, pattern1, unit_factor, dump_level):
    """Aggregate [XPURT_PROF] kernel times from *lines*, grouped by level.

    Args:
        lines: list of raw log lines (as returned by readlines()).
        pattern1: regex whose group(1) is the op name and group(2) the
            time value accumulated per op.
        unit_factor: A factor differentiated by KUNLUN1 and KUNLUN2/3;
            scaled by 1e6 so accumulated ns divide down to ms.
        dump_level: indentation level (in units of tab_space_num spaces)
            of the "gtest_" marker lines to aggregate between; -1 means
            no grouping — the whole file is summarised at the end via
            print_info_kernel.

    Returns:
        The residual {op: time} mapping accumulated since the last flush.
    """
    res = {}
    cnt = {}
    op = "init_op"
    unit = unit_factor * 1000 * 1000 # ns -> ms
    # While True, kernel lines are ignored until the next marker at the
    # requested level is seen.
    wait_next_one = False
    for i in range(len(lines)):
        cur_line = lines[i]
        if "gtest_" in cur_line:
            # Indentation depth of this marker, in tab units.
            # NOTE(review): true division yields a float, so a marker only
            # matches an integer dump_level when its indent divides evenly
            # by tab_space_num — confirm this is intended.
            cur_level = count_head_spaces(cur_line) / tab_space_num
            if cur_level == dump_level:
                # Marker at the target level: flush the buffer for the
                # previous op and start accumulating under the new label.
                wait_next_one = False
                print_info_op(res, cnt, unit, op)
                # clear buf
                res = {}
                cnt = {}
                op = cur_line.lstrip().rstrip()
            elif cur_level < dump_level:
                wait_next_one = True
                # skip record kernel time until next one
                continue
        if wait_next_one:
            # skip record kernel time
            continue
        match = re.match(pattern1, lines[i])
        if match:
            op_type = match.group(1)
            op_time = match.group(2)
            # Accumulate total time and hit count per op name.
            if op_type in res:
                res[op_type] += float(op_time)
                cnt[op_type] += 1
            else:
                res[op_type] = float(op_time)
                cnt[op_type] = 1

    # get left total time: flush whatever is still buffered at EOF
    if dump_level == -1:
        print_info_kernel(res, cnt, unit)
    else:
        print_info_op(res, cnt, unit, op)
    return res
|
||||
|
||||
def process_file(file_name, pattern2, unit_factor, dump_level = -1):
    """Open *file_name* and run the line-level profiler aggregation on it.

    Args:
        file_name: path of the profiling log to analyse.
        pattern2: regex used to extract (op, time) pairs from each line.
        unit_factor: chip-generation scaling factor (see unit_factors).
        dump_level: indentation level to aggregate at; -1 (default)
            produces a flat whole-file kernel summary.
    """
    with open(file_name, "r") as log_file:
        all_lines = log_file.readlines()
        # Result dict is not needed by callers; output goes to stdout.
        process_line(all_lines, pattern2, unit_factor, dump_level)
|
||||
|
||||
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()

    # Exactly one chip-generation flag may be given; no flag means
    # kunlun1 (index 0 in unit_factors).
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-xpu1', action='store_true', help='指定为 xpu1')
    group.add_argument('-xpu2', action='store_true', help='指定为 xpu2')
    group.add_argument('-xpu3', action='store_true', help='指定为 xpu3')
    parser.add_argument('--level', type=int, default=-1, help='指定 dump 缩进级别(默认为 -1)')

    parser.add_argument('filename', help='要处理的文件名')

    args = parser.parse_args()

    filename = args.filename
    # Map the chip flag onto an index into unit_factors/patterns.
    xpu_version = 0
    if args.xpu2:
        xpu_version = 1
    if args.xpu3:
        xpu_version = 2
    dump_level = args.level
    # Fix: echo the file actually being analysed — the previous f-string
    # contained no placeholder and printed a literal instead of the
    # argument (the documented output shows the real path here).
    print(f'Filename: {filename}')
    print(f'-xpu option: {xpu_version}')
    print(f'--level option: {dump_level}')

    unit_factor = unit_factors[xpu_version]
    # kunlun1 uses the first log layout; kunlun2/3 share the second.
    pattern_idx = 0
    if xpu_version > 0:
        pattern_idx = 1
    process_file(filename, patterns[pattern_idx], unit_factor, dump_level)
|
||||
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} op_log.sh
|
||||
|
||||
|
||||
|
||||
```bash
|
||||
|
||||
# Run the analyser (kunlun3 scaling) over every per-rank profiler log.
for i in {0..7}; do
    python op_log.py -xpu3 xpu_logs/rank_${i}.log > analysis_rank${i}.log
    echo "Rank ${i} 分析完成"
done

# Print the top summary lines of each rank's analysis (lines 2-6,
# skipping the leading "Filename:" echo).
for i in {0..7}; do
    echo "=== Rank $i ==="
    head -n 6 analysis_rank${i}.log | tail -n 5
done
|
||||
```
|
||||
::::
|
||||
:::::
|
||||
#### 📈 Output Example (analysis_rank0.log)
|
||||
```
|
||||
Filename: xpu_logs/rank_0.log
|
||||
-xpu option: 2
|
||||
--level option: -1
|
||||
Total time(ms) is 53742.29571862069
|
||||
Op type count time(ms) %
|
||||
void xblas_xpu3::fc_cdnn_infer<float16, float16, float16, float16, float, float, float, float, 1> 661569 22736.262780689656 42.306
|
||||
void kl3_all_reduce<float16> 176134 14782.525712413793 27.506
|
||||
void kl3_all_reduce_butterfly<float16> 164864 4197.28395862069 7.81
|
||||
```
|
||||
#### 🚨 Troubleshooting Guide
|
||||
|Symptom|Cause|Solution|
|
||||
|-|-|-|
|
||||
|`xpu_logs` directory is empty|XPUAPI_DEBUG not enabled|Verify that the environment variable is correctly set|
|
||||
|All 8 log files have identical content|Multi-process backend not activated|Ensure `--distributed-executor-backend mp` is specified|
|
||||
|Throughput drops >15%|Profiling overhead too high|Enable profiling only during analysis; disable in production|
|
||||
@@ -20,9 +20,8 @@ We will support the kunlun4 M100 platform in early 2026.
|
||||
|
||||
### 2. How to get our docker containers?
|
||||
|
||||
**base**:`docker pull iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.32`.
|
||||
**base**:`docker pull wjie520/vllm_kunlun:v0.0.1`.
|
||||
|
||||
**full**:`docker pull wjie520/vllm_kunlun:v0.0.1`.
|
||||
|
||||
### 3. How vllm-kunlun work with vLLM?
|
||||
|
||||
|
||||
@@ -16,9 +16,9 @@
|
||||
|
||||
<p style="text-align:center">
|
||||
<script async defer src="https://buttons.github.io/buttons.js"></script>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
|
||||
<a class="github-button" href="https://github.com/baidu/vLLM-Kunlun" data-show-count="true" data-size="large" aria-label="Star">Star</a>
|
||||
<a class="github-button" href="https://github.com/baidu/vLLM-Kunlun/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
|
||||
<a class="github-button" href="https://github.com/baidu/vLLM-Kunlun/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
|
||||
</p>
|
||||
:::
|
||||
|
||||
|
||||
@@ -17,9 +17,10 @@ docker run -itd \
|
||||
-v /usr/local/bin/:/usr/local/bin/ \
|
||||
-v /lib/x86_64-linux-gnu/libxpunvidia-ml.so.1:/lib/x86_64-linux-gnu/libxpunvidia-ml.so.1 \
|
||||
iregistry.baidu-int.com/hac_test/aiak-inference-llm:xpu_dev_20251113_221821 bash
|
||||
|
||||
|
||||
docker exec -it glm-vllm-01011 /bin/bash
|
||||
```
|
||||
|
||||
### Offline Inference on multi XPU
|
||||
|
||||
Start the server in a container:
|
||||
@@ -30,7 +31,7 @@ import os
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
def main():
|
||||
|
||||
|
||||
model_path = "/data/GLM-4.5"
|
||||
|
||||
llm_params = {
|
||||
@@ -50,7 +51,7 @@ def main():
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "你好,请问你是谁?"
|
||||
"text": "Hello, who are you?"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -68,8 +69,8 @@ def main():
|
||||
|
||||
response = outputs[0].outputs[0].text
|
||||
print("=" * 50)
|
||||
print("输入内容:", messages)
|
||||
print("模型回复:\n", response)
|
||||
print("Input content:", messages)
|
||||
print("Model response:\n", response)
|
||||
print("=" * 50)
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -83,12 +84,10 @@ If you run this script successfully, you can see the info shown below:
|
||||
|
||||
```bash
|
||||
==================================================
|
||||
输入内容: [{'role': 'user', 'content': [{'type': 'text', 'text': '你好,请问你是谁?'}]}]
|
||||
模型回复:
|
||||
Input content: [{'role': 'user', 'content': [{'type': 'text', 'text': 'Hello, who are you?'}]}]
|
||||
Model response:
|
||||
<think>
|
||||
嗯,用户问了一个相当身份的直接问题。这个问题看似简单,但背后可能
|
||||
有几种可能性意—ta或许初次测试我的可靠性,或者单纯想确认对话方。从AI助手的常见定位,用户给出清晰平的方式明确身份,同时为后续可能
|
||||
的留出生进行的空间。\n\n用户用“你”这个“您”,语气更倾向非正式交流,所以回复风格可以轻松些。不过既然是初次回复,保持适度的专业性比较好稳妥。提到
|
||||
Well, the user asked a rather direct question about identity. This question seems simple, but there could be several underlying intentions—perhaps they are testing my reliability for the first time, or they simply want to confirm the identity of the conversational partner. From the common positioning of AI assistants, the user has provided a clear and flat way to define identity while leaving room for potential follow-up questions.\n\nThe user used "you" instead of "your", which leans towards a more informal tone, so the response style can be a bit more relaxed. However, since this is the initial response, it is better to maintain a moderate level of professionalism. Mentioning
|
||||
==================================================
|
||||
```
|
||||
|
||||
@@ -114,8 +113,9 @@ python -m vllm.entrypoints.openai.api_server \
|
||||
--no-enable-chunked-prefill \
|
||||
--distributed-executor-backend mp \
|
||||
--served-model-name GLM-4.5 \
|
||||
--compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", "vllm.unified_attention", "vllm.unified_attention_with_output", "vllm.mamba_mixer2"]}' > log_glm_plugin.txt 2>&1 &
|
||||
--compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun", "vllm.unified_attention", "vllm.unified_attention_with_output", "vllm.mamba_mixer2"]}' > log_glm_plugin.txt 2>&1 &
|
||||
```
|
||||
|
||||
If your service start successfully, you can see the info shown below:
|
||||
|
||||
```bash
|
||||
@@ -132,7 +132,7 @@ curl http://localhost:8989/v1/chat/completions \
|
||||
-d '{
|
||||
"model": "GLM-4.5",
|
||||
"messages": [
|
||||
{"role": "user", "content": "你好,请问你是谁?"}
|
||||
{"role": "user", "content": "Hello, who are you?"}
|
||||
],
|
||||
"max_tokens": 100,
|
||||
"temperature": 0.7
|
||||
@@ -142,7 +142,7 @@ curl http://localhost:8989/v1/chat/completions \
|
||||
If you query the server successfully, you can see the info shown below (client):
|
||||
|
||||
```bash
|
||||
{"id":"chatcmpl-6af7318de7394bc4ae569e6324a162fa","object":"chat.completion","created":1763101638,"model":"GLM-4.5","choices":[{"index":0,"message":{"role":"assistant","content":"\n<think>用户问“你好,请问你是谁?”,这是一个应该是个了解我的身份。首先,我需要确认用户的需求是什么。可能他们是第一次使用这个服务,或者之前没有接触过类似的AI助手,所以想确认我的背景和能力。 \n\n接下来,我要确保回答清晰明了,同时友好关键点:我是谁,由谁开发,能做什么。需要避免使用专业术语,保持口语化,让不同容易理解。 \n\n然后,用户可能有潜在的需求,比如想了解我能","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning_content":null},"logprobs":null,"finish_reason":"length","stop_reason":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":11,"total_tokens":111,"completion_tokens":100,"prompt_tokens_details":null},"prompt_logprobs":null,"kv_tr
|
||||
{"id":"chatcmpl-6af7318de7394bc4ae569e6324a162fa","object":"chat.completion","created":1763101638,"model":"GLM-4.5","choices":[{"index":0,"message":{"role":"assistant","content":"\n<think>The user asked, \"Hello, who are you?\" This is a question about my identity. First, I need to confirm the user's intent. They might be using this service for the first time or have never interacted with similar AI assistants before, so they want to know my background and capabilities.\n\nNext, I should ensure my answer is clear and friendly, focusing on key points: who I am, who developed me, and what I can do. I should avoid technical jargon and keep the response conversational so it's easy to understand.\n\nAdditionally, the user may have potential needs, such as wanting to know what I am capable of.","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning_content":null},"logprobs":null,"finish_reason":"length","stop_reason":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":11,"total_tokens":111,"completion_tokens":100,"prompt_tokens_details":null},"prompt_logprobs":null,"kv_tr
|
||||
```
|
||||
|
||||
Logs of the vllm server:
|
||||
@@ -150,4 +150,4 @@ Logs of the vllm server:
|
||||
```bash
|
||||
(APIServer pid=54567) INFO: 127.0.0.1:60338 - "POST /v1/completions HTTP/1.1" 200 OK
|
||||
(APIServer pid=54567) INFO 11-13 14:35:48 [loggers.py:123] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 0.7 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
|
||||
```
|
||||
```
|
||||
|
||||
@@ -16,7 +16,7 @@ if [ $XPU_NUM -gt 0 ]; then
|
||||
DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl"
|
||||
fi
|
||||
|
||||
export build_image="xxxxxxxxxxxxxxxxx"
|
||||
export build_image="xxxxxxxxxxxxxxxxx"
|
||||
|
||||
docker run -itd ${DOCKER_DEVICE_CONFIG} \
|
||||
--net=host \
|
||||
@@ -58,7 +58,7 @@ def main():
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "说个笑话"
|
||||
"text": "tell a joke"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -76,8 +76,8 @@ def main():
|
||||
|
||||
response = outputs[0].outputs[0].text
|
||||
print("=" * 50)
|
||||
print("输入内容:", messages)
|
||||
print("模型回复:\n", response)
|
||||
print("Input content:", messages)
|
||||
print("Model response:\n", response)
|
||||
print("=" * 50)
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -91,16 +91,18 @@ If you run this script successfully, you can see the info shown below:
|
||||
|
||||
```bash
|
||||
==================================================
|
||||
输入内容: [{'role': 'user', 'content': [{'type': 'text', 'text': '说个笑话'}]}]
|
||||
模型回复:
|
||||
Input content: [{'role': 'user', 'content': [{'type': 'text', 'text': 'tell a joke'}]}]
|
||||
Model response:
|
||||
<think>
|
||||
好的,用户让我讲个笑话。首先,我需要考虑用户的需求。他们可能只是想轻松一下,或者需要一些娱乐。接下来,我要选择一个适合的笑话,不要太复杂,容易理解,同时也要有趣味性。
|
||||
|
||||
用户可能希望笑话是中文的,所以我要确保笑话符合中文的语言习惯和文化背景。我需要避免涉及敏感话题,比如政治、宗教或者可能引起误解的内容。然后,我得考虑笑话的结构,通常是一个设置和一个出人意料的结尾,这样能带来笑点。
|
||||
Okay, the user asked me to tell a joke. First, I need to consider the user's needs. They might just want to relax or need some entertainment. Next, I need to choose a suitable joke that is not too complicated, easy to understand, and also interesting.
|
||||
|
||||
例如,可以讲一个关于日常生活的小幽默,比如动物或者常见的场景。比如,一只乌龟和兔子赛跑的故事,但加入一些反转。不过要确保笑话的长度适中,不要太长,以免用户失去兴趣。另外,要注意用词口语化,避免生硬或复杂的句子结构。
|
||||
|
||||
可能还要检查一下这个笑话是否常见,避免重复。如果用户之前听过类似的,可能需要
|
||||
The user might expect the joke to be in Chinese, so I need to ensure that the joke conforms to the language habits and cultural background of Chinese. I need to avoid sensitive topics, such as politics, religion, or anything that might cause misunderstanding. Then, I have to consider the structure of the joke, which usually involves a setup and an unexpected ending to create humor.
|
||||
|
||||
For example, I could tell a light-hearted story about everyday life, such as animals or common scenarios. For instance, the story of a turtle and a rabbit racing, but with a twist. However, I need to ensure that the joke is of moderate length and not too long, so the user doesn't lose interest. Additionally, I should pay attention to using colloquial language and avoid stiff or complex sentence structures.
|
||||
|
||||
I might also need to check if this joke is common to avoid repetition. If the user has heard something similar before, I may need to come up with a different angle.
|
||||
==================================================
|
||||
```
|
||||
|
||||
@@ -130,6 +132,7 @@ python -m vllm.entrypoints.openai.api_server \
|
||||
"vllm.unified_attention", "vllm.unified_attention_with_output",
|
||||
"vllm.mamba_mixer2"]}' \
|
||||
```
|
||||
|
||||
If your service start successfully, you can see the info shown below:
|
||||
|
||||
```bash
|
||||
@@ -162,4 +165,4 @@ Logs of the vllm server:
|
||||
```bash
|
||||
(APIServer pid=54567) INFO: 127.0.0.1:60338 - "POST /v1/completions HTTP/1.1" 200 OK
|
||||
(APIServer pid=54567) INFO 11-13 14:35:48 [loggers.py:123] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 0.7 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
|
||||
```
|
||||
```
|
||||
|
||||
@@ -14,4 +14,4 @@ vllm-kunlun uses the following environment variables to configure the system:
|
||||
| `export XMLIR_FORCE_USE_XPU_GRAPH` | `1` | ***\*Forces the enablement of XPU Graph mode.\****. This can capture and optimize the model execution graph, significantly boosting inference performance. |
|
||||
| `export VLLM_HOST_IP` | `$(hostname -i)` | ***\*Sets the host IP address for the vLLM service\****. This uses a shell command to dynamically get the current host's internal IP. It's used for inter-node communication in a distributed environment. |
|
||||
| `export XMLIR_ENABLE_MOCK_TORCH_COMPILE` | `false` | ***\*Disable Mock Torch Compile Function\****. Set to `false` to ensure the actual compilation and optimization flow is used, rather than mock mode. |
|
||||
| `USE_ORI_ROPE` | `1` | ***\*Control whether to use the original RoPE (Rotate Position Encoding) implementation\****. Default is `1` (use original/standard RoPE). Setting to `0` may be used to enable QWEN3 (possibly the specific quantization or optimization technique of KunlunCore), but this requires specific model support. |
|
||||
| `FUSED_QK_ROPE_OP` | `0` | ***\*Control whether to use the Fused QK-Norm and RoPE implementation\****. Default is `0` (use original/standard RoPE). Setting to `1` may be used to enable QWEN3. |
|
||||
Reference in New Issue
Block a user