[Doc] Add a perf tune section (#5127)

### What this PR does / why we need it? This patch aims to: 1. add an OS-level optimization section to the performance tuning doc; 2. set some default environment variables in the image for performance. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-12-19 14:52:52 +08:00
parent a6eaf816f1
commit 5ab6d124e5
7 changed files with 121 additions and 12 deletions
--- a/8
+++ b/8
@@ -23,7 +23,9 @@ ARG SOC_VERSION="ascend910b1"

 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
-ENV SOC_VERSION=$SOC_VERSION
+ENV SOC_VERSION=$SOC_VERSION \
+    TASK_QUEUE_ENABLE=1 \
+    OMP_NUM_THREADS=1

 WORKDIR /workspace

@@ -31,7 +33,7 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install Mooncake dependencies
 RUN apt-get update -y && \
-    apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
    git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
    cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
    cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
@@ -66,4 +68,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

+# Preload jemalloc in shells that source ~/.bashrc. The `\$` escapes keep the
+# ${LD_PRELOAD...} expansion literal in ~/.bashrc, so it is evaluated when the
+# file is sourced (preserving any LD_PRELOAD set at run time) instead of being
+# expanded at image build time, and no trailing ':' is written when it is empty.
+RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2\${LD_PRELOAD:+:\$LD_PRELOAD}" >> ~/.bashrc
+
 CMD ["/bin/bash"]
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -22,10 +22,13 @@ ARG SOC_VERSION="ascend310p1"

 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
-ENV SOC_VERSION=$SOC_VERSION
+ENV SOC_VERSION=$SOC_VERSION \
+    TASK_QUEUE_ENABLE=1 \
+    OMP_NUM_THREADS=1
+

 RUN apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
    rm -rf /var/cache/apt/* && \
    rm -rf /var/lib/apt/lists/*

@@ -58,4 +61,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

+# Preload jemalloc in shells that source ~/.bashrc. The `\$` escapes keep the
+# ${LD_PRELOAD...} expansion literal in ~/.bashrc, so it is evaluated when the
+# file is sourced (preserving any LD_PRELOAD set at run time) instead of being
+# expanded at image build time, and no trailing ':' is written when it is empty.
+RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2\${LD_PRELOAD:+:\$LD_PRELOAD}" >> ~/.bashrc
+
 CMD ["/bin/bash"]
--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -20,10 +20,12 @@ FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG SOC_VERSION="ascend310p1"

-ENV SOC_VERSION=$SOC_VERSION
+ENV SOC_VERSION=$SOC_VERSION \
+    TASK_QUEUE_ENABLE=1 \
+    OMP_NUM_THREADS=1

 RUN yum update -y && \
-    yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
+    yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
    rm -rf /var/cache/yum

 RUN pip config set global.index-url ${PIP_INDEX_URL}
@@ -55,4 +57,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

+# Preload jemalloc in shells that source ~/.bashrc. The `\$` escapes keep the
+# ${LD_PRELOAD...} expansion literal in ~/.bashrc, so it is evaluated when the
+# file is sourced (preserving any LD_PRELOAD set at run time) instead of being
+# expanded at image build time, and no trailing ':' is written when it is empty.
+RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2\${LD_PRELOAD:+:\$LD_PRELOAD}" >> ~/.bashrc
+
 CMD ["/bin/bash"]
--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -24,7 +24,9 @@ ARG SOC_VERSION="ascend910_9391"
 COPY . /vllm-workspace/vllm-ascend/
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
-ENV SOC_VERSION=$SOC_VERSION
+ENV SOC_VERSION=$SOC_VERSION \
+    TASK_QUEUE_ENABLE=1 \
+    OMP_NUM_THREADS=1

 RUN pip config set global.index-url ${PIP_INDEX_URL}

@@ -32,7 +34,7 @@ WORKDIR /workspace

 # Install Mooncake dependencies
 RUN apt-get update -y && \
-    apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
    git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
    cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
    cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
@@ -65,4 +67,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

+# Preload jemalloc in shells that source ~/.bashrc. The `\$` escapes keep the
+# ${LD_PRELOAD...} expansion literal in ~/.bashrc, so it is evaluated when the
+# file is sourced (preserving any LD_PRELOAD set at run time) instead of being
+# expanded at image build time, and no trailing ':' is written when it is empty.
+RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2\${LD_PRELOAD:+:\$LD_PRELOAD}" >> ~/.bashrc
+
 CMD ["/bin/bash"]
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -21,7 +21,9 @@ ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG MOONCAKE_TAG="v0.3.7.post2"
 ARG SOC_VERSION="ascend910_9391"

-ENV SOC_VERSION=$SOC_VERSION
+ENV SOC_VERSION=$SOC_VERSION \
+    TASK_QUEUE_ENABLE=1 \
+    OMP_NUM_THREADS=1

 RUN pip config set global.index-url ${PIP_INDEX_URL}

@@ -32,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 SHELL ["/bin/bash", "-c"]

 RUN yum update -y && \
-    yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
+    yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
    git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
    cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
    ARCH=$(uname -m) && \
@@ -68,4 +70,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

+# Preload jemalloc in shells that source ~/.bashrc. The `\$` escapes keep the
+# ${LD_PRELOAD...} expansion literal in ~/.bashrc, so it is evaluated when the
+# file is sourced (preserving any LD_PRELOAD set at run time) instead of being
+# expanded at image build time, and no trailing ':' is written when it is empty.
+RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2\${LD_PRELOAD:+:\$LD_PRELOAD}" >> ~/.bashrc
+
 CMD ["/bin/bash"]
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -21,7 +21,9 @@ ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG MOONCAKE_TAG="v0.3.7.post2"
 ARG SOC_VERSION="ascend910b1"

-ENV SOC_VERSION=$SOC_VERSION
+ENV SOC_VERSION=$SOC_VERSION \
+    TASK_QUEUE_ENABLE=1 \
+    OMP_NUM_THREADS=1

 RUN pip config set global.index-url ${PIP_INDEX_URL}

@@ -32,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 SHELL ["/bin/bash", "-c"]

 RUN yum update -y && \
-    yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
+    yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
    git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
    cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
    ARCH=$(uname -m) && \
@@ -68,4 +70,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

+# Preload jemalloc in shells that source ~/.bashrc. The `\$` escapes keep the
+# ${LD_PRELOAD...} expansion literal in ~/.bashrc, so it is evaluated when the
+# file is sourced (preserving any LD_PRELOAD set at run time) instead of being
+# expanded at image build time, and no trailing ':' is written when it is empty.
+RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2\${LD_PRELOAD:+:\$LD_PRELOAD}" >> ~/.bashrc
+
 CMD ["/bin/bash"]
--- a/docs/source/developer_guide/performance_and_debug/optimization_and_tuning.md
+++ b/docs/source/developer_guide/performance_and_debug/optimization_and_tuning.md
@@ -182,3 +182,87 @@ Plus, there are more features for performance optimization in specific scenarios
 - `HCCL_RDMA_TC`: Use this var to configure traffic class of RDMA NIC. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0045.html).
 - `HCCL_RDMA_SL`: Use this var to configure service level of RDMA NIC. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0046.html).
 - `HCCL_BUFFSIZE`: Use this var to control the cache size for sharing data between two NPUs. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0047.html).
+
+### 5. OS Optimization
+
+This section describes operating system–level optimizations applied on the host machine (bare metal or Kubernetes node) to improve performance stability, latency, and throughput for inference workloads.
+
+:::{note}
+These settings must be applied on the host OS with root privileges, not inside containers.
+:::
+
+#### 5.1 Set CPU Frequency Governor to `performance`
+
+```shell
+echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+```
+
+Purpose
+- Forces all CPU cores to run under the `performance` governor
+- Disables dynamic frequency scaling (e.g., `ondemand`, `powersave`)
+
+Benefits
+- Keeps CPU cores at maximum frequency
+- Reduces latency jitter
+- Improves predictability for inference workloads
+
+#### 5.2 Minimize Swap Usage
+
+```shell
+sysctl -w vm.swappiness=0
+```
+
+Purpose
+
+- Minimizes the kernel’s tendency to swap memory pages to disk
+
+Benefits
+
+- Prevents severe latency spikes caused by swapping
+- Improves stability for large in-memory models
+
+Notes
+- For inference workloads, swapping can introduce latency spikes on the order of seconds
+- Recommended values are `0` or `1`
+
+#### 5.3 Disable Automatic NUMA Balancing
+
+```shell
+sysctl -w kernel.numa_balancing=0
+```
+
+Purpose
+
+- Disables the kernel’s automatic NUMA page migration mechanism
+
+Benefits
+
+- Prevents background memory page migrations
+- Reduces unpredictable memory access latency
+- Improves performance stability on NUMA systems
+
+Recommended For
+- Multi-socket servers
+- Ascend / NPU deployments with explicit NUMA binding
+- Systems with manually managed CPU and memory affinity
+
+#### 5.4 Increase Scheduler Migration Cost
+
+```shell
+sysctl -w kernel.sched_migration_cost_ns=5000000
+```
+
+Purpose
+- Increases the cost for the scheduler to migrate tasks between CPU cores
+
+Benefits
+- Reduces frequent thread migration
+- Improves CPU cache locality
+- Lowers latency jitter for inference workloads
+
+Parameter Details
+- Unit: nanoseconds (ns)
+- The upstream kernel default is 500000; only values above the default (for example 1000000–5000000) actually increase the migration cost
+- Higher values encourage threads to stay on the same CPU core