[Doc] Add a perf tune section (#5127)

### What this PR does / why we need it?
This patch aims to:
1. Add an OS-level optimization section to the performance tuning doc
2. Set some default environment variables in the images for better performance

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-12-19 14:52:52 +08:00
committed by GitHub
parent a6eaf816f1
commit 5ab6d124e5
7 changed files with 121 additions and 12 deletions

View File

@@ -23,7 +23,9 @@ ARG SOC_VERSION="ascend910b1"
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION=$SOC_VERSION
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
WORKDIR /workspace
@@ -31,7 +33,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install Mooncake dependencies
RUN apt-get update -y && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
@@ -66,4 +68,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Preload jemalloc in interactive bash sessions. $(uname -m) is resolved at
# build time to select the multiarch library directory. The LD_PRELOAD
# reference is escaped so it is written literally and expanded when the shell
# starts; this keeps any LD_PRELOAD supplied at `docker run` time instead of
# freezing the (normally empty) build-time value, and avoids a trailing colon.
# NOTE(review): ~/.bashrc is only read by interactive bash shells, so
# non-interactive entrypoints will not pick up this preload.
RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2\${LD_PRELOAD:+:\$LD_PRELOAD}" >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -22,10 +22,13 @@ ARG SOC_VERSION="ascend310p1"
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION=$SOC_VERSION
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
@@ -58,4 +61,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Preload jemalloc in interactive bash sessions. $(uname -m) is resolved at
# build time to select the multiarch library directory. The LD_PRELOAD
# reference is escaped so it is written literally and expanded when the shell
# starts; this keeps any LD_PRELOAD supplied at `docker run` time instead of
# freezing the (normally empty) build-time value, and avoids a trailing colon.
# NOTE(review): ~/.bashrc is only read by interactive bash shells, so
# non-interactive entrypoints will not pick up this preload.
RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2\${LD_PRELOAD:+:\$LD_PRELOAD}" >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -20,10 +20,12 @@ FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG SOC_VERSION="ascend310p1"
ENV SOC_VERSION=$SOC_VERSION
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
@@ -55,4 +57,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Preload jemalloc in interactive bash sessions. Single quotes keep the
# LD_PRELOAD reference literal so it is expanded when the shell starts, not at
# build time; this keeps any LD_PRELOAD supplied at `docker run` time and
# avoids a trailing colon when none is set.
# NOTE(review): ~/.bashrc is only read by interactive bash shells, so
# non-interactive entrypoints will not pick up this preload.
RUN echo 'export LD_PRELOAD=/usr/lib64/libjemalloc.so.2${LD_PRELOAD:+:$LD_PRELOAD}' >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -24,7 +24,9 @@ ARG SOC_VERSION="ascend910_9391"
COPY . /vllm-workspace/vllm-ascend/
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION=$SOC_VERSION
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
RUN pip config set global.index-url ${PIP_INDEX_URL}
@@ -32,7 +34,7 @@ WORKDIR /workspace
# Install Mooncake dependencies
RUN apt-get update -y && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
@@ -65,4 +67,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Preload jemalloc in interactive bash sessions. $(uname -m) is resolved at
# build time to select the multiarch library directory. The LD_PRELOAD
# reference is escaped so it is written literally and expanded when the shell
# starts; this keeps any LD_PRELOAD supplied at `docker run` time instead of
# freezing the (normally empty) build-time value, and avoids a trailing colon.
# NOTE(review): ~/.bashrc is only read by interactive bash shells, so
# non-interactive entrypoints will not pick up this preload.
RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2\${LD_PRELOAD:+:\$LD_PRELOAD}" >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -21,7 +21,9 @@ ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG MOONCAKE_TAG="v0.3.7.post2"
ARG SOC_VERSION="ascend910_9391"
ENV SOC_VERSION=$SOC_VERSION
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
RUN pip config set global.index-url ${PIP_INDEX_URL}
@@ -32,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
SHELL ["/bin/bash", "-c"]
RUN yum update -y && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
ARCH=$(uname -m) && \
@@ -68,4 +70,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Preload jemalloc in interactive bash sessions. Single quotes keep the
# LD_PRELOAD reference literal so it is expanded when the shell starts, not at
# build time; this keeps any LD_PRELOAD supplied at `docker run` time and
# avoids a trailing colon when none is set.
# NOTE(review): ~/.bashrc is only read by interactive bash shells, so
# non-interactive entrypoints will not pick up this preload.
RUN echo 'export LD_PRELOAD=/usr/lib64/libjemalloc.so.2${LD_PRELOAD:+:$LD_PRELOAD}' >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -21,7 +21,9 @@ ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG MOONCAKE_TAG="v0.3.7.post2"
ARG SOC_VERSION="ascend910b1"
ENV SOC_VERSION=$SOC_VERSION
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
RUN pip config set global.index-url ${PIP_INDEX_URL}
@@ -32,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
SHELL ["/bin/bash", "-c"]
RUN yum update -y && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
ARCH=$(uname -m) && \
@@ -68,4 +70,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Preload jemalloc in interactive bash sessions. Single quotes keep the
# LD_PRELOAD reference literal so it is expanded when the shell starts, not at
# build time; this keeps any LD_PRELOAD supplied at `docker run` time and
# avoids a trailing colon when none is set.
# NOTE(review): ~/.bashrc is only read by interactive bash shells, so
# non-interactive entrypoints will not pick up this preload.
RUN echo 'export LD_PRELOAD=/usr/lib64/libjemalloc.so.2${LD_PRELOAD:+:$LD_PRELOAD}' >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -182,3 +182,87 @@ Plus, there are more features for performance optimization in specific scenarios
- `HCCL_RDMA_TC`: Use this var to configure traffic class of RDMA NIC. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0045.html).
- `HCCL_RDMA_SL`: Use this var to configure service level of RDMA NIC. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0046.html).
- `HCCL_BUFFSIZE`: Use this var to control the cache size for sharing data between two NPUs. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0047.html).
### 5. OS Optimization
This section describes operating-system-level optimizations applied on the host machine (bare metal or Kubernetes node) to improve performance stability, latency, and throughput for inference workloads.
:::{note}
These settings must be applied on the host OS with root privileges, not inside containers.
:::
#### 5.1 Set CPU Frequency Governor to `performance`
```shell
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
```
Purpose
- Forces all CPU cores to run under the `performance` governor
- Disables dynamic frequency scaling (e.g., `ondemand`, `powersave`)
Benefits
- Keeps CPU cores at maximum frequency
- Reduces latency jitter
- Improves predictability for inference workloads
#### 5.2 Minimize Swap Usage
```shell
sysctl -w vm.swappiness=0
```
Purpose
- Minimizes the kernel's tendency to swap memory pages to disk
Benefits
- Prevents severe latency spikes caused by swapping
- Improves stability for large in-memory models
Notes
- For inference workloads, swapping can introduce latency on the order of seconds
- Recommended values are `0` or `1`
#### 5.3 Disable Automatic NUMA Balancing
```shell
sysctl -w kernel.numa_balancing=0
```
Purpose
- Disables the kernel's automatic NUMA page migration mechanism
Benefits
- Prevents background memory page migrations
- Reduces unpredictable memory access latency
- Improves performance stability on NUMA systems
Recommended For
- Multi-socket servers
- Ascend / NPU deployments with explicit NUMA binding
- Systems with manually managed CPU and memory affinity
#### 5.4 Increase Scheduler Migration Cost
```shell
sysctl -w kernel.sched_migration_cost_ns=50000
```
Purpose
- Increases the cost for the scheduler to migrate tasks between CPU cores
Benefits
- Reduces frequent thread migration
- Improves CPU cache locality
- Lowers latency jitter for inference workloads
Parameter Details
- Unit: nanoseconds (ns)
- Typical recommended range: 50000-100000
- Higher values encourage threads to stay on the same CPU core