[Doc] Add a perf tune section (#5127)
### What this PR does / why we need it?
This patch aims to:
1. Add an OS-level optimization section to the performance tuning doc
2. Set some default environment variables in the images for performance
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -23,7 +23,9 @@ ARG SOC_VERSION="ascend910b1"
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV SOC_VERSION=$SOC_VERSION
|
||||
ENV SOC_VERSION=$SOC_VERSION \
|
||||
TASK_QUEUE_ENABLE=1 \
|
||||
OMP_NUM_THREADS=1
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
@@ -31,7 +33,7 @@ COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
# Install Mooncake dependencies
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
||||
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
|
||||
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
|
||||
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
|
||||
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
|
||||
@@ -66,4 +68,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -22,10 +22,13 @@ ARG SOC_VERSION="ascend310p1"
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV SOC_VERSION=$SOC_VERSION
|
||||
ENV SOC_VERSION=$SOC_VERSION \
|
||||
TASK_QUEUE_ENABLE=1 \
|
||||
OMP_NUM_THREADS=1
|
||||
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
|
||||
rm -rf /var/cache/apt/* && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
@@ -58,4 +61,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -20,10 +20,12 @@ FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11
|
||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG SOC_VERSION="ascend310p1"
|
||||
|
||||
ENV SOC_VERSION=$SOC_VERSION
|
||||
ENV SOC_VERSION=$SOC_VERSION \
|
||||
TASK_QUEUE_ENABLE=1 \
|
||||
OMP_NUM_THREADS=1
|
||||
|
||||
RUN yum update -y && \
|
||||
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
|
||||
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
|
||||
rm -rf /var/cache/yum
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
@@ -55,4 +57,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -24,7 +24,9 @@ ARG SOC_VERSION="ascend910_9391"
|
||||
COPY . /vllm-workspace/vllm-ascend/
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV SOC_VERSION=$SOC_VERSION
|
||||
ENV SOC_VERSION=$SOC_VERSION \
|
||||
TASK_QUEUE_ENABLE=1 \
|
||||
OMP_NUM_THREADS=1
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
@@ -32,7 +34,7 @@ WORKDIR /workspace
|
||||
|
||||
# Install Mooncake dependencies
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
||||
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
|
||||
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
|
||||
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
|
||||
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
|
||||
@@ -65,4 +67,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -21,7 +21,9 @@ ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG MOONCAKE_TAG="v0.3.7.post2"
|
||||
ARG SOC_VERSION="ascend910_9391"
|
||||
|
||||
ENV SOC_VERSION=$SOC_VERSION
|
||||
ENV SOC_VERSION=$SOC_VERSION \
|
||||
TASK_QUEUE_ENABLE=1 \
|
||||
OMP_NUM_THREADS=1
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
@@ -32,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
RUN yum update -y && \
|
||||
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
|
||||
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
|
||||
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
|
||||
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
|
||||
ARCH=$(uname -m) && \
|
||||
@@ -68,4 +70,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -21,7 +21,9 @@ ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG MOONCAKE_TAG="v0.3.7.post2"
|
||||
ARG SOC_VERSION="ascend910b1"
|
||||
|
||||
ENV SOC_VERSION=$SOC_VERSION
|
||||
ENV SOC_VERSION=$SOC_VERSION \
|
||||
TASK_QUEUE_ENABLE=1 \
|
||||
OMP_NUM_THREADS=1
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
@@ -32,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
RUN yum update -y && \
|
||||
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
|
||||
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
|
||||
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
|
||||
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
|
||||
ARCH=$(uname -m) && \
|
||||
@@ -68,4 +70,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -182,3 +182,87 @@ Plus, there are more features for performance optimization in specific scenarios
|
||||
- `HCCL_RDMA_TC`: Use this var to configure traffic class of RDMA NIC. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0045.html).
|
||||
- `HCCL_RDMA_SL`: Use this var to configure service level of RDMA NIC. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0046.html).
|
||||
- `HCCL_BUFFSIZE`: Use this var to control the cache size for sharing data between two NPUs. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0047.html).
|
||||
|
||||
### 5. OS Optimization
|
||||
|
||||
This section describes operating system–level optimizations applied on the host machine (bare metal or Kubernetes node) to improve performance stability, latency, and throughput for inference workloads.
|
||||
|
||||
:::{note}
|
||||
These settings must be applied on the host OS with root privileges, not inside containers.
|
||||
:::
|
||||
|
||||
#### 5.1 Set CPU Frequency Governor to `performance`
|
||||
|
||||
```shell
|
||||
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
|
||||
```
|
||||
|
||||
Purpose
|
||||
- Forces all CPU cores to run under the `performance` governor
|
||||
- Disables dynamic frequency scaling (e.g., `ondemand`, `powersave`)
|
||||
|
||||
Benefits
|
||||
- Keeps CPU cores at maximum frequency
|
||||
- Reduces latency jitter
|
||||
- Improves predictability for inference workloads
|
||||
|
||||
#### 5.2 Minimize Swap Usage
|
||||
|
||||
```shell
|
||||
sysctl -w vm.swappiness=0
|
||||
```
|
||||
|
||||
Purpose
|
||||
|
||||
- Minimizes the kernel’s tendency to swap memory pages to disk
|
||||
|
||||
Benefits
|
||||
|
||||
- Prevents severe latency spikes caused by swapping
|
||||
- Improves stability for large in-memory models
|
||||
|
||||
Notes
|
||||
- For inference workloads, swapping can introduce latency spikes on the order of seconds
|
||||
- Recommended values are `0` or `1`
|
||||
|
||||
#### 5.3 Disable Automatic NUMA Balancing
|
||||
|
||||
```shell
|
||||
sysctl -w kernel.numa_balancing=0
|
||||
```
|
||||
|
||||
Purpose
|
||||
|
||||
- Disables the kernel’s automatic NUMA page migration mechanism
|
||||
|
||||
Benefits
|
||||
|
||||
- Prevents background memory page migrations
|
||||
- Reduces unpredictable memory access latency
|
||||
- Improves performance stability on NUMA systems
|
||||
|
||||
Recommended For
|
||||
- Multi-socket servers
|
||||
- Ascend / NPU deployments with explicit NUMA binding
|
||||
- Systems with manually managed CPU and memory affinity
|
||||
|
||||
#### 5.4 Increase Scheduler Migration Cost
|
||||
|
||||
```shell
|
||||
sysctl -w kernel.sched_migration_cost_ns=50000
|
||||
```
|
||||
|
||||
Purpose
|
||||
- Increases the cost for the scheduler to migrate tasks between CPU cores
|
||||
|
||||
Benefits
|
||||
- Reduces frequent thread migration
|
||||
- Improves CPU cache locality
|
||||
- Lowers latency jitter for inference workloads
|
||||
|
||||
Parameter Details
|
||||
- Unit: nanoseconds (ns)
|
||||
- Typical recommended range: 50000–100000
|
||||
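- Higher values encourage threads to stay on the same CPU core; check the current setting first with `sysctl kernel.sched_migration_cost_ns`, since the value above only raises the migration cost if the current setting is lower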
|
||||
|
||||
Reference in New Issue
Block a user