diff --git a/Dockerfile b/Dockerfile index bff22191..11f38018 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,7 +23,9 @@ ARG SOC_VERSION="ascend910b1" # Define environments ENV DEBIAN_FRONTEND=noninteractive -ENV SOC_VERSION=$SOC_VERSION +ENV SOC_VERSION=$SOC_VERSION \ + TASK_QUEUE_ENABLE=1 \ + OMP_NUM_THREADS=1 WORKDIR /workspace @@ -31,7 +33,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install Mooncake dependencies RUN apt-get update -y && \ - apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \ + apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \ git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \ cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \ cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \ @@ -66,4 +68,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \ python3 -m pip cache purge +RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc + CMD ["/bin/bash"] diff --git a/Dockerfile.310p b/Dockerfile.310p index 44476d87..9f9072bd 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -22,10 +22,13 @@ ARG SOC_VERSION="ascend310p1" # Define environments ENV DEBIAN_FRONTEND=noninteractive -ENV SOC_VERSION=$SOC_VERSION +ENV SOC_VERSION=$SOC_VERSION \ + TASK_QUEUE_ENABLE=1 \ + OMP_NUM_THREADS=1 + RUN apt-get update -y && \ - apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \ rm -rf /var/cache/apt/* && \ rm -rf /var/lib/apt/lists/* @@ -58,4 +61,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' 
&& \ python3 -m pip cache purge +RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc + CMD ["/bin/bash"] diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index 6d03629a..f48a3e87 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -20,10 +20,12 @@ FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG SOC_VERSION="ascend310p1" -ENV SOC_VERSION=$SOC_VERSION +ENV SOC_VERSION=$SOC_VERSION \ + TASK_QUEUE_ENABLE=1 \ + OMP_NUM_THREADS=1 RUN yum update -y && \ - yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \ rm -rf /var/cache/yum RUN pip config set global.index-url ${PIP_INDEX_URL} @@ -55,4 +57,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \ python3 -m pip cache purge +RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc + CMD ["/bin/bash"] diff --git a/Dockerfile.a3 b/Dockerfile.a3 index 59975d99..73187eea 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -24,7 +24,9 @@ ARG SOC_VERSION="ascend910_9391" COPY . 
/vllm-workspace/vllm-ascend/ # Define environments ENV DEBIAN_FRONTEND=noninteractive -ENV SOC_VERSION=$SOC_VERSION +ENV SOC_VERSION=$SOC_VERSION \ + TASK_QUEUE_ENABLE=1 \ + OMP_NUM_THREADS=1 RUN pip config set global.index-url ${PIP_INDEX_URL} @@ -32,7 +34,7 @@ WORKDIR /workspace # Install Mooncake dependencies RUN apt-get update -y && \ - apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \ + apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \ git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \ cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \ cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \ @@ -65,4 +67,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \ python3 -m pip cache purge +RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc + CMD ["/bin/bash"] diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index 80a965bd..6ec647cb 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -21,7 +21,9 @@ ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG MOONCAKE_TAG="v0.3.7.post2" ARG SOC_VERSION="ascend910_9391" -ENV SOC_VERSION=$SOC_VERSION +ENV SOC_VERSION=$SOC_VERSION \ + TASK_QUEUE_ENABLE=1 \ + OMP_NUM_THREADS=1 RUN pip config set global.index-url ${PIP_INDEX_URL} @@ -32,7 +34,7 @@ COPY . 
/vllm-workspace/vllm-ascend/ SHELL ["/bin/bash", "-c"] RUN yum update -y && \ - yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \ git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \ cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \ ARCH=$(uname -m) && \ @@ -68,4 +70,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \ python3 -m pip cache purge +RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc + CMD ["/bin/bash"] diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 7d509c0a..8a0534dd 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -21,7 +21,9 @@ ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG MOONCAKE_TAG="v0.3.7.post2" ARG SOC_VERSION="ascend910b1" -ENV SOC_VERSION=$SOC_VERSION +ENV SOC_VERSION=$SOC_VERSION \ + TASK_QUEUE_ENABLE=1 \ + OMP_NUM_THREADS=1 RUN pip config set global.index-url ${PIP_INDEX_URL} @@ -32,7 +34,7 @@ COPY . 
/vllm-workspace/vllm-ascend/ SHELL ["/bin/bash", "-c"] RUN yum update -y && \ - yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \ git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \ cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \ ARCH=$(uname -m) && \ @@ -68,4 +70,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \ python3 -m pip cache purge +RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc + CMD ["/bin/bash"] diff --git a/docs/source/developer_guide/performance_and_debug/optimization_and_tuning.md b/docs/source/developer_guide/performance_and_debug/optimization_and_tuning.md index a0d06351..d61ea0a5 100644 --- a/docs/source/developer_guide/performance_and_debug/optimization_and_tuning.md +++ b/docs/source/developer_guide/performance_and_debug/optimization_and_tuning.md @@ -182,3 +182,87 @@ Plus, there are more features for performance optimization in specific scenarios - `HCCL_RDMA_TC`: Use this var to configure traffic class of RDMA NIC. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0045.html). - `HCCL_RDMA_SL`: Use this var to configure service level of RDMA NIC. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0046.html). - `HCCL_BUFFSIZE`: Use this var to control the cache size for sharing data between two NPUs. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0047.html). + +### 5. 
+These settings must be applied on the host OS with root privileges, not inside containers.
+:::
+
+#### 5.1 Set CPU Frequency Governor to `performance`
+
+
+- Typical recommended range: 50000–100000 (NOTE(review): the Linux default is 500000 ns; a value below the default *lowers* the migration cost rather than increasing it — confirm the intended value)