[CPU] fix OOM when mem-fraction is not set (#9090)
This commit is contained in:
@@ -31,8 +31,7 @@ ENV PIP_ROOT_USER_ACTION=ignore
|
|||||||
ENV CONDA_PREFIX=/sgl-workspace/miniforge3
|
ENV CONDA_PREFIX=/sgl-workspace/miniforge3
|
||||||
|
|
||||||
RUN pip config set global.index-url https://download.pytorch.org/whl/cpu && \
|
RUN pip config set global.index-url https://download.pytorch.org/whl/cpu && \
|
||||||
pip config set global.extra-index-url https://pypi.org/simple && \
|
pip config set global.extra-index-url https://pypi.org/simple
|
||||||
pip install intel-openmp
|
|
||||||
|
|
||||||
RUN git clone https://github.com/sgl-project/sglang.git && \
|
RUN git clone https://github.com/sgl-project/sglang.git && \
|
||||||
cd sglang && \
|
cd sglang && \
|
||||||
@@ -41,7 +40,7 @@ RUN git clone https://github.com/sgl-project/sglang.git && \
|
|||||||
pip install torch==${VER_TORCH} torchvision==${VER_TORCHVISION} triton==${VER_TRITON} --force-reinstall && \
|
pip install torch==${VER_TORCH} torchvision==${VER_TORCHVISION} triton==${VER_TRITON} --force-reinstall && \
|
||||||
cd sgl-kernel && \
|
cd sgl-kernel && \
|
||||||
cp pyproject_cpu.toml pyproject.toml && \
|
cp pyproject_cpu.toml pyproject.toml && \
|
||||||
pip install -v .
|
pip install .
|
||||||
|
|
||||||
ENV SGLANG_USE_CPU_ENGINE=1
|
ENV SGLANG_USE_CPU_ENGINE=1
|
||||||
ENV LD_PRELOAD=/sgl-workspace/miniforge3/lib/libiomp5.so:/sgl-workspace/miniforge3/lib/libtcmalloc.so:/sgl-workspace/miniforge3/lib/libtbbmalloc.so.2
|
ENV LD_PRELOAD=/sgl-workspace/miniforge3/lib/libiomp5.so:/sgl-workspace/miniforge3/lib/libtcmalloc.so:/sgl-workspace/miniforge3/lib/libtbbmalloc.so.2
|
||||||
|
|||||||
@@ -84,13 +84,13 @@ git checkout <YOUR-DESIRED-VERSION>
|
|||||||
# Install SGLang dependent libs, and build SGLang main package
|
# Install SGLang dependent libs, and build SGLang main package
|
||||||
pip install --upgrade pip setuptools
|
pip install --upgrade pip setuptools
|
||||||
conda install -y libsqlite==3.48.0 gperftools tbb libnuma numactl
|
conda install -y libsqlite==3.48.0 gperftools tbb libnuma numactl
|
||||||
pip install intel-openmp
|
|
||||||
pip install -e "python[all_cpu]"
|
pip install -e "python[all_cpu]"
|
||||||
|
pip install torch==2.7.1 torchvision==0.22.1 triton==3.3.1 --force-reinstall
|
||||||
|
|
||||||
# Build the CPU backend kernels
|
# Build the CPU backend kernels
|
||||||
cd sgl-kernel
|
cd sgl-kernel
|
||||||
cp pyproject_cpu.toml pyproject.toml
|
cp pyproject_cpu.toml pyproject.toml
|
||||||
pip install -v .
|
pip install .
|
||||||
|
|
||||||
# Other required environment variables
|
# Other required environment variables
|
||||||
# Recommend to set these in ~/.bashrc in order not to set every time in a new terminal
|
# Recommend to set these in ~/.bashrc in order not to set every time in a new terminal
|
||||||
@@ -134,13 +134,17 @@ Notes:
|
|||||||
export SGLANG_CPU_OMP_THREADS_BIND="0-39|43-82|86-125|128-167|171-210|214-253"
|
export SGLANG_CPU_OMP_THREADS_BIND="0-39|43-82|86-125|128-167|171-210|214-253"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Please beware that with SGLANG_CPU_OMP_THREADS_BIND set,
|
||||||
|
the available memory amounts of the ranks may not be determined in prior.
|
||||||
|
You may need to set proper `--max-total-tokens` to avoid the out-of-memory error.
|
||||||
|
|
||||||
3. For optimizing decoding with torch.compile, please add the flag `--enable-torch-compile`.
|
3. For optimizing decoding with torch.compile, please add the flag `--enable-torch-compile`.
|
||||||
To specify the maximum batch size when using torch compile, set the flag `--torch-compile-max-bs`.
|
To specify the maximum batch size when using torch compile, set the flag `--torch-compile-max-bs`.
|
||||||
For example, `--enable-torch-compile --torch-compile-max-bs 4` means using torch compile and setting the
|
For example, `--enable-torch-compile --torch-compile-max-bs 4` means using torch compile and setting the
|
||||||
maximum batch size to 4.
|
maximum batch size to 4.
|
||||||
|
|
||||||
4. A warmup step is automatically triggered when the service is started.
|
4. A warmup step is automatically triggered when the service is started.
|
||||||
The server is ready when you see the log `The server is fired up and ready to roll!`.
|
The server is ready when you see the log `The server is fired up and ready to roll!`.
|
||||||
|
|
||||||
## Benchmarking with Requests
|
## Benchmarking with Requests
|
||||||
|
|
||||||
@@ -164,7 +168,7 @@ python -m sglang.bench_serving -h
|
|||||||
```
|
```
|
||||||
|
|
||||||
Additionally, the requests can be formed with
|
Additionally, the requests can be formed with
|
||||||
[OpenAI Completions API](https://docs.sglang.ai/backend/openai_api_completions.html)
|
[OpenAI Completions API](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
|
||||||
and sent via the command line (e.g. using `curl`) or via your own script.
|
and sent via the command line (e.g. using `curl`) or via your own script.
|
||||||
|
|
||||||
## Example: Running DeepSeek-R1
|
## Example: Running DeepSeek-R1
|
||||||
@@ -180,7 +184,6 @@ python -m sglang.launch_server \
|
|||||||
--quantization w8a8_int8 \
|
--quantization w8a8_int8 \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--mem-fraction-static 0.8 \
|
--mem-fraction-static 0.8 \
|
||||||
--max-total-token 65536 \
|
|
||||||
--tp 6
|
--tp 6
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -194,7 +197,6 @@ python -m sglang.launch_server \
|
|||||||
--device cpu \
|
--device cpu \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--mem-fraction-static 0.8 \
|
--mem-fraction-static 0.8 \
|
||||||
--max-total-token 65536 \
|
|
||||||
--tp 6
|
--tp 6
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ srt_hip = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
# https://docs.sglang.ai/platforms/cpu_server.html
|
# https://docs.sglang.ai/platforms/cpu_server.html
|
||||||
srt_cpu = ["sglang[runtime_common]"]
|
srt_cpu = ["sglang[runtime_common]", "intel-openmp"]
|
||||||
|
|
||||||
# https://docs.sglang.ai/platforms/ascend_npu.html
|
# https://docs.sglang.ai/platforms/ascend_npu.html
|
||||||
srt_npu = ["sglang[runtime_common]"]
|
srt_npu = ["sglang[runtime_common]"]
|
||||||
|
|||||||
@@ -1673,10 +1673,9 @@ class ModelRunner:
|
|||||||
|
|
||||||
def init_threads_binding(self):
|
def init_threads_binding(self):
|
||||||
omp_cpuids = os.environ.get("SGLANG_CPU_OMP_THREADS_BIND", "all")
|
omp_cpuids = os.environ.get("SGLANG_CPU_OMP_THREADS_BIND", "all")
|
||||||
|
cpu_ids_by_node = get_cpu_ids_by_node()
|
||||||
|
n_numa_node = len(cpu_ids_by_node)
|
||||||
if omp_cpuids == "all":
|
if omp_cpuids == "all":
|
||||||
cpu_ids_by_node = get_cpu_ids_by_node()
|
|
||||||
n_numa_node = len(cpu_ids_by_node)
|
|
||||||
|
|
||||||
assert self.tp_size <= n_numa_node, (
|
assert self.tp_size <= n_numa_node, (
|
||||||
f"SGLANG_CPU_OMP_THREADS_BIND is not set, in this case, "
|
f"SGLANG_CPU_OMP_THREADS_BIND is not set, in this case, "
|
||||||
f"tp_size {self.tp_size} should be smaller than or equal to number of numa node on the machine {n_numa_node}. "
|
f"tp_size {self.tp_size} should be smaller than or equal to number of numa node on the machine {n_numa_node}. "
|
||||||
@@ -1693,7 +1692,18 @@ class ModelRunner:
|
|||||||
)
|
)
|
||||||
self.local_omp_cpuid = cpu_ids_by_node[self.tp_rank]
|
self.local_omp_cpuid = cpu_ids_by_node[self.tp_rank]
|
||||||
else:
|
else:
|
||||||
self.local_omp_cpuid = omp_cpuids.split("|")[self.tp_rank]
|
threads_bind_list = omp_cpuids.split("|")
|
||||||
|
assert self.tp_size == len(threads_bind_list), (
|
||||||
|
f"SGLANG_CPU_OMP_THREADS_BIND setting must be aligned with TP size parameter ({self.tp_size}). "
|
||||||
|
f"Please double check your settings."
|
||||||
|
)
|
||||||
|
self.local_omp_cpuid = threads_bind_list[self.tp_rank]
|
||||||
|
if self.tp_size > n_numa_node:
|
||||||
|
logger.warning(
|
||||||
|
f"TP size ({self.tp_size})is larger than numa node number ({n_numa_node}), "
|
||||||
|
f"in this case the available memory amount of each rank cannot be determined in prior. "
|
||||||
|
f"Please set proper `--max-total-tokens` to avoid the out-of-memory error."
|
||||||
|
)
|
||||||
|
|
||||||
def apply_torch_tp(self):
|
def apply_torch_tp(self):
|
||||||
logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
|
logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
|
||||||
|
|||||||
@@ -434,7 +434,9 @@ def get_available_gpu_memory(
|
|||||||
|
|
||||||
elif device == "cpu":
|
elif device == "cpu":
|
||||||
# TODO: rename the variables in the current function to be not GPU specific
|
# TODO: rename the variables in the current function to be not GPU specific
|
||||||
free_gpu_memory = psutil.virtual_memory().available
|
total_free_memory = psutil.virtual_memory().available
|
||||||
|
n_numa_node: int = len(get_cpu_ids_by_node())
|
||||||
|
free_gpu_memory = round(total_free_memory / n_numa_node, 3)
|
||||||
elif device == "npu":
|
elif device == "npu":
|
||||||
num_gpus = torch.npu.device_count()
|
num_gpus = torch.npu.device_count()
|
||||||
assert gpu_id < num_gpus
|
assert gpu_id < num_gpus
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ class TestIntelAMXAttnBackend(CustomTestCase):
|
|||||||
"--attention-backend",
|
"--attention-backend",
|
||||||
"intel_amx",
|
"intel_amx",
|
||||||
"--mem-fraction-static",
|
"--mem-fraction-static",
|
||||||
"0.05",
|
"0.3",
|
||||||
"--disable-radix",
|
"--disable-radix",
|
||||||
"--trust-remote-code",
|
"--trust-remote-code",
|
||||||
"--disable-overlap-schedule",
|
"--disable-overlap-schedule",
|
||||||
|
|||||||
Reference in New Issue
Block a user