[CPU] misc updates (#11906)
This commit is contained in:
@@ -1,18 +1,19 @@
|
|||||||
# CPU Servers
|
# CPU Servers
|
||||||
|
|
||||||
The document addresses how to set up the [SGLang](https://github.com/sgl-project/sglang) environment and run LLM inference on CPU servers.
|
The document addresses how to set up the [SGLang](https://github.com/sgl-project/sglang) environment and run LLM inference on CPU servers.
|
||||||
Specifically, SGLang is well optimized on the CPUs equipped with Intel® AMX® Instructions,
|
SGLang is enabled and optimized on the CPUs equipped with Intel® AMX® Instructions,
|
||||||
which are 4th generation or newer Intel® Xeon® Scalable Processors.
|
which are 4th generation or newer Intel® Xeon® Scalable Processors.
|
||||||
|
|
||||||
## Optimized Model List
|
## Optimized Model List
|
||||||
|
|
||||||
A list of popular LLMs are optimized and run efficiently on CPU,
|
A number of popular LLMs are optimized and run efficiently on CPU,
|
||||||
including the most notable open-source models like Llama series, Qwen series,
|
including the most notable open-source models like Llama series, Qwen series,
|
||||||
and the phenomenal high-quality reasoning model DeepSeek-R1.
|
and DeepSeek series like DeepSeek-R1 and DeepSeek-V3.1-Terminus.
|
||||||
|
|
||||||
| Model Name | BF16 | w8a8_int8 | FP8 |
|
| Model Name | BF16 | W8A8_INT8 | FP8 |
|
||||||
|:---:|:---:|:---:|:---:|
|
|:---:|:---:|:---:|:---:|
|
||||||
| DeepSeek-R1 | | [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8) | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
|
| DeepSeek-R1 | | [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8) | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
|
||||||
|
| DeepSeek-V3.1-Terminus | | [IntervitensInc/DeepSeek-V3.1-Terminus-Channel-int8](https://huggingface.co/IntervitensInc/DeepSeek-V3.1-Terminus-Channel-int8) | [deepseek-ai/DeepSeek-V3.1-Terminus](https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus) |
|
||||||
| Llama-3.2-3B | [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | [RedHatAI/Llama-3.2-3B-quantized.w8a8](https://huggingface.co/RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8) | |
|
| Llama-3.2-3B | [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | [RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8](https://huggingface.co/RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8) | |
|
||||||
| Llama-3.1-8B | [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | [RedHatAI/Meta-Llama-3.1-8B-quantized.w8a8](https://huggingface.co/RedHatAI/Meta-Llama-3.1-8B-quantized.w8a8) | |
|
| Llama-3.1-8B | [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | [RedHatAI/Meta-Llama-3.1-8B-quantized.w8a8](https://huggingface.co/RedHatAI/Meta-Llama-3.1-8B-quantized.w8a8) | |
|
||||||
| QwQ-32B | | [RedHatAI/QwQ-32B-quantized.w8a8](https://huggingface.co/RedHatAI/QwQ-32B-quantized.w8a8) | |
|
| QwQ-32B | | [RedHatAI/QwQ-32B-quantized.w8a8](https://huggingface.co/RedHatAI/QwQ-32B-quantized.w8a8) | |
|
||||||
@@ -36,7 +37,7 @@ git clone https://github.com/sgl-project/sglang.git
|
|||||||
cd sglang/docker
|
cd sglang/docker
|
||||||
|
|
||||||
# Build the docker image
|
# Build the docker image
|
||||||
docker build -t sglang-cpu:main -f Dockerfile.xeon .
|
docker build -t sglang-cpu:latest -f Dockerfile.xeon .
|
||||||
|
|
||||||
# Initiate a docker container
|
# Initiate a docker container
|
||||||
docker run \
|
docker run \
|
||||||
@@ -48,7 +49,7 @@ docker run \
|
|||||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||||
-p 30000:30000 \
|
-p 30000:30000 \
|
||||||
-e "HF_TOKEN=<secret>" \
|
-e "HF_TOKEN=<secret>" \
|
||||||
sglang-cpu:main /bin/bash
|
sglang-cpu:latest /bin/bash
|
||||||
```
|
```
|
||||||
|
|
||||||
### Install From Source
|
### Install From Source
|
||||||
@@ -121,9 +122,9 @@ Notes:
|
|||||||
|
|
||||||
2. The flag `--tp 6` specifies that tensor parallelism will be applied using 6 ranks (TP6).
|
2. The flag `--tp 6` specifies that tensor parallelism will be applied using 6 ranks (TP6).
|
||||||
The number of TP specified is how many TP ranks will be used during the execution.
|
The number of TP specified is how many TP ranks will be used during the execution.
|
||||||
In a CPU platform, a TP rank means a sub-NUMA cluster (SNC).
|
On a CPU platform, a TP rank means a sub-NUMA cluster (SNC).
|
||||||
Usually we can get the SNC information (How many available) from Operation System.
|
Usually, the SNC information (how many SNCs are available) can be obtained from the operating system.
|
||||||
User can specify TP to be no more than the total available SNCs in current system.
|
Users can specify TP to be no more than the total available SNCs in the current system.
|
||||||
|
|
||||||
If the specified TP rank number differs from the total SNC count,
|
If the specified TP rank number differs from the total SNC count,
|
||||||
the system will automatically utilize the first `n` SNCs.
|
the system will automatically utilize the first `n` SNCs.
|
||||||
@@ -175,29 +176,29 @@ Additionally, the requests can be formed with
|
|||||||
[OpenAI Completions API](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
|
[OpenAI Completions API](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
|
||||||
and sent via the command line (e.g. using `curl`) or via your own script.
|
and sent via the command line (e.g. using `curl`) or via your own script.
|
||||||
|
|
||||||
## Example: Running DeepSeek-R1
|
## Example: Running DeepSeek-V3.1-Terminus
|
||||||
|
|
||||||
An example command to launch service for W8A8 DeepSeek-R1 on a Xeon® 6980P server
|
An example command to launch the service for W8A8_INT8 DeepSeek-V3.1-Terminus on a Xeon® 6980P server:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m sglang.launch_server \
|
python -m sglang.launch_server \
|
||||||
--model meituan/DeepSeek-R1-Channel-INT8 \
|
--model IntervitensInc/DeepSeek-V3.1-Terminus-Channel-int8 \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--disable-overlap-schedule \
|
--disable-overlap-schedule \
|
||||||
--device cpu \
|
--device cpu \
|
||||||
--quantization w8a8_int8 \
|
--quantization w8a8_int8 \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--mem-fraction-static 0.8 \
|
--mem-fraction-static 0.8 \
|
||||||
--enable-torch-compile \
|
--enable-torch-compile \
|
||||||
--torch-compile-max-bs 4 \
|
--torch-compile-max-bs 4 \
|
||||||
--tp 6
|
--tp 6
|
||||||
```
|
```
|
||||||
|
|
||||||
Similarly, an example command to launch service for FP8 DeepSeek-R1 would be
|
Similarly, an example command to launch the service for FP8 DeepSeek-V3.1-Terminus would be:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m sglang.launch_server \
|
python -m sglang.launch_server \
|
||||||
--model deepseek-ai/DeepSeek-R1 \
|
--model deepseek-ai/DeepSeek-V3.1-Terminus \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--disable-overlap-schedule \
|
--disable-overlap-schedule \
|
||||||
--device cpu \
|
--device cpu \
|
||||||
|
|||||||
@@ -1623,13 +1623,18 @@ def get_cpu_memory_capacity():
|
|||||||
for numa_id in range(n_numa_node):
|
for numa_id in range(n_numa_node):
|
||||||
file_meminfo = f"node{numa_id}/meminfo"
|
file_meminfo = f"node{numa_id}/meminfo"
|
||||||
with open(os.path.join(file_prefix, file_meminfo), "r") as f:
|
with open(os.path.join(file_prefix, file_meminfo), "r") as f:
|
||||||
# 1st line contains 'MemTotal'
|
# MemTotal info is at the 1st line
|
||||||
line = f.read().split("\n")[0]
|
line = f.readline()
|
||||||
numa_mem_list.append(int(line.split()[3]))
|
# Expected format: "Node 0 MemTotal: 100000000 kB"
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) >= 4 and parts[2] == "MemTotal:":
|
||||||
|
numa_mem_list.append(int(parts[3]))
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unexpected format in {file_meminfo}: {line}")
|
||||||
# Retrieved value in KB, need MB
|
# Retrieved value in KB, need MB
|
||||||
numa_mem = float(min(numa_mem_list) // 1024)
|
numa_mem = float(min(numa_mem_list) // 1024)
|
||||||
return numa_mem
|
return numa_mem
|
||||||
except FileNotFoundError:
|
except (FileNotFoundError, ValueError, IndexError):
|
||||||
numa_mem = psutil.virtual_memory().total / n_numa_node
|
numa_mem = psutil.virtual_memory().total / n_numa_node
|
||||||
# Retrieved value in Byte, need MB
|
# Retrieved value in Byte, need MB
|
||||||
return float(numa_mem // (1 << 20))
|
return float(numa_mem // (1 << 20))
|
||||||
|
|||||||
@@ -15,8 +15,7 @@ requires-python = ">=3.10"
|
|||||||
license = { file = "LICENSE" }
|
license = { file = "LICENSE" }
|
||||||
classifiers = [
|
classifiers = [
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"License :: OSI Approved :: Apache Software License",
|
"License :: OSI Approved :: Apache Software License"
|
||||||
"Environment :: CPU"
|
|
||||||
]
|
]
|
||||||
dependencies = []
|
dependencies = []
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user