From 72aa7e690a28130ccf09a7823f3e28a8fbde7dcb Mon Sep 17 00:00:00 2001
From: Lu Xinlong <luxinlong02@4paradigm.com>
Date: Tue, 23 Jun 2026 17:17:22 +0800
Subject: [PATCH] Add README and start commands

---
 README.md            | 132 ++++++++++---------------------------------
 README_qwen3_6.md    |  42 --------------
 computility-run.yaml |  33 +++++++++++
 3 files changed, 63 insertions(+), 144 deletions(-)
 delete mode 100644 README_qwen3_6.md
 create mode 100644 computility-run.yaml

diff --git a/README.md b/README.md
index afd81ce..c7c70a8 100644
--- a/README.md
+++ b/README.md
@@ -1,118 +1,46 @@
-# 天数智芯 天垓100 文本生成引擎（基于 vLLM 优化）
-
-本项目是为**天数智芯-天垓100**加速卡深度优化的高性能文本生成推理引擎，基于开源 **vLLM** 框架进行架构级适配与增强，率先实现对 **Qwen3 系列**等最新大模型的高效支持。通过引入 **Prefix Caching**、PagedAttention 等先进优化技术，显著提升吞吐与响应速度，同时提供标准 **OpenAI 兼容 API 接口**，便于无缝集成现有应用生态。
-
-## 支持模型
-
-- **Qwen3**
-- **Llama3**
-- **DeepSeek-R1-Distill**
-- 其他兼容 vLLM 的 HuggingFace 模型（持续扩展中）
-
-> 模型下载地址：[https://modelscope.cn/models/Qwen](https://modelscope.cn/models/Qwen)  
-
----
-
-## Quick Start
-
-### 1. 模型下载
-
-从 ModelScope 下载所需模型（以 Qwen2.5-7B-Instruct 为例）：
-
-```bash
-modelscope download --model qwen/Qwen2.5-7B-Instruct README.md --local_dir /mnt/models/Qwen2.5-7B-Instruct
-```
-
-> ⚠️ 请确保模型路径在后续 Docker 启动时正确挂载。
-
----
-
-### 2. 拉取并构建 Docker 镜像
-
-我们提供已预装天垓100驱动与vLLM优化版本的Docker镜像：
+# 天数智芯 天垓100 文本生成引擎（基于 vLLM 优化，适配 Qwen3.6-35B-A3B）
 
 ```
 # 本地构建
-docker build -t enginex-iluvatar-vllm:bi100 -f Dockerfile .
+docker build -t enginex-iluvatar-vllm:bi100-qwen3.6 -f Dockerfile .
 ```
 
----
 
-### 3. 启动服务容器
+启动容器镜像
 
-```bash
-docker run -it --rm -p 8000:80 \
-  --name vllm-iluvatar \
-  -v /mnt/models/Qwen2.5-7B-Instruct:/model:ro \
-  --privileged \
-  -e TENSOR_PARALLEL_SIZE=1 \
-  -e PREFIX_CACHING=true \
-  -e MAX_MODEL_LEN=10000 \
-  enginex-iluvatar-vllm:bi100
+下载 Qwen3.6-35B-A3B 模型，并将模型 config.json 文件中的 architectures 字段修改为：
+```json
+"architectures": [
+        "Qwen3_5MoeForCausalLM"
+    ]
 ```
 
-> ✅ 参数说明：
-> - `PREFIX_CACHING=true`: 启用 Prefix Caching 优化，显著提升多请求共享前缀的推理效率
-> - `MAX_MODEL_LEN=10000`: 支持长上下文推理
-> - `--privileged`: 确保天垓100设备可见
-
----
-
-## 4. 测试服务（使用 OpenAI 兼容接口）
-
-服务启动后，可通过标准 OpenAI SDK 或 `curl` 进行测试。
-
-### 示例：文本生成请求
-
 ```bash
-curl http://localhost:8000/v1/chat/completions \
+docker run -dit --network=host --ipc=host \
+  -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev --privileged \
+  -v /mnt/disk1/models/Qwen3.6-35B-A3B:/model:ro --entrypoint=python3 \
+  -e CUDA_VISIBLE_DEVICES=4,5,6,7 -e VLLM_ENGINE_ITERATION_TIMEOUT_S=3600 \
+  enginex-iluvatar-vllm:bi100-qwen3.6 \
+  -m vllm.entrypoints.openai.api_server \
+  --model /model --port 1111 --served-model-name llm \
+  --max-model-len 100000 --trust-remote-code -tp 4 --gpu-memory-utilization 0.95 \
+  --max-num-seqs 1 --disable-log-requests --disable-frontend-multiprocessing \
+  --max-num-batched-tokens 4096 --enable-chunked-prefill \
+  --max-seq-len-to-capture 32768 --enable-auto-tool-choice \
+  --tool-call-parser qwen3_coder --reasoning-parser qwen3
+```
+
+发送请求测试服务：
+```bash
+curl http://localhost:1111/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "qwen3-8b",
+    "model": "llm",
     "messages": [
       {"role": "system", "content": "You are a helpful assistant."},
-      {"role": "user", "content": "请用中文介绍一下上海的特点。"}
+      {"role": "user", "content": "Can you tell me the story of Snow White?"}
     ],
-    "temperature": 0.7,
-    "max_tokens": 512
+    "max_tokens": 200,
+    "temperature": 0.7
   }'
-```
-
-### 使用 OpenAI Python SDK（需安装 `openai>=1.0`）
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")
-
-response = client.chat.completions.create(
-    model="qwen3-8b",
-    messages=[
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "请简要介绍杭州的特色文化。"}
-    ],
-    max_tokens=512,
-    temperature=0.7
-)
-
-print(response.choices[0].message.content)
-```
-
----
-
-## 测试结果对比（A100 vs 天垓100）
-
-### 测试数据集
-
-[chat_dataset_v0.json](chat_dataset_v0.json)
-
-### 测试结果
-
-在相同模型和输入条件下，测试平均输出速度（单位：字每秒），结果如下：
-
-| 模型 | 天垓100 输出速度 | Nvidia A100 输出速度 |
-|--------|--------------------------|-------------------------------|
-| Qwen2.5-7B-Instruct | 36.8 | 112.4 |
-| Qwen2.5-1.5B-Instruct-AWQ | 72.4 | 100.8 |
-| Qwen/Qwen1.5-32B-Chat | 12.4 | 55.7 |
-
+```
\ No newline at end of file
diff --git a/README_qwen3_6.md b/README_qwen3_6.md
deleted file mode 100644
index edd388f..0000000
--- a/README_qwen3_6.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# 天数智芯 天垓100 文本生成引擎（基于 vLLM 优化适配Qwen3.6-27B）
-
-```
-# 本地构建
-docker build -t enginex-iluvatar-vllm:bi100-qwen3.6 -f Dockerfile .
-```
-
-
-启动容器镜像
-
-下载Qwen3.6-27B模型，并且需要将模型的config.json文件中architectures字段改成
-```json
-"architectures": [
-        "Qwen3_5ForCausalLM"
-    ]
-```
-
-```bash
-docker run -dit --network=host --ipc=host \
-  -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev --privileged \
-  --name vllm-iluvatar \
-  -v /mnt/models/Qwen3.6-27B:/model:ro --entrypoint=python3 \
-  enginex-iluvatar-vllm:bi100 \
-  -m vllm.entrypoints.openai.api_server \
-  --model /model --port 1111 --served-model-name llm \
-  --max-model-len 10000 --enforce-eager --trust-remote-code -tp 4 --gpu-memory-utilization 0.95
-```
-
-请求
-```bash
-curl http://localhost:1111/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "llm",
-    "messages": [
-      {"role": "system", "content": "You are a helpful assistant."},
-      {"role": "user", "content": "Can you tell me the story of Snow White?"}
-    ],
-    "max_tokens": 200,
-    "temperature": 0.7
-  }'
-```
\ No newline at end of file
diff --git a/computility-run.yaml b/computility-run.yaml
new file mode 100644
index 0000000..116a20a
--- /dev/null
+++ b/computility-run.yaml
@@ -0,0 +1,33 @@
+gpu_num: 4
+command:
+    - python3
+    - -m
+    - vllm.entrypoints.openai.api_server
+    - --model
+    - /model
+    - --served-model-name
+    - llm
+    - --max-model-len
+    - '100000'
+    - --gpu-memory-utilization
+    - '0.95'
+    - --trust-remote-code
+    - -tp
+    - '4'
+    - --max-num-seqs
+    - '1'
+    - --disable-log-requests
+    - --disable-frontend-multiprocessing
+    - --max-num-batched-tokens
+    - '4096'
+    - --enable-chunked-prefill
+    - --max-seq-len-to-capture
+    - '32768'
+    - --enable-auto-tool-choice
+    - --tool-call-parser
+    - qwen3_coder
+    - --reasoning-parser
+    - qwen3
+env:
+    - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
+      value: '3600'
\ No newline at end of file