This commit is contained in:
2025-08-06 10:30:31 +08:00
commit dfb6b3ae16
8 changed files with 353 additions and 0 deletions

4
.dockerignore Normal file
View File

@@ -0,0 +1,4 @@
.git
__pycache__/
Dockerfile
Dockerfile.xtrt-llm

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
__pycache__/

12
Dockerfile Normal file
View File

@@ -0,0 +1,12 @@
# Base image with the kunlunxin xtrt-llm runtime preinstalled.
ARG BASE_IMAGE=harbor.4pd.io/mic-llm-x/kunlunxin-xtrt-llm:0.5.3.2
FROM $BASE_IMAGE

# PyPI mirror, defaults to the Tsinghua mirror for faster installs in CN.
ARG PYPI_MIRROR=https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip config set global.index-url $PYPI_MIRROR

# COPY is preferred over ADD for plain local files (no tar/URL semantics).
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY llm_utils.py main.py ./

ENTRYPOINT ["python3", "main.py"]

12
Dockerfile.xtrt-llm Normal file
View File

@@ -0,0 +1,12 @@
# Base image: XTCL toolchain on Ubuntu 20.04.
ARG BASE_IMAGE=xtcl_ubuntu2004:v4.6
FROM $BASE_IMAGE

WORKDIR /workspace
# COPY is preferred over ADD for local build-context files.
COPY . .

# PyPI mirror, defaults to the Tsinghua mirror for faster installs in CN.
ARG PYPI_MIRROR=https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip config set global.index-url $PYPI_MIRROR

# Install the release artifacts, then remove the installers to slim the image.
RUN bash scripts/install_release.sh && \
    rm -rf *.whl *.run downloads

ENV PATH=/usr/local/cuda-11.7/bin:/home/pt201/bin:$PATH

46
README.md Normal file
View File

@@ -0,0 +1,46 @@
# kunlunxin
适配 kunlunxin 昆仑芯R200-8F加速卡的大模型推理服务镜像
## 启动
### 使用docker方式启动
```bash
docker run -it --rm \
--net=host \
-v /mnt/disk0/models/model-qwen1-5-72b-chat/:/model \
-e MODEL_NAME=qwen1.5-72b \
-e NUM_GPUs=4 \
-e WEIGHT_ONLY_PRECISION=int8 \
--device /dev/xpuctrl \
--device /dev/xpu0 \
--device /dev/xpu1 \
--device /dev/xpu2 \
--device /dev/xpu3 \
    harbor.4pd.io/mic-llm-x/slx-infer-kunlunxin:release-0.1-pipe-1-commit-cd30b38d
```
### 参数说明
#### 环境变量
- MODEL_PATH: 模型在容器中的路径,默认为 `/model`
- MODEL_NAME: 模型名字,用于 api 接口中
- PORT: 端口,默认 `80`
- BUILD_SCRIPT_ROOT: 编译脚本目录,一般不需要修改
- WEIGHT_ONLY_PRECISION: 量化权重的精度,`int8` 或 `int4`
- ENGINE_DIR: 编译后的模型存储路径,默认 `./xtrt_engine`
- BUILD_EXTRA: 编译用到的额外参数
#### 参数
基本与 vllm 相同,可以使用 `--help` 查看。
由于后端的 engine 使用的是 xtrt 的 engine,相关的参数无效或会造成未知的结果,所以不建议修改相关参数。

33
llm_utils.py Normal file
View File

@@ -0,0 +1,33 @@
from transformers import PretrainedConfig
class ModelConfig:
    """Thin wrapper over a HuggingFace model config directory."""

    # Config attributes that may carry the maximum sequence length,
    # depending on the model family.
    _LENGTH_KEYS = (
        "max_position_embeddings",  # OPT
        "n_positions",              # GPT-2
        "max_seq_len",              # MPT
        "seq_length",               # ChatGLM2
        "max_sequence_length",      # others
        "max_seq_length",
        "seq_len",
    )

    def __init__(self, model_path: str):
        # Loads config.json from the model directory.
        self.hf_config = PretrainedConfig.from_pretrained(model_path)

    def model_type(self):
        """Return the HF `model_type` string (e.g. "qwen2", "llama")."""
        return self.hf_config.model_type

    def max_model_len(self):
        """Return the smallest declared max sequence length, or None when
        the config declares none of the known length attributes."""
        found = [
            length
            for key in self._LENGTH_KEYS
            if (length := getattr(self.hf_config, key, None)) is not None
        ]
        return min(found) if found else None

242
main.py Normal file
View File

@@ -0,0 +1,242 @@
import os
import sys
# import copy
import subprocess
from abc import ABC, abstractmethod
from typing import List
from llm_utils import ModelConfig
# from xtrt_llm.vllm.entrypoints.openai.api_server import parse_args
class Config:
    """Runtime configuration assembled from environment variables."""

    def __init__(self):
        env = os.getenv
        # Where the model weights are mounted inside the container.
        self.model_path = env("MODEL_PATH", "/model")
        # Name exposed through the OpenAI-compatible API (optional).
        self.model_name = env("MODEL_NAME")
        # NOTE: the variable really is spelled "NUM_GPUs" (see README).
        self.num_gpus = int(env("NUM_GPUs", "1"))
        self.port = env("PORT", "80")
        # Directory holding the per-model build.py scripts.
        self.script_root = env("BUILD_SCRIPT_ROOT", "examples")
        # "int8" / "int4"; unset disables weight-only quantization.
        self.weight_only_precision = env("WEIGHT_ONLY_PRECISION")
        self.engine_dir = env("ENGINE_DIR", "./xtrt_engine")
        # Extra args appended verbatim to the build command.
        self.build_extra = env("BUILD_EXTRA")
        self.model_config = ModelConfig(self.model_path)
class ModelRunner(ABC):
    """Base class: builds an xtrt engine for a model, then serves it."""

    def __init__(self, config):
        self.config = config

    @abstractmethod
    def build_script(self) -> str:
        """Path of the model-specific build.py script."""
        raise NotImplementedError()

    @abstractmethod
    def build_args(self) -> List[str]:
        """Arguments passed to the build script."""

    def build_command(self) -> List[str]:
        """Return the full engine-build command line."""
        cmd = [
            sys.executable,
            self.build_script()
        ] + self.build_args()
        if self.config.build_extra:
            # str.split() with no argument collapses runs of whitespace,
            # so BUILD_EXTRA with consecutive spaces no longer injects
            # empty arguments (split(' ') did).
            cmd.extend(self.config.build_extra.split())
        return cmd

    def build(self):
        """Build the engine unless engine_dir already exists.

        Raises RuntimeError when the build subprocess exits non-zero.
        """
        if os.path.exists(self.config.engine_dir):
            print(f"engine path {self.config.engine_dir} exists")
            return
        cmd = self.build_command()
        print(f"build command: {cmd}")
        # subprocess.run waits for completion; equivalent to Popen+wait.
        proc = subprocess.run(cmd)
        if proc.returncode != 0:
            raise RuntimeError(f"build failed, exit code {proc.returncode}")
        print("build success")

    @staticmethod
    def serve_module():
        """Dotted path of the OpenAI-compatible API server module."""
        return 'xtrt_llm.vllm.entrypoints.openai.api_server'

    def serve_command(self) -> List[str]:
        """Return the serve command; forwards extra CLI args (sys.argv[1:])."""
        cmd = [
            sys.executable,
            '-m',
            self.serve_module(),
            '--port',
            self.config.port,
            '--model',
            self.config.model_path,
            '--engine_dir',
            self.config.engine_dir,
            '--trust-remote-code',
            '--tensor-parallel-size',
            str(self.config.num_gpus),
            '--dtype',
            'float16',
        ]
        if self.config.model_name:
            cmd.extend(['--served-model-name', self.config.model_name])
        cmd.extend(sys.argv[1:])
        return cmd

    def serve(self):
        """Run the API server.

        Raises RuntimeError when the server subprocess exits non-zero.
        """
        cmd = self.serve_command()
        print(f"serve command: {cmd}")
        proc = subprocess.run(cmd)
        if proc.returncode != 0:
            raise RuntimeError(f"serve failed, exit code {proc.returncode}")
class ChatGLMRunner(ModelRunner):
    """Engine builder for ChatGLM-family models."""

    def __init__(self, config):
        super().__init__(config)
        # Model name understood by the chatglm build script (used in
        # build args, not in the serving API).
        self.build_model_name = os.getenv("BUILD_MODEL_NAME", "chatglm3_6b")

    def build_script(self):
        return f"{self.config.script_root}/chatglm/build.py"

    def build_args(self):
        tp = str(self.config.num_gpus)
        return [
            '--model_dir', self.config.model_path,
            '--output_dir', self.config.engine_dir,
            '--model_name', self.build_model_name,
            '--dtype', 'float16',
            '--use_gpt_attention_plugin', 'float16',
            '--remove_input_padding',
            '--paged_kv_cache',
            '--world_size', tp,
            '--tp_size', tp,
            # '--parallel_build',
        ]
class LlamaRunner(ModelRunner):
    """Engine builder for Llama-family models."""

    def build_script(self):
        return f"{self.config.script_root}/llama/build.py"

    def build_args(self):
        tp = str(self.config.num_gpus)
        args = [
            '--model_dir', self.config.model_path,
            '--output_dir', self.config.engine_dir,
            '--dtype', 'float16',
            '--use_gpt_attention_plugin', 'float16',
            '--world_size', tp,
            '--tp_size', tp,
            # '--parallel_build',
            '--use_parallel_embedding',
            '--remove_input_padding',
            '--opt_memory_use',
            '--paged_kv_cache',
            '--tokens_per_block', '64',
        ]
        precision = self.config.weight_only_precision
        if precision:
            # Enable weight-only quantization at the requested precision.
            args += ['--use_weight_only', '--weight_only_precision', precision]
        return args
class QWenRunner(ModelRunner):
    """Engine builder for Qwen / Qwen1.5 models."""

    def build_script(self):
        return f"{self.config.script_root}/qwen/build.py"

    def build_args(self):
        tp = str(self.config.num_gpus)
        args = [
            '--hf_model_dir', self.config.model_path,
            '--output_dir', self.config.engine_dir,
            '--dtype', 'float16',
            '--use_gpt_attention_plugin', 'float16',
            '--world_size', tp,
            '--tp_size', tp,
            # '--parallel_build',
            # '--use_parallel_embedding',
            '--remove_input_padding',
            '--opt_memory_use',
            '--paged_kv_cache',
            '--tokens_per_block', '64',
        ]
        if self.config.model_config.model_type() == "qwen2":
            # HF "qwen2" config type means Qwen1.5; only 1.5 is supported.
            args += ["--version", "1.5"]
        precision = self.config.weight_only_precision
        if precision:
            # Enable weight-only quantization at the requested precision.
            args += ['--use_weight_only', '--weight_only_precision', precision]
        return args
# Maps the HF config `model_type` to the runner implementation for it.
runners = {
    "chatglm": ChatGLMRunner,
    "llama": LlamaRunner,
    "qwen": QWenRunner,
    "qwen2": QWenRunner,
}


def new_runner() -> ModelRunner:
    """Instantiate the runner matching the loaded model's `model_type`.

    Raises RuntimeError for model types with no registered runner.
    """
    config = Config()
    model_type = config.model_config.model_type()
    if model_type not in runners:
        raise RuntimeError(f"model type {model_type} unsupported")
    return runners[model_type](config)
def check_args():
    """If -h/--help was requested, delegate to the server module's help
    output and exit with its return code; otherwise do nothing."""
    cli_args = sys.argv[1:]
    if '-h' not in cli_args and '--help' not in cli_args:
        return
    help_cmd = [sys.executable, '-m', ModelRunner.serve_module(), '--help']
    proc = subprocess.Popen(help_cmd)
    proc.wait()
    sys.exit(proc.returncode)
if __name__ == '__main__':
    # Handle -h/--help before touching the model config.
    check_args()
    # Pick a runner from the model's config, build the engine if it is
    # not already present, then start the API server (blocks).
    runner = new_runner()
    runner.build()
    runner.serve()

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
# for qwen1.5
transformers==4.37.1
accelerate==0.21.0