add src
This commit is contained in:
4
.dockerignore
Normal file
4
.dockerignore
Normal file
@@ -0,0 +1,4 @@
|
||||
.git
|
||||
__pycache__/
|
||||
Dockerfile
|
||||
Dockerfile.xtrt-llm
|
||||
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
__pycache__/
|
||||
12
Dockerfile
Normal file
12
Dockerfile
Normal file
@@ -0,0 +1,12 @@
|
||||
# Inference-service image for kunlunxin R200-8F, layered on the xtrt-llm base.
ARG BASE_IMAGE=harbor.4pd.io/mic-llm-x/kunlunxin-xtrt-llm:0.5.3.2
FROM $BASE_IMAGE

# Use a PyPI mirror for faster installs; override with --build-arg PYPI_MIRROR=...
ARG PYPI_MIRROR=https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip config set global.index-url $PYPI_MIRROR

# Install Python dependencies first so this layer is cached across code changes.
ADD requirements.txt .
RUN pip install -r requirements.txt

# Application code (entry point plus its helper module).
ADD llm_utils.py main.py ./

ENTRYPOINT ["python3", "main.py"]
|
||||
12
Dockerfile.xtrt-llm
Normal file
12
Dockerfile.xtrt-llm
Normal file
@@ -0,0 +1,12 @@
|
||||
# Builds the xtrt-llm base image on top of the vendor XTCL Ubuntu 20.04 image.
ARG BASE_IMAGE=xtcl_ubuntu2004:v4.6
FROM $BASE_IMAGE

WORKDIR /workspace
ADD . .

# Use a PyPI mirror for faster installs; override with --build-arg PYPI_MIRROR=...
ARG PYPI_MIRROR=https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip config set global.index-url $PYPI_MIRROR

# Run the vendor release installer, then remove installers/downloads to shrink
# the final image.
RUN bash scripts/install_release.sh && \
    rm -rf *.whl *.run downloads
# NOTE(review): references a CUDA 11.7 path on a kunlunxin (XPU) image —
# presumably provided by the installer above; confirm this path is intended.
ENV PATH=/usr/local/cuda-11.7/bin:/home/pt201/bin:$PATH
|
||||
46
README.md
Normal file
46
README.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# kunlunxin
|
||||
|
||||
适配 kunlunxin 昆仑芯R200-8F加速卡的大模型推理服务镜像
|
||||
|
||||
## 启动
|
||||
|
||||
### 使用docker方式启动
|
||||
|
||||
```bash
|
||||
docker run -it --rm \
|
||||
--net=host \
|
||||
-v /mnt/disk0/models/model-qwen1-5-72b-chat/:/model \
|
||||
-e MODEL_NAME=qwen1.5-72b \
|
||||
-e NUM_GPUs=4 \
|
||||
-e WEIGHT_ONLY_PRECISION=int8 \
|
||||
--device /dev/xpuctrl \
|
||||
--device /dev/xpu0 \
|
||||
--device /dev/xpu1 \
|
||||
--device /dev/xpu2 \
|
||||
--device /dev/xpu3 \
|
||||
harbor.4pd.io/mic-llm-x/slx-infer-kunlunxin:release-0.1-pipe-1-commit-cd30b38d
|
||||
```
|
||||
|
||||
### 参数说明
|
||||
|
||||
#### 环境变量
|
||||
|
||||
- MODEL_PATH: 模型在容器中的路径,默认为 `/model`
|
||||
|
||||
- MODEL_NAME: 模型名字,用于api接口中

- NUM_GPUs: 使用的加速卡数量(即张量并行大小),默认为 `1`
|
||||
|
||||
- PORT:端口,默认`80`
|
||||
|
||||
- BUILD_SCRIPT_ROOT:编译脚本目录,一般不需要修改
|
||||
|
||||
- WEIGHT_ONLY_PRECISION:量化权重的精度,`int8`或`int4`
|
||||
|
||||
- ENGINE_DIR:编译后的模型存储路径,默认`./xtrt_engine`
|
||||
|
||||
- BUILD_EXTRA:编译用到的额外参数
|
||||
|
||||
#### 参数
|
||||
|
||||
基本与vllm相同,可以使用--help查看。
|
||||
|
||||
由于后端的engine使用的是xtrt的engine,所以相关的参数无效或造成未知的结果,所以不建议修改相关参数。
|
||||
33
llm_utils.py
Normal file
33
llm_utils.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class ModelConfig:
    """Thin wrapper around a HuggingFace model config loaded from disk."""

    def __init__(self, model_path: str):
        # Loads only the config (not the weights) from the model directory.
        self.hf_config = PretrainedConfig.from_pretrained(model_path)

    def model_type(self):
        """Return the HF ``model_type`` string (e.g. ``"llama"``)."""
        return self.hf_config.model_type

    def max_model_len(self):
        """Derive the maximum sequence length from the HF config.

        Different model families store this limit under different keys;
        the smallest value among all keys present wins. Returns ``None``
        when none of the known keys exist on the config.
        """
        known_keys = (
            "max_position_embeddings",  # OPT
            "n_positions",              # GPT-2
            "max_seq_len",              # MPT
            "seq_length",               # ChatGLM2
            "max_sequence_length",      # others
            "max_seq_length",
            "seq_len",
        )
        candidates = [
            value
            for key in known_keys
            if (value := getattr(self.hf_config, key, None)) is not None
        ]
        return min(candidates) if candidates else None
|
||||
242
main.py
Normal file
242
main.py
Normal file
@@ -0,0 +1,242 @@
|
||||
import os
|
||||
import sys
|
||||
# import copy
|
||||
import subprocess
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
from llm_utils import ModelConfig
|
||||
# from xtrt_llm.vllm.entrypoints.openai.api_server import parse_args
|
||||
|
||||
|
||||
class Config:
    """Runtime configuration assembled from environment variables."""

    def __init__(self):
        env = os.getenv
        # Where the model weights/config live inside the container.
        self.model_path = env("MODEL_PATH", "/model")
        # Name reported by the API server (optional).
        self.model_name = env("MODEL_NAME")
        # Number of accelerator cards; also used as the tensor-parallel size.
        self.num_gpus = int(env("NUM_GPUs", "1"))
        self.port = env("PORT", "80")
        # Root directory of the vendor build scripts.
        self.script_root = env("BUILD_SCRIPT_ROOT", "examples")
        # "int8"/"int4" enables weight-only quantization when set.
        self.weight_only_precision = env("WEIGHT_ONLY_PRECISION")
        # Where the compiled engine is written / looked up.
        self.engine_dir = env("ENGINE_DIR", "./xtrt_engine")
        # Extra args appended verbatim to the build command.
        self.build_extra = env("BUILD_EXTRA")
        self.model_config = ModelConfig(self.model_path)
|
||||
|
||||
|
||||
class ModelRunner(ABC):
    """Base class that builds an xtrt engine and then serves it.

    Subclasses supply the model-family specific build script and arguments;
    this class runs the build as a subprocess and launches the
    OpenAI-compatible API server.
    """

    def __init__(self, config):
        self.config = config

    @abstractmethod
    def build_script(self) -> str:
        """Return the path to the vendor build script for this model family."""
        raise NotImplementedError()

    @abstractmethod
    def build_args(self) -> List[str]:
        """Return the model-family specific arguments for the build script."""
        # Consistent with build_script: abstract methods raise rather than pass.
        raise NotImplementedError()

    def build_command(self) -> List[str]:
        """Assemble the full engine-build command line."""
        cmd = [
            sys.executable,
            self.build_script()
        ] + self.build_args()
        if self.config.build_extra:
            # split() instead of split(' '): repeated/odd whitespace in
            # BUILD_EXTRA must not yield empty-string argv entries.
            cmd.extend(self.config.build_extra.split())
        return cmd

    def build(self):
        """Build the engine unless the engine directory already exists.

        Raises:
            RuntimeError: if the build subprocess exits non-zero.
        """
        if os.path.exists(self.config.engine_dir):
            # An existing engine dir is treated as "already built".
            print(f"engine path {self.config.engine_dir} exists")
            return
        cmd = self.build_command()
        print(f"build command: {cmd}")
        p = subprocess.Popen(cmd)
        p.wait()
        if p.returncode != 0:
            raise RuntimeError(f"build failed, exit code {p.returncode}")
        print("build success")

    @staticmethod
    def serve_module():
        """Module path of the OpenAI-compatible API server entry point."""
        return 'xtrt_llm.vllm.entrypoints.openai.api_server'

    def serve_command(self) -> List[str]:
        """Assemble the API-server command line.

        Any CLI args given to this launcher are forwarded to the server.
        """
        cmd = [
            sys.executable,
            '-m',
            self.serve_module(),
            '--port',
            self.config.port,
            '--model',
            self.config.model_path,
            '--engine_dir',
            self.config.engine_dir,
            '--trust-remote-code',
            '--tensor-parallel-size',
            str(self.config.num_gpus),
            '--dtype',
            'float16',
        ]
        if self.config.model_name:
            cmd.extend(['--served-model-name', self.config.model_name])
        # Pass through the launcher's own CLI args untouched.
        cmd.extend(sys.argv[1:])
        return cmd

    def serve(self):
        """Run the API server, blocking until it exits.

        Raises:
            RuntimeError: if the server subprocess exits non-zero.
        """
        cmd = self.serve_command()
        print(f"serve command: {cmd}")
        p = subprocess.Popen(cmd)
        p.wait()
        if p.returncode != 0:
            raise RuntimeError(f"serve failed, exit code {p.returncode}")
|
||||
|
||||
|
||||
class ChatGLMRunner(ModelRunner):
    """Runner for ChatGLM-family models."""

    def __init__(self, config):
        super().__init__(config)
        # Name passed to the vendor build script (distinct from the served name).
        self.build_model_name = os.getenv("BUILD_MODEL_NAME", "chatglm3_6b")

    def build_script(self):
        return f"{self.config.script_root}/chatglm/build.py"

    def build_args(self):
        tp = str(self.config.num_gpus)
        return [
            '--model_dir', self.config.model_path,
            '--output_dir', self.config.engine_dir,
            '--model_name', self.build_model_name,
            '--dtype', 'float16',
            '--use_gpt_attention_plugin', 'float16',
            '--remove_input_padding',
            '--paged_kv_cache',
            '--world_size', tp,
            '--tp_size', tp,
        ]
|
||||
|
||||
|
||||
class LlamaRunner(ModelRunner):
    """Runner for Llama-family models."""

    def build_script(self):
        return f"{self.config.script_root}/llama/build.py"

    def build_args(self):
        tp = str(self.config.num_gpus)
        args = [
            '--model_dir', self.config.model_path,
            '--output_dir', self.config.engine_dir,
            '--dtype', 'float16',
            '--use_gpt_attention_plugin', 'float16',
            '--world_size', tp,
            '--tp_size', tp,
            '--use_parallel_embedding',
            '--remove_input_padding',
            '--opt_memory_use',
            '--paged_kv_cache',
            '--tokens_per_block', '64',
        ]
        precision = self.config.weight_only_precision
        if precision:
            # Weight-only quantization is opt-in via WEIGHT_ONLY_PRECISION.
            args += ['--use_weight_only', '--weight_only_precision', precision]
        return args
|
||||
|
||||
|
||||
class QWenRunner(ModelRunner):
    """Runner for Qwen / Qwen1.5 models."""

    def build_script(self):
        return f"{self.config.script_root}/qwen/build.py"

    def build_args(self):
        tp = str(self.config.num_gpus)
        args = [
            '--hf_model_dir', self.config.model_path,
            '--output_dir', self.config.engine_dir,
            '--dtype', 'float16',
            '--use_gpt_attention_plugin', 'float16',
            '--world_size', tp,
            '--tp_size', tp,
            '--remove_input_padding',
            '--opt_memory_use',
            '--paged_kv_cache',
            '--tokens_per_block', '64',
        ]
        if self.config.model_config.model_type() == "qwen2":
            # The build script only supports version 1.5 for this model type.
            args += ["--version", "1.5"]
        precision = self.config.weight_only_precision
        if precision:
            # Weight-only quantization is opt-in via WEIGHT_ONLY_PRECISION.
            args += ['--use_weight_only', '--weight_only_precision', precision]
        return args
|
||||
|
||||
|
||||
# Maps the HF `model_type` string to the runner class that can build/serve it.
runners = dict(
    chatglm=ChatGLMRunner,
    llama=LlamaRunner,
    qwen=QWenRunner,
    qwen2=QWenRunner,  # Qwen1.5 checkpoints report model_type "qwen2"
)
|
||||
|
||||
|
||||
def new_runner() -> ModelRunner:
    """Instantiate the runner matching the model's HF ``model_type``.

    Raises:
        RuntimeError: if the model type has no registered runner.
    """
    config = Config()
    model_type = config.model_config.model_type()
    try:
        runner_cls = runners[model_type]
    except KeyError:
        raise RuntimeError(f"model type {model_type} unsupported") from None
    return runner_cls(config)
|
||||
|
||||
|
||||
def check_args():
    """Delegate ``-h``/``--help`` to the underlying API server and exit.

    Does nothing when no help flag is present on the command line.
    """
    if not ({'-h', '--help'} & set(sys.argv[1:])):
        return
    cmd = [sys.executable, '-m', ModelRunner.serve_module(), '--help']
    p = subprocess.Popen(cmd)
    p.wait()
    # Propagate the server's own exit status for --help.
    sys.exit(p.returncode)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Handle -h/--help first so a help request never triggers an engine build.
    check_args()
    runner = new_runner()
    # Build the engine (skipped if the engine dir already exists), then serve.
    runner.build()
    runner.serve()
|
||||
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
# for qwen1.5
|
||||
transformers==4.37.1
|
||||
accelerate==0.21.0
|
||||
Reference in New Issue
Block a user