commit dfb6b3ae167212736b2b36562c4b92f13d06a184 Author: Yang Jun01 Date: Wed Aug 6 10:30:31 2025 +0800 add src diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..68e484e --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +.git +__pycache__/ +Dockerfile +Dockerfile.xtrt-llm diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c18dd8d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cff095a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +ARG BASE_IMAGE=harbor.4pd.io/mic-llm-x/kunlunxin-xtrt-llm:0.5.3.2 +FROM $BASE_IMAGE + +ARG PYPI_MIRROR=https://pypi.tuna.tsinghua.edu.cn/simple +RUN pip config set global.index-url $PYPI_MIRROR + +ADD requirements.txt . +RUN pip install -r requirements.txt + +ADD llm_utils.py main.py ./ + +ENTRYPOINT ["python3", "main.py"] diff --git a/Dockerfile.xtrt-llm b/Dockerfile.xtrt-llm new file mode 100644 index 0000000..c742046 --- /dev/null +++ b/Dockerfile.xtrt-llm @@ -0,0 +1,12 @@ +ARG BASE_IMAGE=xtcl_ubuntu2004:v4.6 +FROM $BASE_IMAGE + +WORKDIR /workspace +ADD . . 
+ +ARG PYPI_MIRROR=https://pypi.tuna.tsinghua.edu.cn/simple +RUN pip config set global.index-url $PYPI_MIRROR + +RUN bash scripts/install_release.sh && \ + rm -rf *.whl *.run downloads +ENV PATH=/usr/local/cuda-11.7/bin:/home/pt201/bin:$PATH diff --git a/README.md b/README.md new file mode 100644 index 0000000..8fdc17f --- /dev/null +++ b/README.md @@ -0,0 +1,46 @@ +# kunlunxin + +适配 kunlunxin 昆仑芯R200-8F加速卡的大模型推理服务镜像 + +## 启动 + +### 使用docker方式启动 + +```bash +docker run -it --rm \ + --net=host \ + -v /mnt/disk0/models/model-qwen1-5-72b-chat/:/model \ + -e MODEL_NAME=qwen1.5-72b \ + -e NUM_GPUs=4 \ + -e WEIGHT_ONLY_PRECISION=int8 \ + --device /dev/xpuctrl \ + --device /dev/xpu0 \ + --device /dev/xpu1 \ + --device /dev/xpu2 \ + --device /dev/xpu3 \ +    harbor.4pd.io/mic-llm-x/slx-infer-kunlunxin:release-0.1-pipe-1-commit-cd30b38d +``` + +### 参数说明 + +#### 环境变量 + +- MODEL_PATH: 模型在容器中的路径,默认为 `/model` + +- MODEL_NAME: 模型名字,用于api接口中 + +- PORT:端口,默认`80` + +- BUILD_SCRIPT_ROOT:编译脚本目录,一般不需要修改 + +- WEIGHT_ONLY_PRECISION:量化权重的精度,`int8`或`int4` + +- ENGINE_DIR:编译后的模型存储路径,默认`./xtrt_engine` + +- BUILD_EXTRA:编译用到的额外参数 + +#### 参数 + +基本与vllm相同,可以使用--help查看。 + +由于后端的engine使用的是xtrt的engine,所以相关的参数无效或造成未知的结果,所以不建议修改相关参数。 \ No newline at end of file diff --git a/llm_utils.py b/llm_utils.py new file mode 100644 index 0000000..8984a77 --- /dev/null +++ b/llm_utils.py @@ -0,0 +1,33 @@ +from transformers import PretrainedConfig + + +class ModelConfig: + def __init__(self, model_path: str): + self.hf_config = PretrainedConfig.from_pretrained(model_path) + + def model_type(self): + return self.hf_config.model_type + + def max_model_len(self): + derived_max_model_len = float("inf") + possible_keys = [ + # OPT + "max_position_embeddings", + # GPT-2 + "n_positions", + # MPT + "max_seq_len", + # ChatGLM2 + "seq_length", + # Others + "max_sequence_length", + "max_seq_length", + "seq_len", + ] + for key in possible_keys: + max_len_key = getattr(self.hf_config, key, None) + if max_len_key is not None: 
class ModelRunner(ABC):
    """Base class that builds an engine for a model and then serves it.

    Subclasses supply the model-specific build script and its arguments;
    this class assembles the command lines and runs them as child processes.
    """

    def __init__(self, config):
        """Store the runtime configuration.

        Args:
            config: Object exposing ``model_path``, ``model_name``,
                ``num_gpus``, ``port``, ``engine_dir`` and ``build_extra``.
        """
        self.config = config

    @abstractmethod
    def build_script(self) -> str:
        """Return the path of the build script to execute."""
        raise NotImplementedError()

    @abstractmethod
    def build_args(self) -> List[str]:
        """Return the model-specific arguments passed to the build script."""

    def build_command(self) -> List[str]:
        """Assemble the full build command line.

        Returns:
            ``[python, build_script, *build_args, *extra]`` where ``extra``
            comes from ``config.build_extra``.
        """
        # Local import: keeps this block self-contained without touching the
        # file-level import list.
        import shlex

        cmd = [sys.executable, self.build_script()] + self.build_args()
        if self.config.build_extra:
            # shlex honours quoting and collapses repeated whitespace; a plain
            # split(' ') would emit empty-string arguments for "a  b" and
            # could not carry a value containing spaces.
            cmd.extend(shlex.split(self.config.build_extra))
        return cmd

    @staticmethod
    def _is_empty_dir(path: str) -> bool:
        """Return True when *path* is a directory with no entries."""
        return os.path.isdir(path) and not os.listdir(path)

    @staticmethod
    def _run(cmd: List[str], action: str) -> None:
        """Run *cmd* to completion; raise RuntimeError on a non-zero exit."""
        p = subprocess.Popen(cmd)
        returncode = p.wait()
        if returncode != 0:
            raise RuntimeError(f"{action} failed, exit code {returncode}")

    def build(self):
        """Build the engine unless one is already present.

        An existing, non-empty ``engine_dir`` is treated as a finished build
        and skipped. An existing but empty directory (for example a freshly
        mounted cache volume) holds no engine, so the build still runs.

        Raises:
            RuntimeError: If the build process exits with a non-zero code.
        """
        engine_dir = self.config.engine_dir
        if os.path.exists(engine_dir) and not self._is_empty_dir(engine_dir):
            print(f"engine path {engine_dir} exists")
            return
        cmd = self.build_command()
        print(f"build command: {cmd}")
        self._run(cmd, "build")
        print("build success")

    @staticmethod
    def serve_module():
        """Return the module that is run with ``python -m`` to serve."""
        return 'xtrt_llm.vllm.entrypoints.openai.api_server'

    def serve_command(self) -> List[str]:
        """Assemble the API-server command line.

        Arguments given to this script (``sys.argv[1:]``) are appended last.
        """
        cmd = [
            sys.executable,
            '-m',
            self.serve_module(),
            '--port',
            # Popen only accepts strings; tolerate an int port on the config.
            str(self.config.port),
            '--model',
            self.config.model_path,
            '--engine_dir',
            self.config.engine_dir,
            '--trust-remote-code',
            '--tensor-parallel-size',
            str(self.config.num_gpus),
            '--dtype',
            'float16',
        ]
        if self.config.model_name:
            cmd.extend(['--served-model-name', self.config.model_name])
        cmd.extend(sys.argv[1:])
        return cmd

    def serve(self):
        """Run the API server and block until it exits.

        Raises:
            RuntimeError: If the server process exits with a non-zero code.
        """
        cmd = self.serve_command()
        print(f"serve command: {cmd}")
        self._run(cmd, "serve")
class QWenRunner(ModelRunner):
    """Runner that builds engines with the ``qwen/build.py`` script."""

    def build_script(self):
        """Return the path of the Qwen build script under the script root."""
        return "{}/qwen/build.py".format(self.config.script_root)

    def build_args(self):
        """Return the argument list for the Qwen build script.

        ``--version 1.5`` is added when the model type is ``"qwen2"``, and
        the weight-only flags are added when a precision is configured.
        """
        cfg = self.config
        tp = str(cfg.num_gpus)
        # Flags are grouped with their values; --parallel_build and
        # --use_parallel_embedding are not passed for this model family.
        args = [
            '--hf_model_dir', cfg.model_path,
            '--output_dir', cfg.engine_dir,
            '--dtype', 'float16',
            '--use_gpt_attention_plugin', 'float16',
            '--world_size', tp,
            '--tp_size', tp,
            '--remove_input_padding',
            '--opt_memory_use',
            '--paged_kv_cache',
            '--tokens_per_block', '64',
        ]
        if cfg.model_config.model_type() == "qwen2":
            # only support 1.5
            args += ["--version", "1.5"]
        precision = cfg.weight_only_precision
        if precision:
            args += ['--use_weight_only', '--weight_only_precision', precision]
        return args
if __name__ == '__main__':
    # Handle -h/--help first: check_args() forwards it to the API server
    # module and exits, so no model configuration is loaded for a help call.
    check_args()
    # Select the runner class from the model's `model_type`.
    runner = new_runner()
    # Build the engine (build() returns early when one is already present),
    # then start the server and block until it exits.
    runner.build()
    runner.serve()