This commit is contained in:
2025-08-06 10:30:31 +08:00
commit dfb6b3ae16
8 changed files with 353 additions and 0 deletions

4
.dockerignore Normal file
View File

@@ -0,0 +1,4 @@
.git
__pycache__/
Dockerfile
Dockerfile.xtrt-llm

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
__pycache__/

12
Dockerfile Normal file
View File

@@ -0,0 +1,12 @@
# Base image with the kunlunxin xtrt-llm runtime preinstalled.
ARG BASE_IMAGE=harbor.4pd.io/mic-llm-x/kunlunxin-xtrt-llm:0.5.3.2
FROM $BASE_IMAGE

# PyPI mirror, defaults to the Tsinghua mirror for faster installs in CN.
ARG PYPI_MIRROR=https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip config set global.index-url $PYPI_MIRROR

# COPY is preferred over ADD for plain local files (no tar/URL semantics).
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY llm_utils.py main.py ./

ENTRYPOINT ["python3", "main.py"]

12
Dockerfile.xtrt-llm Normal file
View File

@@ -0,0 +1,12 @@
# Base image: XTCL toolchain on Ubuntu 20.04.
ARG BASE_IMAGE=xtcl_ubuntu2004:v4.6
FROM $BASE_IMAGE

WORKDIR /workspace
# COPY is preferred over ADD for local build-context files.
COPY . .

# PyPI mirror, defaults to the Tsinghua mirror for faster installs in CN.
ARG PYPI_MIRROR=https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip config set global.index-url $PYPI_MIRROR

# Install the release artifacts, then remove the installers to slim the image.
RUN bash scripts/install_release.sh && \
    rm -rf *.whl *.run downloads

ENV PATH=/usr/local/cuda-11.7/bin:/home/pt201/bin:$PATH

46
README.md Normal file
View File

@@ -0,0 +1,46 @@
# kunlunxin
适配 kunlunxin 昆仑芯R200-8F加速卡的大模型推理服务镜像
## 启动
### 使用docker方式启动
```bash
docker run -it --rm \
--net=host \
-v /mnt/disk0/models/model-qwen1-5-72b-chat/:/model \
-e MODEL_NAME=qwen1.5-72b \
-e NUM_GPUs=4 \
-e WEIGHT_ONLY_PRECISION=int8 \
--device /dev/xpuctrl \
--device /dev/xpu0 \
--device /dev/xpu1 \
--device /dev/xpu2 \
--device /dev/xpu3 \
    harbor.4pd.io/mic-llm-x/slx-infer-kunlunxin:release-0.1-pipe-1-commit-cd30b38d
```
### 参数说明
#### 环境变量
- MODEL_PATH: 模型在容器中的路径,默认为 `/model`
- MODEL_NAME: 模型名字,用于 api 接口中
- PORT: 端口,默认 `80`
- BUILD_SCRIPT_ROOT: 编译脚本目录,一般不需要修改
- WEIGHT_ONLY_PRECISION: 量化权重的精度,`int8` 或 `int4`
- ENGINE_DIR: 编译后的模型存储路径,默认 `./xtrt_engine`
- BUILD_EXTRA: 编译用到的额外参数
#### 参数
基本与 vllm 相同,可以使用 `--help` 查看。
由于后端的 engine 使用的是 xtrt 的 engine,相关的参数无效或会造成未知的结果,所以不建议修改相关参数。

33
llm_utils.py Normal file
View File

@@ -0,0 +1,33 @@
from transformers import PretrainedConfig
class ModelConfig:
    """Thin wrapper over a HuggingFace model config directory."""

    # Config attributes that may carry the maximum sequence length,
    # depending on the model family.
    _LENGTH_KEYS = (
        "max_position_embeddings",  # OPT
        "n_positions",              # GPT-2
        "max_seq_len",              # MPT
        "seq_length",               # ChatGLM2
        "max_sequence_length",      # others
        "max_seq_length",
        "seq_len",
    )

    def __init__(self, model_path: str):
        # Loads config.json from the model directory.
        self.hf_config = PretrainedConfig.from_pretrained(model_path)

    def model_type(self):
        """Return the HF `model_type` string (e.g. "qwen2", "llama")."""
        return self.hf_config.model_type

    def max_model_len(self):
        """Return the smallest declared max sequence length, or None when
        the config declares none of the known length attributes."""
        found = [
            length
            for key in self._LENGTH_KEYS
            if (length := getattr(self.hf_config, key, None)) is not None
        ]
        return min(found) if found else None

242
main.py Normal file
View File

@@ -0,0 +1,242 @@
import os
import sys
# import copy
import subprocess
from abc import ABC, abstractmethod
from typing import List
from llm_utils import ModelConfig
# from xtrt_llm.vllm.entrypoints.openai.api_server import parse_args
class Config:
    """Runtime configuration assembled from environment variables."""

    def __init__(self):
        env = os.getenv
        # Where the model weights are mounted inside the container.
        self.model_path = env("MODEL_PATH", "/model")
        # Name exposed through the OpenAI-compatible API (optional).
        self.model_name = env("MODEL_NAME")
        # NOTE: the variable really is spelled "NUM_GPUs" (see README).
        self.num_gpus = int(env("NUM_GPUs", "1"))
        self.port = env("PORT", "80")
        # Directory holding the per-model build.py scripts.
        self.script_root = env("BUILD_SCRIPT_ROOT", "examples")
        # "int8" / "int4"; unset disables weight-only quantization.
        self.weight_only_precision = env("WEIGHT_ONLY_PRECISION")
        self.engine_dir = env("ENGINE_DIR", "./xtrt_engine")
        # Extra args appended verbatim to the build command.
        self.build_extra = env("BUILD_EXTRA")
        self.model_config = ModelConfig(self.model_path)
class ModelRunner(ABC):
    """Base class: builds an xtrt engine for a model, then serves it."""

    def __init__(self, config):
        self.config = config

    @abstractmethod
    def build_script(self) -> str:
        """Path of the model-specific build.py script."""
        raise NotImplementedError()

    @abstractmethod
    def build_args(self) -> List[str]:
        """Arguments passed to the build script."""

    def build_command(self) -> List[str]:
        """Return the full engine-build command line."""
        cmd = [
            sys.executable,
            self.build_script()
        ] + self.build_args()
        if self.config.build_extra:
            # str.split() with no argument collapses runs of whitespace,
            # so BUILD_EXTRA with consecutive spaces no longer injects
            # empty arguments (split(' ') did).
            cmd.extend(self.config.build_extra.split())
        return cmd

    def build(self):
        """Build the engine unless engine_dir already exists.

        Raises RuntimeError when the build subprocess exits non-zero.
        """
        if os.path.exists(self.config.engine_dir):
            print(f"engine path {self.config.engine_dir} exists")
            return
        cmd = self.build_command()
        print(f"build command: {cmd}")
        # subprocess.run waits for completion; equivalent to Popen+wait.
        proc = subprocess.run(cmd)
        if proc.returncode != 0:
            raise RuntimeError(f"build failed, exit code {proc.returncode}")
        print("build success")

    @staticmethod
    def serve_module():
        """Dotted path of the OpenAI-compatible API server module."""
        return 'xtrt_llm.vllm.entrypoints.openai.api_server'

    def serve_command(self) -> List[str]:
        """Return the serve command; forwards extra CLI args (sys.argv[1:])."""
        cmd = [
            sys.executable,
            '-m',
            self.serve_module(),
            '--port',
            self.config.port,
            '--model',
            self.config.model_path,
            '--engine_dir',
            self.config.engine_dir,
            '--trust-remote-code',
            '--tensor-parallel-size',
            str(self.config.num_gpus),
            '--dtype',
            'float16',
        ]
        if self.config.model_name:
            cmd.extend(['--served-model-name', self.config.model_name])
        cmd.extend(sys.argv[1:])
        return cmd

    def serve(self):
        """Run the API server.

        Raises RuntimeError when the server subprocess exits non-zero.
        """
        cmd = self.serve_command()
        print(f"serve command: {cmd}")
        proc = subprocess.run(cmd)
        if proc.returncode != 0:
            raise RuntimeError(f"serve failed, exit code {proc.returncode}")
class ChatGLMRunner(ModelRunner):
    """Engine builder for ChatGLM-family models."""

    def __init__(self, config):
        super().__init__(config)
        # Model name understood by the chatglm build script (used in
        # build args, not in the serving API).
        self.build_model_name = os.getenv("BUILD_MODEL_NAME", "chatglm3_6b")

    def build_script(self):
        return f"{self.config.script_root}/chatglm/build.py"

    def build_args(self):
        tp = str(self.config.num_gpus)
        return [
            '--model_dir', self.config.model_path,
            '--output_dir', self.config.engine_dir,
            '--model_name', self.build_model_name,
            '--dtype', 'float16',
            '--use_gpt_attention_plugin', 'float16',
            '--remove_input_padding',
            '--paged_kv_cache',
            '--world_size', tp,
            '--tp_size', tp,
            # '--parallel_build',
        ]
class LlamaRunner(ModelRunner):
    """Engine builder for Llama-family models."""

    def build_script(self):
        return f"{self.config.script_root}/llama/build.py"

    def build_args(self):
        tp = str(self.config.num_gpus)
        args = [
            '--model_dir', self.config.model_path,
            '--output_dir', self.config.engine_dir,
            '--dtype', 'float16',
            '--use_gpt_attention_plugin', 'float16',
            '--world_size', tp,
            '--tp_size', tp,
            # '--parallel_build',
            '--use_parallel_embedding',
            '--remove_input_padding',
            '--opt_memory_use',
            '--paged_kv_cache',
            '--tokens_per_block', '64',
        ]
        precision = self.config.weight_only_precision
        if precision:
            # Enable weight-only quantization at the requested precision.
            args += ['--use_weight_only', '--weight_only_precision', precision]
        return args
class QWenRunner(ModelRunner):
    """Engine builder for Qwen / Qwen1.5 models."""

    def build_script(self):
        return f"{self.config.script_root}/qwen/build.py"

    def build_args(self):
        tp = str(self.config.num_gpus)
        args = [
            '--hf_model_dir', self.config.model_path,
            '--output_dir', self.config.engine_dir,
            '--dtype', 'float16',
            '--use_gpt_attention_plugin', 'float16',
            '--world_size', tp,
            '--tp_size', tp,
            # '--parallel_build',
            # '--use_parallel_embedding',
            '--remove_input_padding',
            '--opt_memory_use',
            '--paged_kv_cache',
            '--tokens_per_block', '64',
        ]
        if self.config.model_config.model_type() == "qwen2":
            # HF "qwen2" config type means Qwen1.5; only 1.5 is supported.
            args += ["--version", "1.5"]
        precision = self.config.weight_only_precision
        if precision:
            # Enable weight-only quantization at the requested precision.
            args += ['--use_weight_only', '--weight_only_precision', precision]
        return args
# Maps the HF config `model_type` to the runner implementation for it.
runners = {
    "chatglm": ChatGLMRunner,
    "llama": LlamaRunner,
    "qwen": QWenRunner,
    "qwen2": QWenRunner,
}


def new_runner() -> ModelRunner:
    """Instantiate the runner matching the loaded model's `model_type`.

    Raises RuntimeError for model types with no registered runner.
    """
    config = Config()
    model_type = config.model_config.model_type()
    if model_type not in runners:
        raise RuntimeError(f"model type {model_type} unsupported")
    return runners[model_type](config)
def check_args():
    """If -h/--help was requested, delegate to the server module's help
    output and exit with its return code; otherwise do nothing."""
    cli_args = sys.argv[1:]
    if '-h' not in cli_args and '--help' not in cli_args:
        return
    help_cmd = [sys.executable, '-m', ModelRunner.serve_module(), '--help']
    proc = subprocess.Popen(help_cmd)
    proc.wait()
    sys.exit(proc.returncode)
if __name__ == '__main__':
    # Handle -h/--help before touching the model config.
    check_args()
    # Pick a runner from the model's config, build the engine if it is
    # not already present, then start the API server (blocks).
    runner = new_runner()
    runner.build()
    runner.serve()

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
# for qwen1.5
transformers==4.37.1
accelerate==0.21.0