Support Alibaba-NLP/gte-Qwen2-7B-instruct embedding Model (#1186)

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
2024-08-26 01:29:12 +08:00
parent 66e7dcaf70
commit 30b4f771b0
15 changed files with 167 additions and 55 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -38,6 +38,7 @@ class ServerArgs:
    quantization: Optional[str] = None
    served_model_name: Optional[str] = None
    chat_template: Optional[str] = None
+    is_embedding: bool = False

    # Port
    host: str = "127.0.0.1"
@@ -200,6 +201,11 @@ class ServerArgs:
            action="store_true",
            help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
        )
+        parser.add_argument(
+            "--is-embedding",
+            action="store_true",
+            help="Whether to use a CausalLM as an embedding model.",
+        )
        parser.add_argument(
            "--context-length",
            type=int,
@@ -458,6 +464,11 @@ class ServerArgs:
        assert not (
            self.dp_size > 1 and self.node_rank is not None
        ), "multi-node data parallel is not supported"
+        if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
+            logger.info(
+                "Not sure why, the tokenizer will add an additional token at the end of the prompt when trust_remote_mode=True"
+            )
+            self.trust_remote_code = False
        if "gemma-2" in self.model_path.lower():
            logger.info("When using sliding window in gemma-2, turn on flashinfer.")
            self.disable_flashinfer = False