From b91a4cb1b1c21b94ca74f4e75305e4b26673dc5b Mon Sep 17 00:00:00 2001
From: liuyhwangyh
Date: Fri, 9 Aug 2024 17:52:14 +0800
Subject: [PATCH] support models from www.modelscope.cn (#994)

Co-authored-by: mulin.lyh
---
 README.md                               | 10 ++++++
 python/sglang/srt/server.py             |  6 ++++
 python/sglang/srt/utils.py              | 20 +++++++++++
 test/srt/run_suite.py                   |  1 +
 test/srt/test_models_from_modelscope.py | 47 +++++++++++++++++++++++++
 5 files changed, 84 insertions(+)
 create mode 100644 test/srt/test_models_from_modelscope.py

diff --git a/README.md b/README.md
index b7057401d..9be13509f 100644
--- a/README.md
+++ b/README.md
@@ -168,6 +168,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
+### Use Models From ModelScope
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE:
+```
+export SGLANG_USE_MODELSCOPE=true
+```
+Launch a server with [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct):
+```
+SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```
+
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index ee84a99e4..d6e3f31ec 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -74,6 +74,8 @@ from sglang.srt.utils import (
     enable_show_time_cost,
     kill_child_process,
     maybe_set_triton_cache_manager,
+    prepare_model,
+    prepare_tokenizer,
     set_ulimit,
 )
 from sglang.utils import get_exception_traceback
@@ -250,6 +252,10 @@ def launch_server(
     )
     logger.info(f"{server_args=}")
 
+    # If the model comes from www.modelscope.cn, download it first.
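+    # prepare_model and prepare_tokenizer (defined in python/sglang/srt/utils.py
+    # below) return the given path unchanged unless SGLANG_USE_MODELSCOPE is set.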
+    server_args.model_path = prepare_model(server_args.model_path)
+    server_args.tokenizer_path = prepare_tokenizer(server_args.tokenizer_path)
+
     # Launch processes for multi-node tensor parallelism
     if server_args.nnodes > 1:
         if server_args.node_rank != 0:
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 525ae8ca7..dd41156f3 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -701,3 +701,23 @@ def add_api_key_middleware(app, api_key):
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return JSONResponse(content={"error": "Unauthorized"}, status_code=401)
         return await call_next(request)
+
+
+def prepare_model(model_path):
+    if "SGLANG_USE_MODELSCOPE" in os.environ:
+        if not os.path.exists(model_path):
+            from modelscope import snapshot_download
+
+            return snapshot_download(model_path)
+    return model_path
+
+
+def prepare_tokenizer(tokenizer_path):
+    if "SGLANG_USE_MODELSCOPE" in os.environ:
+        if not os.path.exists(tokenizer_path):
+            from modelscope import snapshot_download
+
+            return snapshot_download(
+                tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
+            )
+    return tokenizer_path
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 67d772b30..d5051ffc1 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -10,6 +10,7 @@ suites = {
         "test_vision_openai_server.py",
         "test_chunked_prefill.py",
         "test_torch_compile.py",
+        "test_models_from_modelscope.py",
         "models/test_generation_models.py",
         "models/test_embedding_models.py",
         "sampling/penaltylib",
diff --git a/test/srt/test_models_from_modelscope.py b/test/srt/test_models_from_modelscope.py
new file mode 100644
index 000000000..2313053b9
--- /dev/null
+++ b/test/srt/test_models_from_modelscope.py
@@ -0,0 +1,47 @@
+import os
+import shutil
+import subprocess
+import unittest
+from unittest import mock
+
+from sglang.srt.utils import prepare_model, prepare_tokenizer
+
+
+class TestDownloadFromModelScope(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "iic/nlp_lstmcrf_word-segmentation_chinese-news"
+        # Make sure the modelscope package is available for snapshot_download.
+        stat, output = subprocess.getstatusoutput("pip install modelscope")
+
+        # Copy of the current environment with ModelScope downloads enabled.
+        cls.with_modelscope_environ = os.environ.copy()
+        cls.with_modelscope_environ["SGLANG_USE_MODELSCOPE"] = "True"
+
+    @classmethod
+    def tearDownClass(cls):
+        pass
+
+    def test_prepare_model(self):
+        from modelscope.utils.file_utils import get_model_cache_root
+
+        # Clear the ModelScope cache so the download actually happens.
+        model_cache_root = get_model_cache_root()
+        if os.path.exists(model_cache_root):
+            shutil.rmtree(model_cache_root)
+        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
+            model_path = prepare_model(self.model)
+            assert os.path.exists(os.path.join(model_path, "pytorch_model.bin"))
+
+    def test_prepare_tokenizer(self):
+        from modelscope.utils.file_utils import get_model_cache_root
+
+        model_cache_root = get_model_cache_root()
+        if os.path.exists(model_cache_root):
+            shutil.rmtree(model_cache_root)
+        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
+            tokenizer_path = prepare_tokenizer(self.model)
+            # Weight files are skipped for the tokenizer; configs must be present.
+            assert not os.path.exists(os.path.join(tokenizer_path, "pytorch_model.bin"))
+            assert os.path.exists(os.path.join(tokenizer_path, "config.json"))
+
+
+if __name__ == "__main__":
+    unittest.main(warnings="ignore")
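
As a quick sanity check of the new helpers outside the server, a minimal sketch (the model id is only an example; any ModelScope repo id works, and modelscope must be installed):

```python
import os

from sglang.srt.utils import prepare_model, prepare_tokenizer

# When SGLANG_USE_MODELSCOPE is set and the given path does not exist locally,
# the helpers treat it as a ModelScope model id and download a snapshot.
os.environ["SGLANG_USE_MODELSCOPE"] = "true"

model_dir = prepare_model("qwen/Qwen2-7B-Instruct")  # full snapshot
tokenizer_dir = prepare_tokenizer("qwen/Qwen2-7B-Instruct")  # skips *.bin / *.safetensors

print(model_dir)      # local cache directory containing the weights
print(tokenizer_dir)  # local cache directory without weight files
```

Without SGLANG_USE_MODELSCOPE in the environment, both calls return the path unchanged, so Hugging Face model ids keep working as before.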