From b91a4cb1b1c21b94ca74f4e75305e4b26673dc5b Mon Sep 17 00:00:00 2001
From: liuyhwangyh
Date: Fri, 9 Aug 2024 17:52:14 +0800
Subject: [PATCH] support models from www.modelscope.cn (#994)

Co-authored-by: mulin.lyh
---
 README.md                               | 10 ++++++
 python/sglang/srt/server.py             |  6 ++++
 python/sglang/srt/utils.py              | 20 +++++++++++
 test/srt/run_suite.py                   |  1 +
 test/srt/test_models_from_modelscope.py | 47 +++++++++++++++++++++++++
 5 files changed, 84 insertions(+)
 create mode 100644 test/srt/test_models_from_modelscope.py

diff --git a/README.md b/README.md
index b7057401d..9be13509f 100644
--- a/README.md
+++ b/README.md
@@ -168,6 +168,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
+### Use Models From ModelScope
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE:
+```
+export SGLANG_USE_MODELSCOPE=true
+```
+Launch a server with [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct):
+```
+SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```
+
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index ee84a99e4..d6e3f31ec 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -74,6 +74,8 @@ from sglang.srt.utils import (
     enable_show_time_cost,
     kill_child_process,
     maybe_set_triton_cache_manager,
+    prepare_model,
+    prepare_tokenizer,
     set_ulimit,
 )
 from sglang.utils import get_exception_traceback
@@ -250,6 +252,10 @@ def launch_server(
     )
     logger.info(f"{server_args=}")
 
+    # If the model comes from www.modelscope.cn, download it first.
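+    # prepare_model and prepare_tokenizer (defined in python/sglang/srt/utils.py
+    # below) return the given path unchanged unless SGLANG_USE_MODELSCOPE is set.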
+    server_args.model_path = prepare_model(server_args.model_path)
+    server_args.tokenizer_path = prepare_tokenizer(server_args.tokenizer_path)
+
     # Launch processes for multi-node tensor parallelism
     if server_args.nnodes > 1:
         if server_args.node_rank != 0:
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 525ae8ca7..dd41156f3 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -701,3 +701,23 @@ def add_api_key_middleware(app, api_key):
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return JSONResponse(content={"error": "Unauthorized"}, status_code=401)
         return await call_next(request)
+
+
+def prepare_model(model_path):
+    if "SGLANG_USE_MODELSCOPE" in os.environ:
+        if not os.path.exists(model_path):
+            from modelscope import snapshot_download
+
+            return snapshot_download(model_path)
+    return model_path
+
+
+def prepare_tokenizer(tokenizer_path):
+    if "SGLANG_USE_MODELSCOPE" in os.environ:
+        if not os.path.exists(tokenizer_path):
+            from modelscope import snapshot_download
+
+            return snapshot_download(
+                tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
+            )
+    return tokenizer_path
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 67d772b30..d5051ffc1 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -10,6 +10,7 @@ suites = {
         "test_vision_openai_server.py",
         "test_chunked_prefill.py",
         "test_torch_compile.py",
+        "test_models_from_modelscope.py",
         "models/test_generation_models.py",
         "models/test_embedding_models.py",
         "sampling/penaltylib",
diff --git a/test/srt/test_models_from_modelscope.py b/test/srt/test_models_from_modelscope.py
new file mode 100644
index 000000000..2313053b9
--- /dev/null
+++ b/test/srt/test_models_from_modelscope.py
@@ -0,0 +1,47 @@
+import os
+import shutil
+import subprocess
+import unittest
+from unittest import mock
+
+from sglang.srt.utils import prepare_model, prepare_tokenizer
+
+
+class TestDownloadFromModelScope(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "iic/nlp_lstmcrf_word-segmentation_chinese-news"
+        # Make sure the modelscope package is available for snapshot_download.
+        stat, output = subprocess.getstatusoutput("pip install modelscope")
+
+        # Copy of the current environment with ModelScope downloads enabled.
+        cls.with_modelscope_environ = os.environ.copy()
+        cls.with_modelscope_environ["SGLANG_USE_MODELSCOPE"] = "True"
+
+    @classmethod
+    def tearDownClass(cls):
+        pass
+
+    def test_prepare_model(self):
+        from modelscope.utils.file_utils import get_model_cache_root
+
+        # Clear the ModelScope cache so the download actually happens.
+        model_cache_root = get_model_cache_root()
+        if os.path.exists(model_cache_root):
+            shutil.rmtree(model_cache_root)
+        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
+            model_path = prepare_model(self.model)
+            assert os.path.exists(os.path.join(model_path, "pytorch_model.bin"))
+
+    def test_prepare_tokenizer(self):
+        from modelscope.utils.file_utils import get_model_cache_root
+
+        model_cache_root = get_model_cache_root()
+        if os.path.exists(model_cache_root):
+            shutil.rmtree(model_cache_root)
+        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
+            tokenizer_path = prepare_tokenizer(self.model)
+            # Weight files are skipped for the tokenizer; configs must be present.
+            assert not os.path.exists(os.path.join(tokenizer_path, "pytorch_model.bin"))
+            assert os.path.exists(os.path.join(tokenizer_path, "config.json"))
+
+
+if __name__ == "__main__":
+    unittest.main(warnings="ignore")
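
As a quick sanity check of the new helpers outside the server, a minimal sketch (the model id is only an example; any ModelScope repo id works, and modelscope must be installed):

```python
import os

from sglang.srt.utils import prepare_model, prepare_tokenizer

# When SGLANG_USE_MODELSCOPE is set and the given path does not exist locally,
# the helpers treat it as a ModelScope model id and download a snapshot.
os.environ["SGLANG_USE_MODELSCOPE"] = "true"

model_dir = prepare_model("qwen/Qwen2-7B-Instruct")  # full snapshot
tokenizer_dir = prepare_tokenizer("qwen/Qwen2-7B-Instruct")  # skips *.bin / *.safetensors

print(model_dir)      # local cache directory containing the weights
print(tokenizer_dir)  # local cache directory without weight files
```

Without SGLANG_USE_MODELSCOPE in the environment, both calls return the path unchanged, so Hugging Face model ids keep working as before.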