support models from www.modelscope.cn (#994)
Co-authored-by: mulin.lyh <mulin.lyh@taobao.com>

README.md (+10)
@@ -168,6 +168,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
- To enable fp8 quantization, you can add `--quantization fp8` on an fp16 checkpoint or directly load an fp8 checkpoint without specifying any arguments.
- To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.

### Use Models From ModelScope
To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable `SGLANG_USE_MODELSCOPE`.
```
export SGLANG_USE_MODELSCOPE=true
```
Launch the [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) server:
```
SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
```
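Once launched this way, the server behaves like any other sglang deployment; ModelScope only changes where the weights are fetched from. A minimal client sketch, assuming sglang's native `/generate` endpoint and its default JSON schema on the port used above (the prompt and sampling parameters are illustrative):

```
import requests

# Ask the freshly launched Qwen2-7B-Instruct server for a completion.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"max_new_tokens": 32, "temperature": 0},
    },
)
print(response.json()["text"])
```
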
### Supported Models
- Llama / Llama 2 / Llama 3 / Llama 3.1

@@ -74,6 +74,8 @@ from sglang.srt.utils import (
    enable_show_time_cost,
    kill_child_process,
    maybe_set_triton_cache_manager,
    prepare_model,
    prepare_tokenizer,
    set_ulimit,
)
from sglang.utils import get_exception_traceback

@@ -250,6 +252,10 @@ def launch_server(
    )
    logger.info(f"{server_args=}")

    # If the model comes from www.modelscope.cn, download it first.
    server_args.model_path = prepare_model(server_args.model_path)
    server_args.tokenizer_path = prepare_tokenizer(server_args.tokenizer_path)

    # Launch processes for multi-node tensor parallelism
    if server_args.nnodes > 1:
        if server_args.node_rank != 0:

@@ -701,3 +701,23 @@ def add_api_key_middleware(app, api_key):
        if request.headers.get("Authorization") != "Bearer " + api_key:
            return JSONResponse(content={"error": "Unauthorized"}, status_code=401)
        return await call_next(request)


def prepare_model(model_path):
    if "SGLANG_USE_MODELSCOPE" in os.environ:
        if not os.path.exists(model_path):
            from modelscope import snapshot_download

            return snapshot_download(model_path)
    return model_path


def prepare_tokenizer(tokenizer_path):
    if "SGLANG_USE_MODELSCOPE" in os.environ:
        if not os.path.exists(tokenizer_path):
            from modelscope import snapshot_download

            return snapshot_download(
                tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
            )
    return tokenizer_path
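Taken together, these helpers are no-ops unless `SGLANG_USE_MODELSCOPE` is set and the given path does not already exist on disk, so local checkpoints keep working unchanged. A minimal sketch of the intended behavior (the model ID and local path below are illustrative, not part of the commit):

```
import os

from sglang.srt.utils import prepare_model, prepare_tokenizer

os.environ["SGLANG_USE_MODELSCOPE"] = "true"

# A remote model ID triggers a ModelScope download and returns the local
# cache path (for a real model this pulls the full weights).
model_path = prepare_model("qwen/Qwen2-7B-Instruct")

# The tokenizer variant skips weight files via ignore_patterns, fetching
# only the config/tokenizer artifacts.
tokenizer_path = prepare_tokenizer("qwen/Qwen2-7B-Instruct")

# A path that already exists locally is returned as-is; no download occurs.
# prepare_model("/path/to/local/checkpoint")  # hypothetical local directory
```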

@@ -10,6 +10,7 @@ suites = {
    "test_vision_openai_server.py",
    "test_chunked_prefill.py",
    "test_torch_compile.py",
    "test_models_from_modelscope.py",
    "models/test_generation_models.py",
    "models/test_embedding_models.py",
    "sampling/penaltylib",

test/srt/test_models_from_modelscope.py (new file, +47)
@@ -0,0 +1,47 @@
import os
import shutil
import subprocess
import unittest
from unittest import mock

from sglang.srt.utils import prepare_model, prepare_tokenizer


class TestDownloadFromModelScope(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        cls.model = "iic/nlp_lstmcrf_word-segmentation_chinese-news"
        stat, output = subprocess.getstatusoutput("pip install modelscope")

        cls.with_modelscope_environ = {k: v for k, v in os.environ.items()}
        cls.with_modelscope_environ["SGLANG_USE_MODELSCOPE"] = "True"

    @classmethod
    def tearDownClass(cls):
        pass

    def test_prepare_model(self):
        from modelscope.utils.file_utils import get_model_cache_root

        model_cache_root = get_model_cache_root()
        if os.path.exists(model_cache_root):
            shutil.rmtree(model_cache_root)
        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
            model_path = prepare_model(self.model)
            assert os.path.exists(os.path.join(model_path, "pytorch_model.bin"))

    def test_prepare_tokenizer(self):
        from modelscope.utils.file_utils import get_model_cache_root

        model_cache_root = get_model_cache_root()
        if os.path.exists(model_cache_root):
            shutil.rmtree(model_cache_root)
        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
            tokenizer_path = prepare_tokenizer(self.model)
            assert not os.path.exists(os.path.join(tokenizer_path, "pytorch_model.bin"))
            assert os.path.exists(os.path.join(tokenizer_path, "config.json"))


if __name__ == "__main__":
    unittest.main(warnings="ignore")
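The tests wrap each call in `mock.patch.dict(os.environ, ..., clear=True)` so the ModelScope flag is visible only inside the `with` block and the developer's real environment is restored afterwards. The same isolation pattern, reduced to its essentials:

```
import os
from unittest import mock

# Copy the current environment and add the flag, as setUpClass does above.
env = {**os.environ, "SGLANG_USE_MODELSCOPE": "True"}

with mock.patch.dict(os.environ, env, clear=True):
    # Inside the block, prepare_model()/prepare_tokenizer() would see the flag.
    assert os.environ["SGLANG_USE_MODELSCOPE"] == "True"

# After the block, os.environ is exactly what it was before.
```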