Make sglang compatible with vLLM 0.5.1 (#598)
This commit is contained in:
@@ -53,7 +53,11 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
|
||||
The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
|
||||
|
||||
### Common Notes
|
||||
- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
|
||||
- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by running:
|
||||
```
|
||||
pip uninstall -y triton triton-nightly
|
||||
pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
|
||||
```
|
||||
- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
|
||||
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ dependencies = [
|
||||
|
||||
[project.optional-dependencies]
|
||||
srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
|
||||
"psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.0", "outlines>=0.0.44"]
|
||||
"psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.1", "outlines>=0.0.44"]
|
||||
openai = ["openai>=1.0", "tiktoken"]
|
||||
anthropic = ["anthropic>=0.20.0"]
|
||||
litellm = ["litellm>=1.0.0"]
|
||||
|
||||
@@ -326,7 +326,7 @@ class ModelRunner:
|
||||
device_config=device_config,
|
||||
load_config=load_config,
|
||||
lora_config=None,
|
||||
vision_language_config=None,
|
||||
multimodal_config=None,
|
||||
parallel_config=None,
|
||||
scheduler_config=None,
|
||||
cache_config=None,
|
||||
|
||||
@@ -476,7 +476,7 @@ def monkey_patch_vllm_dummy_weight_loader():
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
SchedulerConfig,
|
||||
VisionLanguageConfig,
|
||||
MultiModalConfig,
|
||||
_initialize_model,
|
||||
initialize_dummy_weights,
|
||||
nn,
|
||||
@@ -489,7 +489,7 @@ def monkey_patch_vllm_dummy_weight_loader():
|
||||
model_config: ModelConfig,
|
||||
device_config: DeviceConfig,
|
||||
lora_config: Optional[LoRAConfig],
|
||||
vision_language_config: Optional[VisionLanguageConfig],
|
||||
multimodal_config: Optional[MultiModalConfig],
|
||||
parallel_config: ParallelConfig,
|
||||
scheduler_config: SchedulerConfig,
|
||||
cache_config: CacheConfig,
|
||||
@@ -500,7 +500,7 @@ def monkey_patch_vllm_dummy_weight_loader():
|
||||
model_config,
|
||||
self.load_config,
|
||||
lora_config,
|
||||
vision_language_config,
|
||||
multimodal_config,
|
||||
cache_config,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user