From bf3e271fe05f586c372d765422d2094bf0d5981c Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 7 Jun 2024 12:11:31 -0700 Subject: [PATCH] Update vllm to v0.4.3 (#511) Co-authored-by: Qubitium <417764+Qubitium@users.noreply.github.com> Co-authored-by: ZX --- python/pyproject.toml | 2 +- python/sglang/srt/managers/controller/model_runner.py | 7 ++++--- python/sglang/srt/models/commandr.py | 2 ++ python/sglang/srt/models/dbrx.py | 2 ++ python/sglang/srt/models/gemma.py | 3 ++- python/sglang/srt/models/grok.py | 2 ++ python/sglang/srt/models/llama2.py | 2 ++ python/sglang/srt/models/llava.py | 8 ++++++-- python/sglang/srt/models/llavavid.py | 2 ++ python/sglang/srt/models/mixtral.py | 2 ++ python/sglang/srt/models/mixtral_quant.py | 2 ++ python/sglang/srt/models/qwen.py | 2 ++ python/sglang/srt/models/qwen2.py | 2 ++ python/sglang/srt/models/stablelm.py | 2 ++ python/sglang/srt/models/yivl.py | 11 ++++++++--- python/sglang/srt/utils.py | 5 +++-- 16 files changed, 44 insertions(+), 12 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 59c3fe012..247c4e5cc 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ [project.optional-dependencies] srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn", - "zmq", "vllm==0.4.2", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"] + "zmq", "vllm==0.4.3", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"] openai = ["openai>=1.0", "numpy", "tiktoken"] anthropic = ["anthropic>=0.20.0", "numpy"] all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"] diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py index 6a64c84f5..0c22e2720 100644 --- a/python/sglang/srt/managers/controller/model_runner.py +++ b/python/sglang/srt/managers/controller/model_runner.py @@ -11,7 +11,7 @@ import torch import torch.nn as nn from vllm.config import DeviceConfig, LoadConfig from vllm.config import ModelConfig as VllmModelConfig -from vllm.distributed import initialize_model_parallel +from vllm.distributed import initialize_model_parallel, init_distributed_environment from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import ModelRegistry @@ -240,11 +240,11 @@ class ModelRunner: logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.") torch.cuda.set_device(self.gpu_id) logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.") - torch.distributed.init_process_group( + init_distributed_environment( backend="nccl", world_size=self.tp_size, rank=self.tp_rank, - init_method=f"tcp://127.0.0.1:{self.nccl_port}", + distributed_init_method=f"tcp://127.0.0.1:{self.nccl_port}", ) initialize_model_parallel(tensor_model_parallel_size=self.tp_size) total_gpu_memory = get_available_gpu_memory( @@ -291,6 +291,7 @@ class ModelRunner: vision_language_config=None, parallel_config=None, scheduler_config=None, + cache_config=None, ) logger.info( f"[gpu_id={self.gpu_id}] Load weight end. " diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py index 6c1cd0ea3..c08e7eb3a 100644 --- a/python/sglang/srt/models/commandr.py +++ b/python/sglang/srt/models/commandr.py @@ -30,6 +30,7 @@ import torch.utils.checkpoint from torch import nn from torch.nn.parameter import Parameter from transformers import PretrainedConfig +from vllm.config import CacheConfig from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -304,6 +305,7 @@ class CohereForCausalLM(nn.Module): self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() self.config = config diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index ad4e27199..8436386c1 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -5,6 +5,7 @@ from typing import Iterable, Optional, Tuple import torch import torch.nn as nn +from vllm.config import CacheConfig from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -352,6 +353,7 @@ class DbrxForCausalLM(nn.Module): self, config: DbrxConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ): super().__init__() self.config = config diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index 5c0b60fd6..e150a56ca 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -6,7 +6,7 @@ from typing import Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.config import LoRAConfig +from vllm.config import LoRAConfig, CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -264,6 +264,7 @@ class GemmaForCausalLM(nn.Module): config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: del lora_config # Unused. super().__init__() diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 91cab15f6..a2dcfec8e 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -11,6 +11,7 @@ from torch import nn from transformers import PretrainedConfig from vllm import _custom_ops as ops +from vllm.config import CacheConfig from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -538,6 +539,7 @@ class Grok1ModelForCausalLM(nn.Module): self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() self.config = config diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py index aa8c4752d..d0f162c31 100644 --- a/python/sglang/srt/models/llama2.py +++ b/python/sglang/srt/models/llama2.py @@ -7,6 +7,7 @@ import torch import tqdm from torch import nn from transformers import LlamaConfig +from vllm.config import CacheConfig from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size @@ -258,6 +259,7 @@ class LlamaForCausalLM(nn.Module): self, config: LlamaConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() self.config = config diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py index efcc8d91c..fad0e68b8 100644 --- a/python/sglang/srt/models/llava.py +++ b/python/sglang/srt/models/llava.py @@ -7,6 +7,7 @@ import torch from torch import nn from transformers import CLIPVisionModel, CLIPVisionConfig, LlavaConfig, Qwen2Config, MistralConfig from transformers.models.llava.modeling_llava import LlavaMultiModalProjector +from vllm.config import CacheConfig from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -27,6 +28,7 @@ class LlavaLlamaForCausalLM(nn.Module): self, config: LlavaConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() self.config = config @@ -294,8 +296,9 @@ class LlavaQwenForCausalLM(LlavaLlamaForCausalLM): self, config: LlavaConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: - super().__init__(config, quant_config=quant_config) + super().__init__(config, quant_config=quant_config, cache_config=cache_config) self.config = config self.vision_tower = None if getattr(self.config, "vision_config", None) is None: @@ -356,8 +359,9 @@ class LlavaMistralForCausalLM(LlavaLlamaForCausalLM): self, config: LlavaConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: - super().__init__(config, quant_config=quant_config) + super().__init__(config, quant_config=quant_config, cache_config=cache_config) self.config = config self.vision_tower = None if getattr(self.config, "vision_config", None) is None: diff --git a/python/sglang/srt/models/llavavid.py b/python/sglang/srt/models/llavavid.py index e79b81af1..541258811 100644 --- a/python/sglang/srt/models/llavavid.py +++ b/python/sglang/srt/models/llavavid.py @@ -7,6 +7,7 @@ import torch from torch import nn from transformers import CLIPVisionModel, LlavaConfig from transformers.models.llava.modeling_llava import LlavaMultiModalProjector +from vllm.config import CacheConfig from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -25,6 +26,7 @@ class LlavaVidForCausalLM(nn.Module): self, config: LlavaConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() self.config = config diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index f718af47f..2c14dd142 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -9,6 +9,7 @@ import torch.nn.functional as F from torch import nn from transformers import MixtralConfig from vllm import _custom_ops as ops +from vllm.config import CacheConfig from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -414,6 +415,7 @@ class MixtralForCausalLM(nn.Module): self, config: MixtralConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() self.config = config diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py index e9edf43c5..94df124e8 100644 --- a/python/sglang/srt/models/mixtral_quant.py +++ b/python/sglang/srt/models/mixtral_quant.py @@ -8,6 +8,7 @@ import torch import torch.nn.functional as F from torch import nn from transformers import MixtralConfig +from vllm.config import CacheConfig from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -313,6 +314,7 @@ class QuantMixtralForCausalLM(nn.Module): self, config: MixtralConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() self.config = config diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index bce76d53d..c2ff0aeea 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Iterable, Tuple import torch from torch import nn from transformers import PretrainedConfig +from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -227,6 +228,7 @@ class QWenLMHeadModel(nn.Module): self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ): super().__init__() self.config = config diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index f5bee35a3..5d0115522 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Tuple, Iterable import torch from torch import nn +from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -251,6 +252,7 @@ class Qwen2ForCausalLM(nn.Module): self, config: Qwen2Config, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() self.config = config diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py index 279184d8d..72fa1508d 100644 --- a/python/sglang/srt/models/stablelm.py +++ b/python/sglang/srt/models/stablelm.py @@ -7,6 +7,7 @@ from typing import Optional, Tuple, Iterable import torch from torch import nn from transformers import PretrainedConfig +from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import ( @@ -225,6 +226,7 @@ class StableLmForCausalLM(nn.Module): self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() self.config = config diff --git a/python/sglang/srt/models/yivl.py b/python/sglang/srt/models/yivl.py index 0d5d70bc7..2675502b0 100644 --- a/python/sglang/srt/models/yivl.py +++ b/python/sglang/srt/models/yivl.py @@ -1,12 +1,14 @@ """Inference-only Yi-VL model.""" -from typing import Tuple, Iterable +from typing import Tuple, Iterable, Optional import torch import torch.nn as nn from transformers import CLIPVisionModel, LlavaConfig +from vllm.config import CacheConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from sglang.srt.models.llava import ( LlavaLlamaForCausalLM, monkey_path_clip_vision_embed_forward, @@ -15,9 +17,12 @@ from sglang.srt.models.llava import ( class YiVLForCausalLM(LlavaLlamaForCausalLM): def __init__( - self, config, quant_config = None, + self, + config: LlavaConfig, + quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, ) -> None: - super().__init__(config, quant_config) + super().__init__(config, quant_config, cache_config) self.multi_modal_projector = YiVLMultiModalProjector(self.config) self.vision_tower_subfolder = self.config.mm_vision_tower.replace( diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 5c32fd65d..272c2beac 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -421,9 +421,10 @@ def suppress_other_loggers(): from vllm.logger import logger as vllm_default_logger vllm_default_logger.setLevel(logging.WARN) - logging.getLogger("vllm.utils").setLevel(logging.WARN) - logging.getLogger("vllm.selector").setLevel(logging.WARN) logging.getLogger("vllm.config").setLevel(logging.ERROR) + logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(logging.WARN) + logging.getLogger("vllm.selector").setLevel(logging.WARN) + logging.getLogger("vllm.utils").setLevel(logging.WARN) def assert_pkg_version(pkg: str, min_version: str):