Update vllm to v0.4.3 (#511)

Co-authored-by: Qubitium <417764+Qubitium@users.noreply.github.com>
Co-authored-by: ZX <zx@lbx.dev>
This commit is contained in:
Lianmin Zheng
2024-06-07 12:11:31 -07:00
committed by GitHub
parent 3bc01ac137
commit bf3e271fe0
16 changed files with 44 additions and 12 deletions

View File

@@ -20,7 +20,7 @@ dependencies = [
[project.optional-dependencies] [project.optional-dependencies]
srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn", srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
"zmq", "vllm==0.4.2", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"] "zmq", "vllm==0.4.3", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
openai = ["openai>=1.0", "numpy", "tiktoken"] openai = ["openai>=1.0", "numpy", "tiktoken"]
anthropic = ["anthropic>=0.20.0", "numpy"] anthropic = ["anthropic>=0.20.0", "numpy"]
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"] all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]

View File

@@ -11,7 +11,7 @@ import torch
import torch.nn as nn import torch.nn as nn
from vllm.config import DeviceConfig, LoadConfig from vllm.config import DeviceConfig, LoadConfig
from vllm.config import ModelConfig as VllmModelConfig from vllm.config import ModelConfig as VllmModelConfig
from vllm.distributed import initialize_model_parallel from vllm.distributed import initialize_model_parallel, init_distributed_environment
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models import ModelRegistry
@@ -240,11 +240,11 @@ class ModelRunner:
logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.") logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.")
torch.cuda.set_device(self.gpu_id) torch.cuda.set_device(self.gpu_id)
logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.") logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
torch.distributed.init_process_group( init_distributed_environment(
backend="nccl", backend="nccl",
world_size=self.tp_size, world_size=self.tp_size,
rank=self.tp_rank, rank=self.tp_rank,
init_method=f"tcp://127.0.0.1:{self.nccl_port}", distributed_init_method=f"tcp://127.0.0.1:{self.nccl_port}",
) )
initialize_model_parallel(tensor_model_parallel_size=self.tp_size) initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
total_gpu_memory = get_available_gpu_memory( total_gpu_memory = get_available_gpu_memory(
@@ -291,6 +291,7 @@ class ModelRunner:
vision_language_config=None, vision_language_config=None,
parallel_config=None, parallel_config=None,
scheduler_config=None, scheduler_config=None,
cache_config=None,
) )
logger.info( logger.info(
f"[gpu_id={self.gpu_id}] Load weight end. " f"[gpu_id={self.gpu_id}] Load weight end. "

View File

@@ -30,6 +30,7 @@ import torch.utils.checkpoint
from torch import nn from torch import nn
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import CacheConfig
from vllm.distributed import ( from vllm.distributed import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
@@ -304,6 +305,7 @@ class CohereForCausalLM(nn.Module):
self, self,
config: PretrainedConfig, config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.config = config self.config = config

View File

@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Tuple
import torch import torch
import torch.nn as nn import torch.nn as nn
from vllm.config import CacheConfig
from vllm.distributed import ( from vllm.distributed import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
@@ -352,6 +353,7 @@ class DbrxForCausalLM(nn.Module):
self, self,
config: DbrxConfig, config: DbrxConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
): ):
super().__init__() super().__init__()
self.config = config self.config = config

View File

@@ -6,7 +6,7 @@ from typing import Iterable, Optional, Tuple
import torch import torch
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import LoRAConfig from vllm.config import LoRAConfig, CacheConfig
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.activation import GeluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
@@ -264,6 +264,7 @@ class GemmaForCausalLM(nn.Module):
config: PretrainedConfig, config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None, lora_config: Optional[LoRAConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
del lora_config # Unused. del lora_config # Unused.
super().__init__() super().__init__()

View File

@@ -11,6 +11,7 @@ from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import CacheConfig
from vllm.distributed import ( from vllm.distributed import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
@@ -538,6 +539,7 @@ class Grok1ModelForCausalLM(nn.Module):
self, self,
config: PretrainedConfig, config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.config = config self.config = config

View File

@@ -7,6 +7,7 @@ import torch
import tqdm import tqdm
from torch import nn from torch import nn
from transformers import LlamaConfig from transformers import LlamaConfig
from vllm.config import CacheConfig
from vllm.distributed import ( from vllm.distributed import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size get_tensor_model_parallel_world_size
@@ -258,6 +259,7 @@ class LlamaForCausalLM(nn.Module):
self, self,
config: LlamaConfig, config: LlamaConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.config = config self.config = config

View File

@@ -7,6 +7,7 @@ import torch
from torch import nn from torch import nn
from transformers import CLIPVisionModel, CLIPVisionConfig, LlavaConfig, Qwen2Config, MistralConfig from transformers import CLIPVisionModel, CLIPVisionConfig, LlavaConfig, Qwen2Config, MistralConfig
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
from vllm.config import CacheConfig
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -27,6 +28,7 @@ class LlavaLlamaForCausalLM(nn.Module):
self, self,
config: LlavaConfig, config: LlavaConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.config = config self.config = config
@@ -294,8 +296,9 @@ class LlavaQwenForCausalLM(LlavaLlamaForCausalLM):
self, self,
config: LlavaConfig, config: LlavaConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__(config, quant_config=quant_config) super().__init__(config, quant_config=quant_config, cache_config=cache_config)
self.config = config self.config = config
self.vision_tower = None self.vision_tower = None
if getattr(self.config, "vision_config", None) is None: if getattr(self.config, "vision_config", None) is None:
@@ -356,8 +359,9 @@ class LlavaMistralForCausalLM(LlavaLlamaForCausalLM):
self, self,
config: LlavaConfig, config: LlavaConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__(config, quant_config=quant_config) super().__init__(config, quant_config=quant_config, cache_config=cache_config)
self.config = config self.config = config
self.vision_tower = None self.vision_tower = None
if getattr(self.config, "vision_config", None) is None: if getattr(self.config, "vision_config", None) is None:

View File

@@ -7,6 +7,7 @@ import torch
from torch import nn from torch import nn
from transformers import CLIPVisionModel, LlavaConfig from transformers import CLIPVisionModel, LlavaConfig
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
from vllm.config import CacheConfig
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -25,6 +26,7 @@ class LlavaVidForCausalLM(nn.Module):
self, self,
config: LlavaConfig, config: LlavaConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.config = config self.config = config

View File

@@ -9,6 +9,7 @@ import torch.nn.functional as F
from torch import nn from torch import nn
from transformers import MixtralConfig from transformers import MixtralConfig
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import CacheConfig
from vllm.distributed import ( from vllm.distributed import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
@@ -414,6 +415,7 @@ class MixtralForCausalLM(nn.Module):
self, self,
config: MixtralConfig, config: MixtralConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.config = config self.config = config

View File

@@ -8,6 +8,7 @@ import torch
import torch.nn.functional as F import torch.nn.functional as F
from torch import nn from torch import nn
from transformers import MixtralConfig from transformers import MixtralConfig
from vllm.config import CacheConfig
from vllm.distributed import ( from vllm.distributed import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
@@ -313,6 +314,7 @@ class QuantMixtralForCausalLM(nn.Module):
self, self,
config: MixtralConfig, config: MixtralConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.config = config self.config = config

View File

@@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Iterable, Tuple
import torch import torch
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import CacheConfig
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
@@ -227,6 +228,7 @@ class QWenLMHeadModel(nn.Module):
self, self,
config: PretrainedConfig, config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
): ):
super().__init__() super().__init__()
self.config = config self.config = config

View File

@@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Tuple, Iterable
import torch import torch
from torch import nn from torch import nn
from vllm.config import CacheConfig
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
@@ -251,6 +252,7 @@ class Qwen2ForCausalLM(nn.Module):
self, self,
config: Qwen2Config, config: Qwen2Config,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.config = config self.config = config

View File

@@ -7,6 +7,7 @@ from typing import Optional, Tuple, Iterable
import torch import torch
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import CacheConfig
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
@@ -225,6 +226,7 @@ class StableLmForCausalLM(nn.Module):
self, self,
config: PretrainedConfig, config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.config = config self.config = config

View File

@@ -1,12 +1,14 @@
"""Inference-only Yi-VL model.""" """Inference-only Yi-VL model."""
from typing import Tuple, Iterable from typing import Tuple, Iterable, Optional
import torch import torch
import torch.nn as nn import torch.nn as nn
from transformers import CLIPVisionModel, LlavaConfig from transformers import CLIPVisionModel, LlavaConfig
from vllm.config import CacheConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from sglang.srt.models.llava import ( from sglang.srt.models.llava import (
LlavaLlamaForCausalLM, LlavaLlamaForCausalLM,
monkey_path_clip_vision_embed_forward, monkey_path_clip_vision_embed_forward,
@@ -15,9 +17,12 @@ from sglang.srt.models.llava import (
class YiVLForCausalLM(LlavaLlamaForCausalLM): class YiVLForCausalLM(LlavaLlamaForCausalLM):
def __init__( def __init__(
self, config, quant_config = None, self,
config: LlavaConfig,
quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__(config, quant_config) super().__init__(config, quant_config, cache_config)
self.multi_modal_projector = YiVLMultiModalProjector(self.config) self.multi_modal_projector = YiVLMultiModalProjector(self.config)
self.vision_tower_subfolder = self.config.mm_vision_tower.replace( self.vision_tower_subfolder = self.config.mm_vision_tower.replace(

View File

@@ -421,9 +421,10 @@ def suppress_other_loggers():
from vllm.logger import logger as vllm_default_logger from vllm.logger import logger as vllm_default_logger
vllm_default_logger.setLevel(logging.WARN) vllm_default_logger.setLevel(logging.WARN)
logging.getLogger("vllm.utils").setLevel(logging.WARN)
logging.getLogger("vllm.selector").setLevel(logging.WARN)
logging.getLogger("vllm.config").setLevel(logging.ERROR) logging.getLogger("vllm.config").setLevel(logging.ERROR)
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(logging.WARN)
logging.getLogger("vllm.selector").setLevel(logging.WARN)
logging.getLogger("vllm.utils").setLevel(logging.WARN)
def assert_pkg_version(pkg: str, min_version: str): def assert_pkg_version(pkg: str, min_version: str):