Update vllm to v0.4.3 (#511)
Co-authored-by: Qubitium <417764+Qubitium@users.noreply.github.com>
Co-authored-by: ZX <zx@lbx.dev>
This commit is contained in:
@@ -20,7 +20,7 @@ dependencies = [
|
|||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
|
srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
|
||||||
"zmq", "vllm==0.4.2", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
|
"zmq", "vllm==0.4.3", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
|
||||||
openai = ["openai>=1.0", "numpy", "tiktoken"]
|
openai = ["openai>=1.0", "numpy", "tiktoken"]
|
||||||
anthropic = ["anthropic>=0.20.0", "numpy"]
|
anthropic = ["anthropic>=0.20.0", "numpy"]
|
||||||
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
|
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from vllm.config import DeviceConfig, LoadConfig
|
from vllm.config import DeviceConfig, LoadConfig
|
||||||
from vllm.config import ModelConfig as VllmModelConfig
|
from vllm.config import ModelConfig as VllmModelConfig
|
||||||
from vllm.distributed import initialize_model_parallel
|
from vllm.distributed import initialize_model_parallel, init_distributed_environment
|
||||||
from vllm.model_executor.model_loader import get_model
|
from vllm.model_executor.model_loader import get_model
|
||||||
from vllm.model_executor.models import ModelRegistry
|
from vllm.model_executor.models import ModelRegistry
|
||||||
|
|
||||||
@@ -240,11 +240,11 @@ class ModelRunner:
|
|||||||
logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.")
|
logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.")
|
||||||
torch.cuda.set_device(self.gpu_id)
|
torch.cuda.set_device(self.gpu_id)
|
||||||
logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
|
logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
|
||||||
torch.distributed.init_process_group(
|
init_distributed_environment(
|
||||||
backend="nccl",
|
backend="nccl",
|
||||||
world_size=self.tp_size,
|
world_size=self.tp_size,
|
||||||
rank=self.tp_rank,
|
rank=self.tp_rank,
|
||||||
init_method=f"tcp://127.0.0.1:{self.nccl_port}",
|
distributed_init_method=f"tcp://127.0.0.1:{self.nccl_port}",
|
||||||
)
|
)
|
||||||
initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
|
initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
|
||||||
total_gpu_memory = get_available_gpu_memory(
|
total_gpu_memory = get_available_gpu_memory(
|
||||||
@@ -291,6 +291,7 @@ class ModelRunner:
|
|||||||
vision_language_config=None,
|
vision_language_config=None,
|
||||||
parallel_config=None,
|
parallel_config=None,
|
||||||
scheduler_config=None,
|
scheduler_config=None,
|
||||||
|
cache_config=None,
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[gpu_id={self.gpu_id}] Load weight end. "
|
f"[gpu_id={self.gpu_id}] Load weight end. "
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ import torch.utils.checkpoint
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.distributed import (
|
from vllm.distributed import (
|
||||||
get_tensor_model_parallel_rank,
|
get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
@@ -304,6 +305,7 @@ class CohereForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: PretrainedConfig,
|
config: PretrainedConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Tuple
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.distributed import (
|
from vllm.distributed import (
|
||||||
get_tensor_model_parallel_rank,
|
get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
@@ -352,6 +353,7 @@ class DbrxForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: DbrxConfig,
|
config: DbrxConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ from typing import Iterable, Optional, Tuple
|
|||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config import LoRAConfig, CacheConfig
|
||||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||||
from vllm.model_executor.layers.activation import GeluAndMul
|
from vllm.model_executor.layers.activation import GeluAndMul
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
@@ -264,6 +264,7 @@ class GemmaForCausalLM(nn.Module):
|
|||||||
config: PretrainedConfig,
|
config: PretrainedConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
lora_config: Optional[LoRAConfig] = None,
|
lora_config: Optional[LoRAConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
del lora_config # Unused.
|
del lora_config # Unused.
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from torch import nn
|
|||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.distributed import (
|
from vllm.distributed import (
|
||||||
get_tensor_model_parallel_rank,
|
get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
@@ -538,6 +539,7 @@ class Grok1ModelForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: PretrainedConfig,
|
config: PretrainedConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import torch
|
|||||||
import tqdm
|
import tqdm
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import LlamaConfig
|
from transformers import LlamaConfig
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.distributed import (
|
from vllm.distributed import (
|
||||||
get_tensor_model_parallel_rank,
|
get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size
|
get_tensor_model_parallel_world_size
|
||||||
@@ -258,6 +259,7 @@ class LlamaForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: LlamaConfig,
|
config: LlamaConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import torch
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import CLIPVisionModel, CLIPVisionConfig, LlavaConfig, Qwen2Config, MistralConfig
|
from transformers import CLIPVisionModel, CLIPVisionConfig, LlavaConfig, Qwen2Config, MistralConfig
|
||||||
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
|
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
|
|
||||||
@@ -27,6 +28,7 @@ class LlavaLlamaForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: LlavaConfig,
|
config: LlavaConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
@@ -294,8 +296,9 @@ class LlavaQwenForCausalLM(LlavaLlamaForCausalLM):
|
|||||||
self,
|
self,
|
||||||
config: LlavaConfig,
|
config: LlavaConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(config, quant_config=quant_config)
|
super().__init__(config, quant_config=quant_config, cache_config=cache_config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.vision_tower = None
|
self.vision_tower = None
|
||||||
if getattr(self.config, "vision_config", None) is None:
|
if getattr(self.config, "vision_config", None) is None:
|
||||||
@@ -356,8 +359,9 @@ class LlavaMistralForCausalLM(LlavaLlamaForCausalLM):
|
|||||||
self,
|
self,
|
||||||
config: LlavaConfig,
|
config: LlavaConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(config, quant_config=quant_config)
|
super().__init__(config, quant_config=quant_config, cache_config=cache_config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.vision_tower = None
|
self.vision_tower = None
|
||||||
if getattr(self.config, "vision_config", None) is None:
|
if getattr(self.config, "vision_config", None) is None:
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import torch
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import CLIPVisionModel, LlavaConfig
|
from transformers import CLIPVisionModel, LlavaConfig
|
||||||
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
|
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
|
|
||||||
@@ -25,6 +26,7 @@ class LlavaVidForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: LlavaConfig,
|
config: LlavaConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import torch.nn.functional as F
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import MixtralConfig
|
from transformers import MixtralConfig
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.distributed import (
|
from vllm.distributed import (
|
||||||
get_tensor_model_parallel_rank,
|
get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
@@ -414,6 +415,7 @@ class MixtralForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: MixtralConfig,
|
config: MixtralConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import torch
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import MixtralConfig
|
from transformers import MixtralConfig
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.distributed import (
|
from vllm.distributed import (
|
||||||
get_tensor_model_parallel_rank,
|
get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
@@ -313,6 +314,7 @@ class QuantMixtralForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: MixtralConfig,
|
config: MixtralConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Iterable, Tuple
|
|||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||||
from vllm.model_executor.layers.activation import SiluAndMul
|
from vllm.model_executor.layers.activation import SiluAndMul
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
@@ -227,6 +228,7 @@ class QWenLMHeadModel(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: PretrainedConfig,
|
config: PretrainedConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Tuple, Iterable
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||||
from vllm.model_executor.layers.activation import SiluAndMul
|
from vllm.model_executor.layers.activation import SiluAndMul
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
@@ -251,6 +252,7 @@ class Qwen2ForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: Qwen2Config,
|
config: Qwen2Config,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from typing import Optional, Tuple, Iterable
|
|||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||||
from vllm.model_executor.layers.activation import SiluAndMul
|
from vllm.model_executor.layers.activation import SiluAndMul
|
||||||
from vllm.model_executor.layers.linear import (
|
from vllm.model_executor.layers.linear import (
|
||||||
@@ -225,6 +226,7 @@ class StableLmForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: PretrainedConfig,
|
config: PretrainedConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -1,12 +1,14 @@
|
|||||||
"""Inference-only Yi-VL model."""
|
"""Inference-only Yi-VL model."""
|
||||||
|
|
||||||
from typing import Tuple, Iterable
|
from typing import Tuple, Iterable, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from transformers import CLIPVisionModel, LlavaConfig
|
from transformers import CLIPVisionModel, LlavaConfig
|
||||||
|
from vllm.config import CacheConfig
|
||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
|
|
||||||
|
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
||||||
from sglang.srt.models.llava import (
|
from sglang.srt.models.llava import (
|
||||||
LlavaLlamaForCausalLM,
|
LlavaLlamaForCausalLM,
|
||||||
monkey_path_clip_vision_embed_forward,
|
monkey_path_clip_vision_embed_forward,
|
||||||
@@ -15,9 +17,12 @@ from sglang.srt.models.llava import (
|
|||||||
|
|
||||||
class YiVLForCausalLM(LlavaLlamaForCausalLM):
|
class YiVLForCausalLM(LlavaLlamaForCausalLM):
|
||||||
def __init__(
|
def __init__(
|
||||||
self, config, quant_config = None,
|
self,
|
||||||
|
config: LlavaConfig,
|
||||||
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config: Optional[CacheConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(config, quant_config)
|
super().__init__(config, quant_config, cache_config)
|
||||||
|
|
||||||
self.multi_modal_projector = YiVLMultiModalProjector(self.config)
|
self.multi_modal_projector = YiVLMultiModalProjector(self.config)
|
||||||
self.vision_tower_subfolder = self.config.mm_vision_tower.replace(
|
self.vision_tower_subfolder = self.config.mm_vision_tower.replace(
|
||||||
|
|||||||
@@ -421,9 +421,10 @@ def suppress_other_loggers():
|
|||||||
from vllm.logger import logger as vllm_default_logger
|
from vllm.logger import logger as vllm_default_logger
|
||||||
|
|
||||||
vllm_default_logger.setLevel(logging.WARN)
|
vllm_default_logger.setLevel(logging.WARN)
|
||||||
logging.getLogger("vllm.utils").setLevel(logging.WARN)
|
|
||||||
logging.getLogger("vllm.selector").setLevel(logging.WARN)
|
|
||||||
logging.getLogger("vllm.config").setLevel(logging.ERROR)
|
logging.getLogger("vllm.config").setLevel(logging.ERROR)
|
||||||
|
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(logging.WARN)
|
||||||
|
logging.getLogger("vllm.selector").setLevel(logging.WARN)
|
||||||
|
logging.getLogger("vllm.utils").setLevel(logging.WARN)
|
||||||
|
|
||||||
|
|
||||||
def assert_pkg_version(pkg: str, min_version: str):
|
def assert_pkg_version(pkg: str, min_version: str):
|
||||||
|
|||||||
Reference in New Issue
Block a user