init
vllm/model_executor/model_loader.py (new file, 137 lines)
@@ -0,0 +1,137 @@
"""Utilities for selecting and loading models."""
import contextlib
from typing import Type

import torch
import torch.nn as nn

from vllm.config import DeviceConfig, ModelConfig
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.weight_utils import (get_quant_config,
                                              initialize_dummy_weights)


@contextlib.contextmanager
def _set_default_torch_dtype(dtype: torch.dtype):
    """Sets the default torch dtype to the given dtype."""
    old_dtype = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    yield
    torch.set_default_dtype(old_dtype)


def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]:
    architectures = getattr(model_config.hf_config, "architectures", [])
    # Special handling for quantized Mixtral.
    # FIXME(woosuk): This is a temporary hack.
    if (model_config.quantization is not None
            and "MixtralForCausalLM" in architectures):
        architectures = ["QuantMixtralForCausalLM"]

    for arch in architectures:
        model_cls = ModelRegistry.load_model_cls(arch)
        if model_cls is not None:
            return model_cls
    raise ValueError(
        f"Model architectures {architectures} are not supported for now. "
        f"Supported architectures: {ModelRegistry.get_supported_archs()}")


def get_model(model_config: ModelConfig, device_config: DeviceConfig,
              **kwargs) -> nn.Module:
    lora_config = kwargs.get("lora_config", None)
    model_class = _get_model_architecture(model_config)

    # Get the (maybe quantized) linear method.
    linear_method = None
    if model_config.quantization is not None:
        quant_config = get_quant_config(model_config)
        # NOTE: hard-coded instead of torch.cuda.get_device_capability()
        # to avoid a capability error on this platform.
        capability = (9, 0)
        capability = capability[0] * 10 + capability[1]
        if capability < quant_config.get_min_capability():
            raise ValueError(
                f"The quantization method {model_config.quantization} is not "
                "supported for the current GPU. "
                f"Minimum capability: {quant_config.get_min_capability()}. "
                f"Current capability: {capability}.")
        supported_dtypes = quant_config.get_supported_act_dtypes()
        if model_config.dtype not in supported_dtypes:
            raise ValueError(
                f"{model_config.dtype} is not supported for quantization "
                f"method {model_config.quantization}. Supported dtypes: "
                f"{supported_dtypes}")
        linear_method = quant_config.get_linear_method()

    with _set_default_torch_dtype(model_config.dtype):
        # Create a model instance.
        # The weights will be initialized as empty tensors.
        try:
            # torch.device() works as a context manager only on torch >= 2.0.
            with torch.device(device_config.device):
                if hasattr(model_class, "supported_lora_modules"):
                    model = model_class(model_config.hf_config, linear_method,
                                        lora_config)
                elif lora_config:
                    raise ValueError(
                        f"Model {model_class.__name__} does not support LoRA, "
                        "but LoRA is enabled. Support for this model may "
                        "be added in the future. If this is important to you, "
                        "please open an issue on github.")
                else:
                    model = model_class(model_config.hf_config, linear_method)
            if model_config.load_format == "dummy":
                # NOTE(woosuk): For accurate performance evaluation, we assign
                # random values to the weights.
                initialize_dummy_weights(model)
            else:
                # Load the weights from the cached or downloaded files.
                model.load_weights(model_config.model, model_config.download_dir,
                                   model_config.load_format, model_config.revision)
        # Fallback for torch < 2.0, where torch.device() cannot be used as a
        # context manager and the with-statement above fails.
        except (AttributeError, TypeError):
            if hasattr(model_class, "supported_lora_modules"):
                model = model_class(model_config.hf_config, linear_method,
                                    lora_config)
            elif lora_config:
                raise ValueError(
                    f"Model {model_class.__name__} does not support LoRA, "
                    "but LoRA is enabled. Support for this model may "
                    "be added in the future. If this is important to you, "
                    "please open an issue on github.")
            else:
                model = model_class(model_config.hf_config, linear_method)
            model = model.cuda()
            if model_config.load_format == "dummy":
                # NOTE(woosuk): For accurate performance evaluation, we assign
                # random values to the weights.
                initialize_dummy_weights(model)
            else:
                # Load the weights from the cached or downloaded files.
                model.load_weights(model_config.model, model_config.download_dir,
                                   model_config.load_format, model_config.revision)
    return model.eval()


# TODO: align with the original implementation kept below for reference.
"""
        with torch.device(device_config.device):
            if hasattr(model_class, "supported_lora_modules"):
                model = model_class(model_config.hf_config, linear_method,
                                    lora_config)
            elif lora_config:
                raise ValueError(
                    f"Model {model_class.__name__} does not support LoRA, "
                    "but LoRA is enabled. Support for this model may "
                    "be added in the future. If this is important to you, "
                    "please open an issue on github.")
            else:
                model = model_class(model_config.hf_config, linear_method)
        if model_config.load_format == "dummy":
            # NOTE(woosuk): For accurate performance evaluation, we assign
            # random values to the weights.
            initialize_dummy_weights(model)
        else:
            # Load the weights from the cached or downloaded files.
            model.load_weights(model_config.model, model_config.download_dir,
                               model_config.load_format, model_config.revision)
    return model.eval()
"""
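
The torch < 2.0 fallback in get_model() works by catching the failure of the with torch.device(...) statement. An alternative, shown only as a sketch and not part of this commit, is to probe once whether torch.device supports the context-manager protocol and branch on that instead of a try/except:

    import contextlib
    import torch

    # torch.device gained __enter__/__exit__ (context-manager support) in torch 2.0.
    _DEVICE_IS_CTX_MANAGER = hasattr(torch.device("cpu"), "__enter__")

    def _maybe_device_ctx(device):
        """Use torch.device as a context manager when available, else a no-op."""
        if _DEVICE_IS_CTX_MANAGER:
            return torch.device(device)
        # On older torch, create the model normally and move it with .cuda() afterwards.
        return contextlib.nullcontext()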