Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/vllm/model_executor/layers/quantization/schema.py
+++ b/vllm/model_executor/layers/quantization/schema.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file contains the Pydantic schemas for various quantization-related
 parameters. When a relevant quantization technique is specified, these
@@ -11,8 +13,6 @@ possible on ROCm), the model can be optionally augmented with KV cache
 scaling factors.
 """

-from typing import Dict, Optional
-
 from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator


@@ -22,13 +22,14 @@ class KVCacheQuantSchema(BaseModel):
    # layer indices to their per-tensor KV cache scaling factor.
    # TODO: Consider pulling this and its validation methods out into its
    # own schema class (tricky as its members are variable)
-    scaling_factor: Dict[int, Dict[int, float]]
+    scaling_factor: dict[int, dict[int, float]]

    @model_validator(mode="after")
    def check_is_fp8(self) -> "KVCacheQuantSchema":
        assert self.dtype == "float8_e4m3fn", (
            "Loaded scaling factors intended for KV cache dtype = "
-            f"{self.dtype} rather than float8_e4m3fn!")
+            f"{self.dtype} rather than float8_e4m3fn!"
+        )
        return self

    @model_validator(mode="after")
@@ -39,15 +40,18 @@ class KVCacheQuantSchema(BaseModel):
            num_hidden_layers = context["num_hidden_layers"]
            assert len(self.scaling_factor) == tp_size, (
                f"Loaded dictionary has TP size {len(self.scaling_factor)} "
-                f"but LLM engine is currently running with TP size {tp_size}.")
+                f"but LLM engine is currently running with TP size {tp_size}."
+            )
            for tp_rank, layer_maps in self.scaling_factor.items():
                assert len(layer_maps) == num_hidden_layers, (
                    f"KV cache scales map for TP rank {tp_rank} is malformed. "
                    f"Expected {num_hidden_layers} layers, got "
-                    f"{len(layer_maps)}.")
+                    f"{len(layer_maps)}."
+                )
            for i in range(tp_size):
                assert i in self.scaling_factor, (
-                    f"KV cache scales map for TP rank {i} not found.")
+                    f"KV cache scales map for TP rank {i} not found."
+                )
        return self

    @model_validator(mode="after")
@@ -60,7 +64,8 @@ class KVCacheQuantSchema(BaseModel):
            for i in range(num_hidden_layers):
                assert i in layer_scales_map, (
                    f"Could not find KV cache scales for layer {i} in "
-                    f"TP rank {tp_rank}.")
+                    f"TP rank {tp_rank}."
+                )
        return self


@@ -68,7 +73,7 @@ class QuantParamSchema(BaseModel):
    # TODO: Generalize and extend with more fields
    # (e.g. weights/activations params) once functionality is enabled
    model_config = ConfigDict(protected_namespaces=())
-    model_type: Optional[str]
+    model_type: str | None
    kv_cache: KVCacheQuantSchema

    @model_validator(mode="after")
@@ -80,5 +85,6 @@ class QuantParamSchema(BaseModel):
                assert model_type == self.model_type, (
                    f"Model type is {model_type} but loaded "
                    f"scaling factors belonging to different "
-                    f"model type {self.model_type}!")
+                    f"model type {self.model_type}!"
+                )
        return self