Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com> Co-authored-by: dhou-xai <dhou@x.ai> Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
2025-03-03 00:12:04 -08:00
parent 0194948fd9
commit ac2387279e
86 changed files with 4116 additions and 2015 deletions
--- a/python/sglang/srt/model_loader/weight_utils.py
+++ b/python/sglang/srt/model_loader/weight_utils.py
@@ -25,10 +25,10 @@ import filelock
 import gguf
 import huggingface_hub.constants
 import numpy as np
+import safetensors.torch
 import torch
 from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
 from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
-from safetensors.torch import load_file, safe_open, save_file
 from tqdm.auto import tqdm

 from sglang.srt.configs.load_config import LoadConfig
@@ -62,7 +62,6 @@ enable_hf_transfer()


 class DisabledTqdm(tqdm):
-
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, disable=True)

@@ -121,7 +120,7 @@ def convert_bin_to_safetensor_file(
        )

    # check if the tensors are the same
-    reloaded = load_file(sf_filename)
+    reloaded = safetensors.torch.load_file(sf_filename)
    for k in loaded:
        pt_tensor = loaded[k]
        sf_tensor = reloaded[k]
@@ -133,7 +132,6 @@ def convert_bin_to_safetensor_file(
 def get_quant_config(
    model_config: ModelConfig, load_config: LoadConfig
 ) -> QuantizationConfig:
-
    quant_cls = get_quantization_config(model_config.quantization)

    # GGUF doesn't have config file
@@ -402,15 +400,34 @@ def np_cache_weights_iterator(
        yield name, torch.from_numpy(param)


+def decrypt(fn, key):
+    raise NotImplementedError()  # Placeholder: no decryption logic exists here yet.
+
+
+def safetensors_encrypted_weights_iterator(
+    hf_weights_files: List[str],
+    is_all_weights_sharded: bool = False,
+    decryption_key: Optional[str] = None,
+):
+    raise NotImplementedError()  # Placeholder: encrypted loading is unimplemented.
+
+
 def safetensors_weights_iterator(
    hf_weights_files: List[str],
    is_all_weights_sharded: bool = False,
+    decryption_key: Optional[str] = None,
 ) -> Generator[Tuple[str, torch.Tensor], None, None]:
    """Iterate over the weights in the model safetensor files.

    If is_all_weights_sharded is True, it uses more optimize read by reading an
    entire file instead of reading each tensor one by one.
    """
+    if decryption_key:
+        yield from safetensors_encrypted_weights_iterator(
+            hf_weights_files, is_all_weights_sharded, decryption_key
+        )
+        return
+
    enable_tqdm = (
        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
    )
@@ -420,15 +437,9 @@ def safetensors_weights_iterator(
        disable=not enable_tqdm,
        bar_format=_BAR_FORMAT,
    ):
-        if not is_all_weights_sharded:
-            with safe_open(st_file, framework="pt") as f:
-                for name in f.keys():  # noqa: SIM118
-                    param = f.get_tensor(name)
-                    yield name, param
-        else:
-            result = load_file(st_file, device="cpu")
-            for name, param in result.items():
-                yield name, param
+        result = safetensors.torch.load_file(st_file, device="cpu")
+        for name, param in result.items():
+            yield name, param


 def pt_weights_iterator(