Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)
Co-authored-by: SangBin Cho <rkooo567@gmail.com> Co-authored-by: dhou-xai <dhou@x.ai> Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
This commit is contained in:
@@ -25,10 +25,10 @@ import filelock
|
||||
import gguf
|
||||
import huggingface_hub.constants
|
||||
import numpy as np
|
||||
import safetensors.torch
|
||||
import torch
|
||||
from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
|
||||
from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
|
||||
from safetensors.torch import load_file, safe_open, save_file
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from sglang.srt.configs.load_config import LoadConfig
|
||||
@@ -62,7 +62,6 @@ enable_hf_transfer()
|
||||
|
||||
|
||||
class DisabledTqdm(tqdm):
    """A ``tqdm`` subclass whose progress bar is always disabled."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``tqdm`` with ``disable`` forced to True.

        Any ``disable`` value supplied by the caller is overridden rather
        than passed alongside a second ``disable=True`` keyword, which would
        raise ``TypeError`` for a duplicated keyword argument.
        """
        kwargs["disable"] = True
        super().__init__(*args, **kwargs)
|
||||
|
||||
@@ -121,7 +120,7 @@ def convert_bin_to_safetensor_file(
|
||||
)
|
||||
|
||||
# check if the tensors are the same
|
||||
reloaded = load_file(sf_filename)
|
||||
reloaded = safetensors.torch.load_file(sf_filename)
|
||||
for k in loaded:
|
||||
pt_tensor = loaded[k]
|
||||
sf_tensor = reloaded[k]
|
||||
@@ -133,7 +132,6 @@ def convert_bin_to_safetensor_file(
|
||||
def get_quant_config(
|
||||
model_config: ModelConfig, load_config: LoadConfig
|
||||
) -> QuantizationConfig:
|
||||
|
||||
quant_cls = get_quantization_config(model_config.quantization)
|
||||
|
||||
# GGUF doesn't have config file
|
||||
@@ -402,15 +400,34 @@ def np_cache_weights_iterator(
|
||||
yield name, torch.from_numpy(param)
|
||||
|
||||
|
||||
def decrypt(fn, key):
    """Placeholder for weight decryption; not implemented here.

    Both arguments are accepted but unused.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
def safetensors_encrypted_weights_iterator(
    hf_weights_files: List[str],
    is_all_weights_sharded: bool = False,
    decryption_key: Optional[str] = None,
):
    """Placeholder for iterating over encrypted safetensors weights.

    The signature mirrors the arguments forwarded by
    ``safetensors_weights_iterator`` when a decryption key is supplied, but
    none of them are used: the call fails immediately.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
def safetensors_weights_iterator(
|
||||
hf_weights_files: List[str],
|
||||
is_all_weights_sharded: bool = False,
|
||||
decryption_key: Optional[str] = None,
|
||||
) -> Generator[Tuple[str, torch.Tensor], None, None]:
|
||||
"""Iterate over the weights in the model safetensor files.
|
||||
|
||||
If is_all_weights_sharded is True, it uses a more optimized read by reading an
|
||||
entire file instead of reading each tensor one by one.
|
||||
"""
|
||||
if decryption_key:
|
||||
yield from safetensors_encrypted_weights_iterator(
|
||||
hf_weights_files, is_all_weights_sharded, decryption_key
|
||||
)
|
||||
return
|
||||
|
||||
enable_tqdm = (
|
||||
not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
|
||||
)
|
||||
@@ -420,15 +437,9 @@ def safetensors_weights_iterator(
|
||||
disable=not enable_tqdm,
|
||||
bar_format=_BAR_FORMAT,
|
||||
):
|
||||
if not is_all_weights_sharded:
|
||||
with safe_open(st_file, framework="pt") as f:
|
||||
for name in f.keys(): # noqa: SIM118
|
||||
param = f.get_tensor(name)
|
||||
yield name, param
|
||||
else:
|
||||
result = load_file(st_file, device="cpu")
|
||||
for name, param in result.items():
|
||||
yield name, param
|
||||
result = safetensors.torch.load_file(st_file, device="cpu")
|
||||
for name, param in result.items():
|
||||
yield name, param
|
||||
|
||||
|
||||
def pt_weights_iterator(
|
||||
|
||||
Reference in New Issue
Block a user