Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

@@ -1,6 +1,7 @@
-# coding=utf-8
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
 # Copyright (c) 2024 - 2024 Moore Threads Technology Co., Ltd("Moore Threads"). All rights reserved.
 # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 # Copyright 2023 Cerebras Systems.
 #
@@ -73,10 +74,9 @@ class JAISConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values
             attentions (not used by all models).
-        scale_attn_by_inverse_layer_idx (`bool`, *optional*,
-            defaults to `False`):
-            Whether to additionally scale attention weights by
-            `1 / layer_idx + 1`.
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
+            Whether to additionally scale attention weights
+            by `1 / layer_idx + 1`.
         reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
             Whether to scale keys (K) prior to computing attention
             (dot-product)
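
For context on the `scale_attn_by_inverse_layer_idx` flag documented in this hunk: the docstring's `1 / layer_idx + 1` denotes dividing each layer's attention logits by `layer_idx + 1`, as in the GPT-2 implementation this config is derived from. A minimal sketch of the idea (an illustrative helper, not code from the file shown):

import torch

def attn_logits(query: torch.Tensor, key: torch.Tensor,
                layer_idx: int, scale_by_inverse_layer_idx: bool) -> torch.Tensor:
    # Standard scaled dot-product logits.
    logits = torch.matmul(query, key.transpose(-1, -2)) / (query.size(-1) ** 0.5)
    if scale_by_inverse_layer_idx:
        # Extra per-layer damping: divide by (layer_idx + 1), so layer 0 is
        # unchanged, layer 1 is halved, and so on.
        logits = logits / float(layer_idx + 1)
    return logits
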
@@ -98,7 +98,7 @@ class JAISConfig(PretrainedConfig):
             Scale attention weights by dividing by hidden_size instead of
             sqrt(hidden_size). Need to set scale_attn_weights to `True` as
             well.
-        alibi_scaling (`Dict`, *optional*):
+        alibi_scaling (`dict`, *optional*):
             Dictionary containing the scaling configuration for ALiBi
             embeddings. Currently only supports linear
             scaling strategy. Can specify either the scaling `factor` (must be
@@ -108,7 +108,7 @@ class JAISConfig(PretrainedConfig):
             formats are `{"type": strategy name, "factor": scaling factor}` or
             `{"type": strategy name,
             "train_seq_len": training sequence length}`.
-        architectures (`List`, *optional*, defaults to ['JAISLMHeadModel']):
+        architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
             architecture names for Jais.
 
     Example:
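
To make the two accepted `alibi_scaling` formats in this docstring concrete, these hypothetical values would satisfy it (the `factor` must be a float strictly greater than 1.0, and `train_seq_len` an integer strictly greater than 1, per the validation in the next hunk):

# Two hypothetical configurations matching the documented formats:
alibi_scaling_by_factor = {"type": "linear", "factor": 2.0}
alibi_scaling_by_train_len = {"type": "linear", "train_seq_len": 2048}
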
@@ -209,29 +209,35 @@ class JAISConfig(PretrainedConfig):
         if self.alibi_scaling is None:
             return
 
-        if (not isinstance(self.alibi_scaling, dict)
-                or len(self.alibi_scaling) != 2):
+        if not isinstance(self.alibi_scaling, dict) or len(self.alibi_scaling) != 2:
             raise ValueError(
-                "`alibi_scaling` must be a dictionary with two fields,"
+                "`alibi_scaling` must be a dictionary with two fields, "
                 "`type` and `factor` or `type` and `train_seq_len`, "
-                f"got {self.alibi_scaling}")
+                f"got {self.alibi_scaling}"
+            )
         alibi_scaling_type = self.alibi_scaling.get("type", None)
         alibi_scaling_factor = self.alibi_scaling.get("factor", None)
         alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
         if alibi_scaling_type is None or alibi_scaling_type != "linear":
-            raise ValueError(f"`alibi_scaling`'s type field must be 'linear',"
-                             f"got {alibi_scaling_type}")
+            raise ValueError(
+                f"`alibi_scaling`'s type field must be 'linear', "
+                f"got {alibi_scaling_type}"
+            )
-        if (alibi_scaling_factor is not None
-                and not isinstance(alibi_scaling_factor, float)
-                or (alibi_scaling_factor is not None
-                    and alibi_scaling_factor <= 1.0)):
+        if (
+            alibi_scaling_factor is not None
+            and not isinstance(alibi_scaling_factor, float)
+            or (alibi_scaling_factor is not None and alibi_scaling_factor <= 1.0)
+        ):
             raise ValueError(
-                f"`alibi_scaling`'s factor field must be a float > 1.0,"
-                f"got {alibi_scaling_factor}")
+                f"`alibi_scaling`'s factor field must be a float > 1.0, "
+                f"got {alibi_scaling_factor}"
+            )
-        if (alibi_dynamic_scaling is not None
-                and not isinstance(alibi_dynamic_scaling, int)
-                or (alibi_dynamic_scaling is not None
-                    and alibi_dynamic_scaling <= 1)):
+        if (
+            alibi_dynamic_scaling is not None
+            and not isinstance(alibi_dynamic_scaling, int)
+            or (alibi_dynamic_scaling is not None and alibi_dynamic_scaling <= 1)
+        ):
             raise ValueError(
-                f"`alibi_scaling`'s `train_seq_len` field must be an"
-                f"integer > 1, got {alibi_dynamic_scaling}")
+                f"`alibi_scaling`'s `train_seq_len` field must be an "
+                f"integer > 1, got {alibi_dynamic_scaling}"
+            )
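
Note that the reformatted conditions keep the original unparenthesized mix of `and`/`or`; since `and` binds tighter than `or` in Python, `A and B or (A and C)` parses as `(A and B) or (A and C)` in both spellings, so the change is purely cosmetic. Assuming the usual vLLM layout where this class is exported from `vllm.transformers_utils.configs`, a quick check of the validator might look like:

from vllm.transformers_utils.configs import JAISConfig  # import path assumed, not shown in this diff

# Both documented formats pass validation.
JAISConfig(alibi_scaling={"type": "linear", "factor": 2.0})
JAISConfig(alibi_scaling={"type": "linear", "train_seq_len": 2048})

# A factor <= 1.0 (or a non-float factor such as the int 2) raises.
try:
    JAISConfig(alibi_scaling={"type": "linear", "factor": 0.5})
except ValueError as err:
    print(err)  # `alibi_scaling`'s factor field must be a float > 1.0, got 0.5
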