Make constrained decoding work for overlap scheduler (#2095)
This commit is contained in:
@@ -123,7 +123,6 @@ class ServerArgs:
|
||||
disable_disk_cache: bool = False
|
||||
disable_custom_all_reduce: bool = False
|
||||
disable_mla: bool = False
|
||||
disable_penalizer: bool = False
|
||||
enable_overlap_schedule: bool = False
|
||||
enable_mixed_chunk: bool = False
|
||||
enable_dp_attention: bool = False
|
||||
@@ -200,12 +199,7 @@ class ServerArgs:
|
||||
)
|
||||
|
||||
if self.enable_overlap_schedule:
|
||||
logger.warning(
|
||||
"Overlap scheduler mode is enabled. This is an experimental feature. "
|
||||
"Sampling penalizer (e.g., frequency and repetition penalty), constrained decoding (e.g., regex, JSON), "
|
||||
"and embedding APIs are not supported and will lead to wrong results. "
|
||||
)
|
||||
self.disable_penalizer = True
|
||||
self.disable_jump_forward = True
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: argparse.ArgumentParser):
|
||||
@@ -622,11 +616,6 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-penalizer",
|
||||
action="store_true",
|
||||
help="Disable the logit penalizers (e.g., frequency and repetition penalty) for better performance if they are not used in any requests.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-nan-detection",
|
||||
action="store_true",
|
||||
|
||||
Reference in New Issue
Block a user