[Docs] clean up structured outputs docs (#2654)

This commit is contained in:
Lianmin Zheng
2024-12-29 23:57:16 -08:00
committed by GitHub
parent e6f523b5f2
commit 8c3b420eec
10 changed files with 62 additions and 70 deletions

View File

@@ -171,15 +171,15 @@ class CompletionRequest(BaseModel):
top_k: int = -1
min_p: float = 0.0
min_tokens: int = 0
regex: Optional[str] = None
json_schema: Optional[str] = None
regex: Optional[str] = None
ebnf: Optional[str] = None
repetition_penalty: float = 1.0
stop_token_ids: Optional[List[int]] = None
no_stop_trim: bool = False
ignore_eos: bool = False
skip_special_tokens: bool = True
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
ebnf: Optional[str] = None
class CompletionResponseChoice(BaseModel):
@@ -315,13 +315,13 @@ class ChatCompletionRequest(BaseModel):
min_p: float = 0.0
min_tokens: int = 0
regex: Optional[str] = None
ebnf: Optional[str] = None
repetition_penalty: float = 1.0
stop_token_ids: Optional[List[int]] = None
no_stop_trim: bool = False
ignore_eos: bool = False
skip_special_tokens: bool = True
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
ebnf: Optional[str] = None
class FunctionResponse(BaseModel):

View File

@@ -19,6 +19,14 @@ _SAMPLING_EPS = 1e-6
class SamplingParams:
"""
The sampling parameters.
See docs/references/sampling_params.md or
https://sgl-project.github.io/references/sampling_params.html
for the documentation.
"""
def __init__(
self,
max_new_tokens: int = 128,
@@ -33,9 +41,9 @@ class SamplingParams:
repetition_penalty: float = 1.0,
min_new_tokens: int = 0,
spaces_between_special_tokens: bool = True,
regex: Optional[str] = None,
n: int = 1,
json_schema: Optional[str] = None,
regex: Optional[str] = None,
ebnf: Optional[str] = None,
no_stop_trim: bool = False,
ignore_eos: bool = False,

View File

@@ -578,6 +578,8 @@ def _set_envs_and_config(server_args: ServerArgs):
os.environ["NCCL_NVLS_ENABLE"] = "0"
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
if "GLOO_SOCKET_IFNAME" not in os.environ:
os.environ["GLOO_SOCKET_IFNAME"] = "eth0"
# Set prometheus env vars
if server_args.enable_metrics:

View File

@@ -42,7 +42,6 @@ class ServerArgs:
model_path: str
tokenizer_path: Optional[str] = None
tokenizer_mode: str = "auto"
skip_tokenizer_init: bool = False
load_format: str = "auto"
trust_remote_code: bool = True
dtype: str = "auto"
@@ -54,6 +53,7 @@ class ServerArgs:
chat_template: Optional[str] = None
is_embedding: bool = False
revision: Optional[str] = None
skip_tokenizer_init: bool = False
return_token_ids: bool = False
# Port for the HTTP server
@@ -276,17 +276,6 @@ class ServerArgs:
"tokenizer if available, and 'slow' will "
"always use the slow tokenizer.",
)
parser.add_argument(
"--skip-tokenizer-init",
action="store_true",
help="If set, skip init tokenizer and pass input_ids in generate request",
)
parser.add_argument(
"--return-token-ids",
action="store_true",
default=ServerArgs.return_token_ids,
help="Whether to return token IDs in the output, this may introduce additional overhead.",
)
parser.add_argument(
"--load-format",
type=str,
@@ -394,6 +383,17 @@ class ServerArgs:
"name, a tag name, or a commit id. If unspecified, will use "
"the default version.",
)
parser.add_argument(
"--skip-tokenizer-init",
action="store_true",
help="If set, skip init tokenizer and pass input_ids in generate request",
)
parser.add_argument(
"--return-token-ids",
action="store_true",
default=ServerArgs.return_token_ids,
help="Whether to return token IDs in the output, this may introduce additional overhead.",
)
# Memory and scheduling
parser.add_argument(