[Docs] clean up structured outputs docs (#2654)
This commit is contained in:
@@ -171,15 +171,15 @@ class CompletionRequest(BaseModel):
|
||||
top_k: int = -1
|
||||
min_p: float = 0.0
|
||||
min_tokens: int = 0
|
||||
regex: Optional[str] = None
|
||||
json_schema: Optional[str] = None
|
||||
regex: Optional[str] = None
|
||||
ebnf: Optional[str] = None
|
||||
repetition_penalty: float = 1.0
|
||||
stop_token_ids: Optional[List[int]] = None
|
||||
no_stop_trim: bool = False
|
||||
ignore_eos: bool = False
|
||||
skip_special_tokens: bool = True
|
||||
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
|
||||
ebnf: Optional[str] = None
|
||||
|
||||
|
||||
class CompletionResponseChoice(BaseModel):
|
||||
@@ -315,13 +315,13 @@ class ChatCompletionRequest(BaseModel):
|
||||
min_p: float = 0.0
|
||||
min_tokens: int = 0
|
||||
regex: Optional[str] = None
|
||||
ebnf: Optional[str] = None
|
||||
repetition_penalty: float = 1.0
|
||||
stop_token_ids: Optional[List[int]] = None
|
||||
no_stop_trim: bool = False
|
||||
ignore_eos: bool = False
|
||||
skip_special_tokens: bool = True
|
||||
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
|
||||
ebnf: Optional[str] = None
|
||||
|
||||
|
||||
class FunctionResponse(BaseModel):
|
||||
|
||||
@@ -19,6 +19,14 @@ _SAMPLING_EPS = 1e-6
|
||||
|
||||
|
||||
class SamplingParams:
|
||||
"""
|
||||
The sampling parameters.
|
||||
|
||||
See docs/references/sampling_params.md or
|
||||
https://sgl-project.github.io/references/sampling_params.html
|
||||
for the documentation.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_new_tokens: int = 128,
|
||||
@@ -33,9 +41,9 @@ class SamplingParams:
|
||||
repetition_penalty: float = 1.0,
|
||||
min_new_tokens: int = 0,
|
||||
spaces_between_special_tokens: bool = True,
|
||||
regex: Optional[str] = None,
|
||||
n: int = 1,
|
||||
json_schema: Optional[str] = None,
|
||||
regex: Optional[str] = None,
|
||||
ebnf: Optional[str] = None,
|
||||
no_stop_trim: bool = False,
|
||||
ignore_eos: bool = False,
|
||||
|
||||
@@ -578,6 +578,8 @@ def _set_envs_and_config(server_args: ServerArgs):
|
||||
os.environ["NCCL_NVLS_ENABLE"] = "0"
|
||||
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
|
||||
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
|
||||
if "GLOO_SOCKET_IFNAME" not in os.environ:
|
||||
os.environ["GLOO_SOCKET_IFNAME"] = "eth0"
|
||||
|
||||
# Set prometheus env vars
|
||||
if server_args.enable_metrics:
|
||||
|
||||
@@ -42,7 +42,6 @@ class ServerArgs:
|
||||
model_path: str
|
||||
tokenizer_path: Optional[str] = None
|
||||
tokenizer_mode: str = "auto"
|
||||
skip_tokenizer_init: bool = False
|
||||
load_format: str = "auto"
|
||||
trust_remote_code: bool = True
|
||||
dtype: str = "auto"
|
||||
@@ -54,6 +53,7 @@ class ServerArgs:
|
||||
chat_template: Optional[str] = None
|
||||
is_embedding: bool = False
|
||||
revision: Optional[str] = None
|
||||
skip_tokenizer_init: bool = False
|
||||
return_token_ids: bool = False
|
||||
|
||||
# Port for the HTTP server
|
||||
@@ -276,17 +276,6 @@ class ServerArgs:
|
||||
"tokenizer if available, and 'slow' will "
|
||||
"always use the slow tokenizer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-tokenizer-init",
|
||||
action="store_true",
|
||||
help="If set, skip init tokenizer and pass input_ids in generate request",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--return-token-ids",
|
||||
action="store_true",
|
||||
default=ServerArgs.return_token_ids,
|
||||
help="Whether to return token IDs in the output, this may introduce additional overhead.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--load-format",
|
||||
type=str,
|
||||
@@ -394,6 +383,17 @@ class ServerArgs:
|
||||
"name, a tag name, or a commit id. If unspecified, will use "
|
||||
"the default version.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-tokenizer-init",
|
||||
action="store_true",
|
||||
help="If set, skip init tokenizer and pass input_ids in generate request",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--return-token-ids",
|
||||
action="store_true",
|
||||
default=ServerArgs.return_token_ids,
|
||||
help="Whether to return token IDs in the output, this may introduce additional overhead.",
|
||||
)
|
||||
|
||||
# Memory and scheduling
|
||||
parser.add_argument(
|
||||
|
||||
Reference in New Issue
Block a user