Returning a per-request metric for the number of cached_tokens read (#1599)
This commit is contained in:
@@ -73,6 +73,7 @@ class ServerArgs:
|
||||
# Other
|
||||
api_key: Optional[str] = None
|
||||
file_storage_pth: str = "SGLang_storage"
|
||||
enable_cache_report: bool = False
|
||||
|
||||
# Data parallelism
|
||||
dp_size: int = 1
|
||||
@@ -410,6 +411,11 @@ class ServerArgs:
|
||||
default=ServerArgs.file_storage_pth,
|
||||
help="The path of the file storage in backend.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-cache-report",
|
||||
action="store_true",
|
||||
help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
|
||||
)
|
||||
|
||||
# Data parallelism
|
||||
parser.add_argument(
|
||||
|
||||
Reference in New Issue
Block a user