Returning a per-request metric for number of cached_tokens read (#1599)

This commit is contained in:
havetc
2024-10-16 20:49:22 +02:00
committed by GitHub
parent dbec2f1847
commit ecb8bad276
7 changed files with 245 additions and 3 deletions

View File

@@ -73,6 +73,7 @@ class ServerArgs:
# Other
api_key: Optional[str] = None
file_storage_pth: str = "SGLang_storage"
enable_cache_report: bool = False
# Data parallelism
dp_size: int = 1
@@ -410,6 +411,11 @@ class ServerArgs:
default=ServerArgs.file_storage_pth,
help="The path of the file storage in backend.",
)
parser.add_argument(
"--enable-cache-report",
action="store_true",
help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
)
# Data parallelism
parser.add_argument(