Returning a per-request metric for number of cached_tokens read (#1599)

This commit is contained in:
havetc
2024-10-16 20:49:22 +02:00
committed by GitHub
parent dbec2f1847
commit ecb8bad276
7 changed files with 245 additions and 3 deletions

View File

@@ -73,6 +73,7 @@ class ServerArgs:
# Other
api_key: Optional[str] = None
file_storage_pth: str = "SGLang_storage"
enable_cache_report: bool = False
# Data parallelism
dp_size: int = 1
@@ -410,6 +411,11 @@ class ServerArgs:
default=ServerArgs.file_storage_pth,
help="The path of the file storage in backend.",
)
parser.add_argument(
"--enable-cache-report",
action="store_true",
help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
)
# Data parallelism
parser.add_argument(