[Feature] Support fp8 e5m2 kv cache with flashinfer (#1204)

Co-authored-by: Yineng Zhang <me@zhyncs.com>
2024-08-26 08:38:11 +08:00
parent 61bb223e0f
commit 2c615d120f
5 changed files with 116 additions and 16 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -33,6 +33,7 @@ class ServerArgs:
    skip_tokenizer_init: bool = False
    load_format: str = "auto"
    dtype: str = "auto"
+    kv_cache_dtype: str = "auto"
    trust_remote_code: bool = True
    context_length: Optional[int] = None
    quantization: Optional[str] = None
@@ -196,6 +197,13 @@ class ServerArgs:
            '* "float" is shorthand for FP32 precision.\n'
            '* "float32" for FP32 precision.',
        )
+        parser.add_argument(
+            "--kv-cache-dtype",
+            type=str,
+            default=ServerArgs.kv_cache_dtype,
+            choices=["auto", "fp8_e5m2"],
+            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.',
+        )
        parser.add_argument(
            "--trust-remote-code",
            action="store_true",