Support FP8 E4M3 KV Cache (#2786)

Co-authored-by: root <bjmsong@126.com>
2025-01-13 13:17:11 +08:00
parent 85b2e05770
commit 0bb0f76311
9 changed files with 205 additions and 10 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -32,6 +32,7 @@ from sglang.srt.utils import (
    is_hip,
    is_ipv6,
    is_port_available,
+    nullable_str,
 )

 logger = logging.getLogger(__name__)
@@ -47,6 +48,7 @@ class ServerArgs:
    trust_remote_code: bool = True
    dtype: str = "auto"
    kv_cache_dtype: str = "auto"
+    quantization_param_path: nullable_str = None
    quantization: Optional[str] = None
    context_length: Optional[int] = None
    device: str = "cuda"
@@ -350,8 +352,17 @@ class ServerArgs:
            "--kv-cache-dtype",
            type=str,
            default=ServerArgs.kv_cache_dtype,
-            choices=["auto", "fp8_e5m2"],
-            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.',
+            choices=["auto", "fp8_e5m2", "fp8_e4m3"],
+            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
+        )
+        parser.add_argument(
+            "--quantization-param-path",
+            type=nullable_str,
+            default=None,
+            help="Path to the JSON file containing the KV cache "
+            "scaling factors. This should generally be supplied, when "
+            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
+            "default to 1.0, which may cause accuracy issues. ",
        )
        parser.add_argument(
            "--quantization",