Support FP8 E4M3 KV Cache (#2786)
Co-authored-by: root <bjmsong@126.com>
This commit is contained in:
@@ -32,6 +32,7 @@ from sglang.srt.utils import (
|
||||
is_hip,
|
||||
is_ipv6,
|
||||
is_port_available,
|
||||
nullable_str,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -47,6 +48,7 @@ class ServerArgs:
|
||||
trust_remote_code: bool = True
|
||||
dtype: str = "auto"
|
||||
kv_cache_dtype: str = "auto"
|
||||
quantization_param_path: nullable_str = None
|
||||
quantization: Optional[str] = None
|
||||
context_length: Optional[int] = None
|
||||
device: str = "cuda"
|
||||
@@ -350,8 +352,17 @@ class ServerArgs:
|
||||
"--kv-cache-dtype",
|
||||
type=str,
|
||||
default=ServerArgs.kv_cache_dtype,
|
||||
choices=["auto", "fp8_e5m2"],
|
||||
help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.',
|
||||
choices=["auto", "fp8_e5m2", "fp8_e4m3"],
|
||||
help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--quantization-param-path",
|
||||
type=nullable_str,
|
||||
default=None,
|
||||
help="Path to the JSON file containing the KV cache "
|
||||
"scaling factors. This should generally be supplied, when "
|
||||
"KV cache dtype is FP8. Otherwise, KV cache scaling factors "
|
||||
"default to 1.0, which may cause accuracy issues. ",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--quantization",
|
||||
|
||||
Reference in New Issue
Block a user