[Feature] Support fp8 e5m2 kv cache with flashinfer (#1204)
Co-authored-by: Yineng Zhang <me@zhyncs.com>
This commit is contained in:
@@ -33,6 +33,7 @@ class ServerArgs:
|
||||
skip_tokenizer_init: bool = False
|
||||
load_format: str = "auto"
|
||||
dtype: str = "auto"
|
||||
kv_cache_dtype: str = "auto"
|
||||
trust_remote_code: bool = True
|
||||
context_length: Optional[int] = None
|
||||
quantization: Optional[str] = None
|
||||
@@ -196,6 +197,13 @@ class ServerArgs:
|
||||
'* "float" is shorthand for FP32 precision.\n'
|
||||
'* "float32" for FP32 precision.',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--kv-cache-dtype",
|
||||
type=str,
|
||||
default=ServerArgs.kv_cache_dtype,
|
||||
choices=["auto", "fp8_e5m2"],
|
||||
help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust-remote-code",
|
||||
action="store_true",
|
||||
|
||||
Reference in New Issue
Block a user