[Feature] Support fp8 e5m2 kv cache with flashinfer (#1204)

Co-authored-by: Yineng Zhang <me@zhyncs.com>
This commit is contained in:
Ke Bao
2024-08-26 08:38:11 +08:00
committed by GitHub
parent 61bb223e0f
commit 2c615d120f
5 changed files with 116 additions and 16 deletions

View File

@@ -33,6 +33,7 @@ class ServerArgs:
skip_tokenizer_init: bool = False
load_format: str = "auto"
dtype: str = "auto"
kv_cache_dtype: str = "auto"
trust_remote_code: bool = True
context_length: Optional[int] = None
quantization: Optional[str] = None
@@ -196,6 +197,13 @@ class ServerArgs:
'* "float" is shorthand for FP32 precision.\n'
'* "float32" for FP32 precision.',
)
# CLI flag selecting the data type used to store the KV cache.
# The default is taken from the ServerArgs.kv_cache_dtype field, and argparse
# rejects any value outside `choices` ("auto" or "fp8_e5m2").
parser.add_argument(
"--kv-cache-dtype",
type=str,
default=ServerArgs.kv_cache_dtype,
choices=["auto", "fp8_e5m2"],
help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.',
)
parser.add_argument(
"--trust-remote-code",
action="store_true",