Support Triton fp8 e5m2 kv cache (#1286)

Co-authored-by: Yineng Zhang <me@zhyncs.com>
This commit is contained in:
Ke Bao
2024-09-01 17:46:40 +08:00
committed by GitHub
parent 761b2cebd6
commit 6cb32ef92c
2 changed files with 13 additions and 11 deletions

View File

@@ -348,13 +348,7 @@ class ModelRunner:
if self.server_args.kv_cache_dtype == "auto":
self.kv_cache_dtype = self.dtype
elif self.server_args.kv_cache_dtype == "fp8_e5m2":
if self.server_args.disable_flashinfer or self.server_args.enable_mla:
logger.warning(
"FP8 KV cache is not supported for Triton kernel now, using auto kv cache dtype"
)
self.kv_cache_dtype = self.dtype
else:
self.kv_cache_dtype = torch.float8_e5m2
self.kv_cache_dtype = torch.float8_e5m2
else:
raise ValueError(
f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}."