Optimized deepseek-v3/r1 model performance on mxfp4 run (#10008)

Co-authored-by: wunhuang <wunhuang@amd.com>
Co-authored-by: HAI <hixiao@gmail.com>
Co-authored-by: Hubert Lu <55214931+hubertlu-tw@users.noreply.github.com>
This commit is contained in:
kk
2025-09-05 06:11:22 +08:00
committed by GitHub
parent 93088b6975
commit e96973742c
8 changed files with 486 additions and 64 deletions

View File

@@ -2900,6 +2900,18 @@ def mxfp_supported():
return False
@lru_cache(maxsize=1)
def is_gfx95_supported():
    """
    Return True when running on a ROCm (HIP) build whose GPU
    architecture is gfx95* (e.g. gfx950), False otherwise — including
    CUDA and CPU-only builds, where ``torch.version.hip`` is None.

    Cached with ``lru_cache`` because the device architecture cannot
    change within a process.
    """
    if torch.version.hip:
        # gcnArchName is e.g. "gfx950:sramecc+:xnack-"; a substring
        # check covers all gfx95x variants.
        gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
        return "gfx95" in gcn_arch
    return False
# LoRA-related constants and utilities
SUPPORTED_LORA_TARGET_MODULES = [
"q_proj",