feat: support DeepSeek-R1-W4AFP8 model with ep-moe mode (#7762)

Signed-off-by: yangsijia.614 <yangsijia.614@bytedance.com>
This commit is contained in:
SijiaYang
2025-07-08 05:47:21 +08:00
committed by GitHub
parent 6a6e0bb7fd
commit cb9d91ea8a
10 changed files with 1006 additions and 9 deletions

View File

@@ -359,7 +359,17 @@ class ModelConfig:
if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
quant_cfg = modelopt_quant_config
elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
quant_cfg = modelopt_quant_config
quant_config_file = os.path.join(
self.model_path, "hf_quant_config.json"
)
with open(quant_config_file) as f:
quant_config_dict = json.load(f)
json_quant_configs = quant_config_dict["quantization"]
quant_algo = json_quant_configs.get("quant_algo", None)
if quant_algo == "MIXED_PRECISION":
quant_cfg = {"quant_method": "w4afp8"}
else:
quant_cfg = modelopt_quant_config
return quant_cfg
# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
@@ -389,6 +399,7 @@ class ModelConfig:
"w8a8_fp8",
"moe_wna16",
"qoq",
"w4afp8",
]
compatible_quantization_methods = {
"modelopt_fp4": ["modelopt"],