feat: support DeepSeek-R1-W4AFP8 model with ep-moe mode (#7762)

Signed-off-by: yangsijia.614 <yangsijia.614@bytedance.com>
2025-07-08 05:47:21 +08:00
parent 6a6e0bb7fd
commit cb9d91ea8a
10 changed files with 1006 additions and 9 deletions
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -359,7 +359,17 @@ class ModelConfig:
                if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
                    quant_cfg = modelopt_quant_config
            elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
-                quant_cfg = modelopt_quant_config
+                quant_config_file = os.path.join(
+                    self.model_path, "hf_quant_config.json"
+                )
+                with open(quant_config_file) as f:
+                    quant_config_dict = json.load(f)
+                json_quant_configs = quant_config_dict["quantization"]
+                quant_algo = json_quant_configs.get("quant_algo", None)
+                if quant_algo == "MIXED_PRECISION":
+                    quant_cfg = {"quant_method": "w4afp8"}
+                else:
+                    quant_cfg = modelopt_quant_config
        return quant_cfg

    # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
@@ -389,6 +399,7 @@ class ModelConfig:
            "w8a8_fp8",
            "moe_wna16",
            "qoq",
+            "w4afp8",
        ]
        compatible_quantization_methods = {
            "modelopt_fp4": ["modelopt"],