From a437aa99870d805ac59de74b3962088b3a916bf2 Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Sat, 2 Aug 2025 22:59:25 -0700 Subject: [PATCH] [hotfix] fix mixtral with tensor-level compressed-tensor quantization (#8721) --- .../compressed_tensors/compressed_tensors_moe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 525a75069..c6da7e149 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -23,6 +23,7 @@ from sglang.srt.layers.quantization.utils import ( from sglang.srt.utils import is_cpu, is_cuda, is_hip, is_npu, set_weight_attrs if TYPE_CHECKING: + from sglang.srt.layers.moe.fused_moe_triton import FusedMoE from sglang.srt.layers.moe.topk import TopKOutput from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import ( CompressedTensorsConfig, @@ -189,7 +190,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): layer.w13_input_scale = None layer.w2_input_scale = None - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: FusedMoE) -> None: # Fp8 moe kernels require a single activation scale. # We take the max of all the scales in case they differ. if self.static_input_scales: @@ -246,7 +247,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): assert layer.w13_weight_scale is not None shard_size = layer.intermediate_size_per_partition max_w13_scales = layer.w13_weight_scale.max(dim=1).values - for expert_id in range(layer.local_num_experts): + for expert_id in range(layer.num_local_experts): start = 0 for shard_id in range(2): dq_weight = per_tensor_dequantize(