[NVIDIA] Fix num_experts in modelopt_quant (#8811)
This commit is contained in:
@@ -764,8 +764,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
|
||||
)
|
||||
|
||||
# TODO(ch-wan): check if this is needed
|
||||
-layer.num_experts = num_experts
-layer.num_local_experts = num_experts
|
||||
layer.intermediate_size_per_partition = intermediate_size_per_partition
|
||||
layer.params_dtype = params_dtype
|
||||
layer.quant_config = self.quant_config
|
||||
@@ -1106,7 +1104,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
|
||||
layer.w13_weight_scale,
|
||||
)
|
||||
|
||||
-print("Applied flashinfer weight processing for both w13 and w2")
+logger.info_once("Applied flashinfer weight processing for both w13 and w2")
|
||||
|
||||
else:
|
||||
# CUTLASS processing - handle w13 and w2 separately
|
||||
@@ -1126,7 +1124,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
|
||||
layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
|
||||
|
||||
# Both flashinfer cutlass and regular cutlass use same processing for w2
|
||||
-print("Applied weight processing for both w13 and w2")
+logger.info_once("Applied weight processing for both w13 and w2")
|
||||
|
||||
# Set up CUTLASS MoE parameters
|
||||
device = layer.w13_weight.device
|
||||
|
||||
Reference in New Issue
Block a user