From 433266c12567b9ce2fcb16a5d80267e2d5a1a311 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Mon, 25 Aug 2025 15:02:31 +0800 Subject: [PATCH] Reintroduce memory usage fix (#9535) --- python/sglang/srt/layers/quantization/modelopt_quant.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py index 6d3b76950..9d7307c16 100755 --- a/python/sglang/srt/layers/quantization/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -1212,11 +1212,13 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase): # Process w13 weights w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale) + del layer.w13_weight_scale layer.w13_blockscale_swizzled.data.copy_(w13_blockscale_swizzled) layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False) # Process w2 weights w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale) + del layer.w2_weight_scale layer.w2_blockscale_swizzled.data.copy_(w2_blockscale_swizzled) layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)