Refactor MoE (#2575)

Co-authored-by: zhyncs <me@zhyncs.com>
2024-12-26 00:02:14 +08:00
parent 8a56b43175
commit 53aed988cb
9 changed files with 1012 additions and 49 deletions
--- a/python/sglang/srt/layers/quantization/fp8.py
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -9,6 +9,7 @@ import torch.nn.functional as F
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 from vllm import _custom_ops as ops
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
@@ -32,7 +33,11 @@ from sglang.srt.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,
 )
-from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+    BlockQuantScaleParameter,
+    apply_w8a8_block_fp8_linear,
+    normalize_e4m3fn_to_e4m3fnuz,
+)
 from sglang.srt.utils import (
    get_bool_env_var,
    is_hip,
@@ -53,6 +58,7 @@ class Fp8Config(QuantizationConfig):
        is_checkpoint_fp8_serialized: bool = False,
        activation_scheme: str = "dynamic",
        ignored_layers: Optional[List[str]] = None,
+        weight_block_size: List[int] = None,
    ) -> None:
        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
        if is_checkpoint_fp8_serialized:
@@ -64,6 +70,20 @@ class Fp8Config(QuantizationConfig):
            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
        self.activation_scheme = activation_scheme
        self.ignored_layers = ignored_layers or []
+        if weight_block_size is not None:
+            if not is_checkpoint_fp8_serialized:
+                raise ValueError(
+                    f"The block-wise quantization only supports fp8-serialized checkpoint for now."
+                )
+            if len(weight_block_size) != 2:
+                raise ValueError(
+                    f"The quantization block size of weight must have 2 dimensions, but got {len(weight_block_size)} dimensions."
+                )
+            if activation_scheme != "dynamic":
+                raise ValueError(
+                    f"The block-wise quantization only supports dynamic activation scheme for now, but got {activation_scheme} activation scheme."
+                )
+        self.weight_block_size = weight_block_size

    @classmethod
    def get_name(cls) -> str:
@@ -87,10 +107,12 @@ class Fp8Config(QuantizationConfig):
        is_checkpoint_fp8_serialized = "fp8" in quant_method
        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
        ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
+        weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None)
        return cls(
            is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
            activation_scheme=activation_scheme,
            ignored_layers=ignored_layers,
+            weight_block_size=weight_block_size,
        )

    def get_quant_method(
@@ -143,6 +165,11 @@ class Fp8LinearMethod(LinearMethodBase):
        if is_hip():
            self.use_marlin = False

+        self.block_quant = self.quant_config.weight_block_size is not None
+        if self.block_quant:
+            # Marlin doesn't support block-wise fp8
+            self.use_marlin = False
+
    def create_weights(
        self,
        layer: torch.nn.Module,
@@ -153,10 +180,35 @@ class Fp8LinearMethod(LinearMethodBase):
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
-        del input_size, output_size
        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")

+        tp_size = get_tensor_model_parallel_world_size()
+        if self.block_quant:
+            block_n, block_k = (
+                self.quant_config.weight_block_size[0],
+                self.quant_config.weight_block_size[1],
+            )
+            # Required by row parallel
+            if tp_size > 1 and input_size // input_size_per_partition == tp_size:
+                if input_size_per_partition % block_k != 0:
+                    raise ValueError(
+                        f"Weight input_size_per_partition = "
+                        f"{input_size_per_partition} is not divisible by "
+                        f"weight quantization block_k = {block_k}."
+                    )
+            # Required by collum parallel or enabling merged weights
+            if (
+                tp_size > 1 and output_size // output_size_per_partition == tp_size
+            ) or len(output_partition_sizes) > 1:
+                for output_partition_size in output_partition_sizes:
+                    if output_partition_size % block_n != 0:
+                        raise ValueError(
+                            f"Weight output_partition_size = "
+                            f"{output_partition_size} is not divisible by "
+                            f"weight quantization block_n = {block_n}."
+                        )
+
        layer.logical_widths = output_partition_sizes

        layer.input_size_per_partition = input_size_per_partition
@@ -184,13 +236,27 @@ class Fp8LinearMethod(LinearMethodBase):
        # Otherwise, wait until process_weights_after_loading.
        if self.quant_config.is_checkpoint_fp8_serialized:
            # WEIGHT SCALE
-            scale = PerTensorScaleParameter(
-                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
-                weight_loader=weight_loader,
-            )
-
-            scale[:] = torch.finfo(torch.float32).min
-            layer.register_parameter("weight_scale", scale)
+            if self.block_quant:
+                assert self.quant_config.activation_scheme == "dynamic"
+                scale = BlockQuantScaleParameter(
+                    data=torch.empty(
+                        (output_size_per_partition + block_n - 1) // block_n,
+                        (input_size_per_partition + block_k - 1) // block_k,
+                        dtype=torch.float32,
+                    ),
+                    input_dim=1,
+                    output_dim=0,
+                    weight_loader=weight_loader,
+                )
+                scale[:] = torch.finfo(torch.float32).min
+                layer.register_parameter("weight_scale_inv", scale)
+            else:
+                scale = PerTensorScaleParameter(
+                    data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
+                scale[:] = torch.finfo(torch.float32).min
+                layer.register_parameter("weight_scale", scale)

            # INPUT ACTIVATION SCALE
            if self.quant_config.activation_scheme == "static":
@@ -205,6 +271,9 @@ class Fp8LinearMethod(LinearMethodBase):
                layer.register_parameter("input_scale", None)

    def process_weights_after_loading(self, layer: Module) -> None:
+        # Block quant doesn't need to process weights after loading
+        if self.block_quant:
+            return
        layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
        # If checkpoint not serialized fp8, quantize the weights.
        if not self.quant_config.is_checkpoint_fp8_serialized:
@@ -295,6 +364,16 @@ class Fp8LinearMethod(LinearMethodBase):
                bias=bias,
            )

+        if self.block_quant:
+            return apply_w8a8_block_fp8_linear(
+                input=x,
+                weight=layer.weight,
+                block_size=self.quant_config.weight_block_size,
+                weight_scale=layer.weight_scale_inv,
+                input_scale=layer.input_scale,
+                bias=bias,
+            )
+
        return apply_fp8_linear(
            input=x,
            weight=layer.weight,
@@ -339,6 +418,7 @@ class Fp8MoEMethod:

    def __init__(self, quant_config):
        self.quant_config = quant_config
+        self.block_quant = self.quant_config.weight_block_size is not None

    def create_weights(
        self,
@@ -353,6 +433,28 @@ class Fp8MoEMethod:

        if self.quant_config.is_checkpoint_fp8_serialized:
            params_dtype = torch.float8_e4m3fn
+        tp_size = get_tensor_model_parallel_world_size()
+        if self.block_quant:
+            block_n, block_k = (
+                self.quant_config.weight_block_size[0],
+                self.quant_config.weight_block_size[1],
+            )
+            # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
+            # Required by collum parallel or enabling merged weights
+            if intermediate_size % block_n != 0:
+                raise ValueError(
+                    f"The output_size of gate's and up's weight = "
+                    f"{intermediate_size} is not divisible by "
+                    f"weight quantization block_n = {block_n}."
+                )
+            if tp_size > 1:
+                # Required by row parallel
+                if intermediate_size % block_k != 0:
+                    raise ValueError(
+                        f"The input_size of down's weight = "
+                        f"{intermediate_size} is not divisible by "
+                        f"weight quantization block_k = {block_k}."
+                    )

        # WEIGHTS
        w13_weight = torch.nn.Parameter(
@@ -374,21 +476,45 @@ class Fp8MoEMethod:
        set_weight_attrs(w2_weight, extra_weight_attrs)

        # WEIGHT_SCALES
-        # Allocate 2 scales for w1 and w3 respectively.
-        # They will be combined to a single scale after weight loading.
-        w13_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
-
-        w2_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
-        )
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        if self.block_quant:
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * ((intermediate_size + block_n - 1) // block_n),
+                    (hidden_size + block_k - 1) // block_k,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    (hidden_size + block_n - 1) // block_n,
+                    (intermediate_size + block_k - 1) // block_k,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
+            assert self.quant_config.activation_scheme == "dynamic"
+        else:
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            layer.register_parameter("w13_weight_scale", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale", w2_weight_scale)
        # Add the quantization method used (per tensor/grouped/channel)
        # to ensure the weight scales are loaded in properly
        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+            if self.block_quant
+            else {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
        )
        # If loading fp8 checkpoint, pass the weight loaders.
        # If loading an fp16 checkpoint, do not (we will quantize in
@@ -422,7 +548,9 @@ class Fp8MoEMethod:
            layer.w2_input_scale = None

    def process_weights_after_loading(self, layer: Module) -> None:
-
+        # Block quant doesn't need to process weights after loading
+        if self.block_quant:
+            return
        # If checkpoint is fp16 or bfloat16, quantize in place.
        if not self.quant_config.is_checkpoint_fp8_serialized:
            # If ROCm, use float8_e4m3fnuz instead (MI300x HW)
@@ -519,7 +647,6 @@ class Fp8MoEMethod:
                    layer.w2_input_scale = torch.nn.Parameter(
                        w2_input_scale, requires_grad=False
                    )
-
            # Fp8 moe kernel needs single weight scale for w13 per expert.
            # We take the max then dequant and requant each expert.
            assert layer.w13_weight_scale is not None
@@ -594,10 +721,17 @@ class Fp8MoEMethod:
            topk_ids=topk_ids,
            inplace=True,
            use_fp8_w8a8=True,
-            w1_scale=layer.w13_weight_scale,
-            w2_scale=layer.w2_weight_scale,
+            w1_scale=(
+                layer.w13_weight_scale_inv
+                if self.block_quant
+                else layer.w13_weight_scale
+            ),
+            w2_scale=(
+                layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale
+            ),
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
+            block_shape=self.quant_config.weight_block_size,
        )