First commit

2025-08-05 19:02:46 +08:00
parent 9efe891f99
commit 99fb9f5cb0
1412 changed files with 203615 additions and 0 deletions
--- a/pkgs/xformers/benchmarks/benchmark_triton_fused_linear.py
+++ b/pkgs/xformers/benchmarks/benchmark_triton_fused_linear.py
@@ -0,0 +1,160 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Any, Dict, List, Optional
+
+import torch
+import triton
+
+from xformers.benchmarks.utils import TestCase, pretty_plot, pretty_print
+from xformers.components import Activation, build_activation
+from xformers.triton.fused_linear_layer import FusedLinear
+
+# Input shapes swept by the benchmark. Each entry is unpacked as (B, M, K)
+# in bench_linear and used to build a random input of that shape.
+SHAPES = [
+    (8, 512, 256),  # Batch x Seq x Embedding
+    (8, 512, 512),
+    (4, 512, 1024),
+    (2, 512, 2048),
+    (2, 512, 4096),
+    (2, 512, 8192),
+]
+
+# Allow TF32 in PyTorch CUDA matmuls and in cuDNN. Per the original note,
+# Triton accumulates in TF32 as well, so this keeps the comparison fair.
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+
+
+def get_metrics_transform(
+    activation: Optional[Activation],
+    a: torch.Tensor,
+    w: torch.Tensor,
+    b: Optional[torch.Tensor],
+    backward: bool,
+):
+    """Build a converter from a measured duration to an estimated TFlops/s.
+
+    The FLOP estimate covers the matmul of the linear layer, plus optional
+    activation and bias terms, and is extended with the gradient computations
+    when ``backward`` is set.
+
+    Args:
+        activation: activation applied on top of the linear layer, or None.
+        a: input tensor, indexed here as (batch, sequence, in_features).
+        w: weight of the linear layer, in the ``torch.nn.Linear`` layout
+            (out_features, in_features).
+        b: bias of the linear layer, or None when the layer has no bias.
+        backward: whether the timed step also runs the backward pass.
+
+    Returns:
+        A callable mapping a duration in milliseconds to TFlops/second.
+    """
+    batch, seq, in_features = a.shape[0], a.shape[1], a.shape[2]
+
+    # torch.nn.Linear stores its weight as (out_features, in_features), so the
+    # number of output columns is the FIRST dimension. Reading shape[1] here
+    # would return in_features and under-count the matmul.
+    out_features = w.shape[0]
+
+    # all operations will involve a * weight: every output element needs
+    # in_features multiplications and in_features - 1 additions.
+    matmul_flop = batch * seq * out_features * (2 * in_features - 1)
+    flop = matmul_flop
+
+    # optional activation on top
+    # NOTE(review): counted on the input element count, while the activation
+    # runs on the layer output; lower-order term, left as is. TODO confirm.
+    if activation is not None:
+        flop += a.numel()
+
+    # optionally * 2 (before the bias) if backward
+    if backward:
+        flop *= 2
+
+        # backward will also output a gradient with respect to the bias
+        # which consolidates on all the activation gradient
+        flop += batch * seq * out_features
+
+        # backward will also output another gradient with respect to the weight,
+        # which is another matmul, in between the grad_out and the inputs this time
+        flop += matmul_flop
+
+    # optional bias on top
+    # NOTE(review): counts one operation per bias element rather than one per
+    # output element; lower-order term, left as is. TODO confirm.
+    if b is not None:
+        flop += b.numel()
+
+    def metric_conversion(ms):
+        # Returns TFlops/second
+        return flop * 1e-12 / (ms * 1e-3)
+
+    return metric_conversion
+
+
+def bench_linear(activations: List[Optional[Activation]]):
+    """Benchmark FusedLinear against torch.nn.Linear followed by an activation.
+
+    Sweeps dtype (fp32, fp16), forward-only vs forward+backward, every entry
+    of ``activations``, bias on/off and every shape in ``SHAPES``. Each layer
+    maps K input features to 4 * K output features. Timings are converted to
+    TFlops/s with ``get_metrics_transform``, then handed to ``pretty_print``
+    and ``pretty_plot`` once per (dtype, backward, activation) combination.
+
+    Args:
+        activations: activations to benchmark; None means no activation.
+    """
+    device = torch.device("cuda")
+
+    for dtype in [
+        torch.float32,
+        torch.float16,
+    ]:
+        for backward in [True, False]:
+
+            for activation in activations:
+                # One result table per (dtype, backward, activation) combination
+                results: Dict[str, Any] = {}
+
+                for bias in [False, True]:
+                    # The label fragments do not depend on the shape
+                    bias_label = "no" if not bias else ""
+                    pass_label = "+bw" if backward else ""
+
+                    for B, M, K in SHAPES:
+                        a = torch.rand(
+                            B, M, K, device=device, dtype=dtype, requires_grad=backward
+                        )
+
+                        # Pytorch linear layer + activation
+                        torch_linear = torch.nn.Linear(K, 4 * K, bias=bias).to(
+                            dtype=dtype, device=device
+                        )
+                        torch_activation = build_activation(activation)
+
+                        # Fused layer equivalent
+                        fused_linear = FusedLinear(
+                            K, 4 * K, bias=bias, activation=activation
+                        ).to(dtype=dtype, device=device)
+
+                        def torch_step(x):
+                            y = torch_activation(torch_linear(x))
+                            if backward:
+                                torch.norm(y).backward()
+                            return y
+
+                        def triton_step(x):
+                            y = fused_linear(x)
+
+                            if backward:
+                                torch.norm(y).backward()
+                            return y
+
+                        metrics_transform = get_metrics_transform(
+                            activation,
+                            a,
+                            torch_linear.weight,
+                            torch_linear.bias,
+                            backward,
+                        )
+
+                        key = f"B={B}, M={M}, K={K}"
+
+                        for testcase in [
+                            TestCase(
+                                torch_step,
+                                "pytorch - {} - {} bias - fw{}".format(
+                                    activation,
+                                    bias_label,
+                                    pass_label,
+                                ),
+                            ),
+                            TestCase(
+                                triton_step,
+                                "triton  - {} - {} bias - fw{}".format(
+                                    activation,
+                                    bias_label,
+                                    pass_label,
+                                ),
+                            ),
+                        ]:
+                            timing = triton.testing.do_bench(
+                                lambda: testcase.function(a)
+                            )
+                            # Depending on the Triton release, do_bench returns
+                            # either a sequence of timings (the first entry is
+                            # used) or a single float; accept both.
+                            time_ms = (
+                                timing[0]
+                                if isinstance(timing, (tuple, list))
+                                else timing
+                            )
+
+                            if key not in results:
+                                results[key] = {}
+
+                            metric = metrics_transform(time_ms)
+                            results[key][testcase.name] = f"{metric:.1f}"
+
+                pretty_print(
+                    results,
+                    title="\n --- Type: {} ---".format(dtype),
+                    units="TFlops/s",
+                )
+
+                _type = "_fp16" if dtype == torch.float16 else "_fp32"
+                title = "FusedLinear" + _type + "_FW"
+                if backward:
+                    title += "_BW"
+                title += "_" + activation.value if activation else "_none"
+                pretty_plot(results, title, "TFlops/s", dash_key="pytorch")
+
+
+# Benchmark every member of Activation, plus None for the no-activation case.
+activations = list(Activation) + [None]  # type: ignore
+bench_linear(activations)  # type: ignore