First commit

2025-08-05 19:02:46 +08:00
parent 9efe891f99
commit 99fb9f5cb0
1412 changed files with 203615 additions and 0 deletions
--- a/pkgs/xformers/_flash_attn/ops/init.py
+++ b/pkgs/xformers/_flash_attn/ops/init.py
--- a/pkgs/xformers/_flash_attn/ops/pycache/init.cpython-310.pyc
+++ b/pkgs/xformers/_flash_attn/ops/pycache/init.cpython-310.pyc
--- a/pkgs/xformers/_flash_attn/ops/pycache/activations.cpython-310.pyc
+++ b/pkgs/xformers/_flash_attn/ops/pycache/activations.cpython-310.pyc
--- a/pkgs/xformers/_flash_attn/ops/pycache/fused_dense.cpython-310.pyc
+++ b/pkgs/xformers/_flash_attn/ops/pycache/fused_dense.cpython-310.pyc
--- a/pkgs/xformers/_flash_attn/ops/pycache/layer_norm.cpython-310.pyc
+++ b/pkgs/xformers/_flash_attn/ops/pycache/layer_norm.cpython-310.pyc
--- a/pkgs/xformers/_flash_attn/ops/pycache/rms_norm.cpython-310.pyc
+++ b/pkgs/xformers/_flash_attn/ops/pycache/rms_norm.cpython-310.pyc
--- a/pkgs/xformers/_flash_attn/ops/activations.py
+++ b/pkgs/xformers/_flash_attn/ops/activations.py
@@ -0,0 +1,99 @@
+# Copied from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/model/layers/activations.py
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+# 1/sqrt(2*pi)-> 0.3989423
+# 1/sqrt(2)   -> 0.70710678
+# sqrt(2/pi)  -> 0.79788456
+
+# this function is tanh approximation of gelu
+# actual gelu is:
+# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
+@torch.jit.script
+def bias_gelu(y, bias):
+    """Tanh-approximated GeLU of (bias + y); the result is cast back to y's dtype.
+    """
+    pre_act = bias + y
+    tanh_arg = 0.79788456 * pre_act * (1 + 0.044715 * pre_act * pre_act)
+    gelu_out = pre_act * 0.5 * (1.0 + torch.tanh(tanh_arg))
+    return gelu_out.to(dtype=y.dtype)
+
+# gradient of tanh approximation of gelu
+# gradient of actual gelu is:
+# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
+@torch.jit.script
+def bias_gelu_back(g, y, bias):
+    """Backward of the tanh-approximated GeLU applied to (bias + y).
+
+    Assume that y has shape (B, D) and bias has shape (D).
+    Returns a pair: the gradient w.r.t. y (cast to y's dtype) and the gradient w.r.t. bias
+    (the same gradient summed over dim 0, accumulated in bias's dtype).
+    """
+    pre_act = bias + y
+    tanh_out = torch.tanh(0.79788456 * pre_act * (1 + 0.044715 * pre_act * pre_act))
+    # Derivative of tanh is 1 - tanh^2
+    dtanh = 1 - tanh_out * tanh_out
+    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
+    ff = 0.5 * pre_act * (dtanh * (0.79788456 + 0.1070322243 * pre_act * pre_act)) + 0.5 * (1 + tanh_out)
+    grad_y = ff * g
+    grad_bias = grad_y.sum(dim=(0), dtype=bias.dtype)
+    return grad_y.to(dtype=y.dtype), grad_bias
+
+
+class GeLUFunction(torch.autograd.Function):
+    """Autograd function computing the tanh-approximated GeLU of (input + bias).
+
+    The forward calls bias_gelu and the backward calls bias_gelu_back, which assumes
+    input has shape (B, D) and bias has shape (D).
+    """
+
+    @staticmethod
+    def forward(ctx, input, bias):
+        """Return gelu(input + bias); both tensors are saved for the backward pass."""
+        ctx.save_for_backward(input, bias)
+        return bias_gelu(input, bias)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return one gradient per forward input: (grad_input, grad_bias)."""
+        input, bias = ctx.saved_tensors
+        # bias_gelu_back returns a (grad_input, grad_bias) pair, so it must be unpacked;
+        # returning the pair twice would hand autograd a tuple instead of a tensor.
+        grad_input, grad_bias = bias_gelu_back(grad_output, input, bias)
+        return grad_input, grad_bias
+
+
+bias_gelu_impl = GeLUFunction.apply
+
+# this function is tanh approximation of gelu
+# actual gelu is:
+# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
+@torch.jit.script
+def gelu_fwd(x):
+    """Tanh-approximated GeLU of x, returned in x's dtype.
+    """
+    tanh_arg = 0.79788456 * x * (1 + 0.044715 * x * x)
+    gelu_out = x * 0.5 * (1.0 + torch.tanh(tanh_arg))
+    return gelu_out.to(dtype=x.dtype)
+
+# gradient of tanh approximation of gelu
+# gradient of actual gelu is:
+# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
+@torch.jit.script
+def gelu_bwd(g, x):
+    """Gradient of the tanh-approximated GeLU at x, multiplied by the incoming gradient g.
+
+    The result is cast to x's dtype.
+    """
+    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+    # Derivative of tanh is 1 - tanh^2
+    dtanh = 1 - tanh_out * tanh_out
+    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
+    ff = 0.5 * x * (dtanh * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
+    grad_x = ff * g
+    return grad_x.to(dtype=x.dtype)
+
+
+class FastGeLUFunction(torch.autograd.Function):
+    """Autograd function for the tanh-approximated GeLU (no bias), built on gelu_fwd / gelu_bwd."""
+
+    @staticmethod
+    def forward(ctx, input):
+        """Return gelu_fwd(input); input is saved for the backward pass."""
+        ctx.save_for_backward(input)
+        return gelu_fwd(input)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return the gradient w.r.t. input, computed by gelu_bwd."""
+        (saved_input,) = ctx.saved_tensors
+        return gelu_bwd(grad_output, saved_input)
+
+fast_gelu_impl = FastGeLUFunction.apply
+
+
+@torch.jit.script
+def relu_bwd(g, x):
+    """Pass g through where x >= 0 (including x == 0) and zero it elsewhere; result in x's dtype.
+    """
+    keep = x >= 0
+    grad_x = torch.where(keep, g, 0.0)
+    return grad_x.to(dtype=x.dtype)
+
+
+@torch.jit.script
+def sqrelu_fwd(x):
+    """Squared ReLU: relu(x) ** 2, returned in x's dtype.
+    """
+    rectified = F.relu(x)
+    squared = rectified * rectified
+    return squared.to(dtype=x.dtype)
+
+
+@torch.jit.script
+def sqrelu_bwd(g, x):
+    """Gradient of squared ReLU: 2 * relu(x), multiplied by the incoming gradient g.
+
+    The result is cast to x's dtype.
+    """
+    rectified = F.relu(x)
+    grad_x = 2.0 * g * rectified
+    return grad_x.to(dtype=x.dtype)
--- a/pkgs/xformers/_flash_attn/ops/fused_dense.py
+++ b/pkgs/xformers/_flash_attn/ops/fused_dense.py
@@ -0,0 +1,527 @@
+# Copyright (c) 2023, Tri Dao.
+# Inspired by https://github.com/NVIDIA/apex/blob/master/apex/fused_dense/fused_dense.py
+# We make it work with pytorch amp and with bfloat16.
+# The TensorParallel linear modules are inspired by https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/layers.py
+from typing import Optional
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.distributed import ProcessGroup
+from torch.cuda.amp import custom_bwd, custom_fwd
+
+# import fused_dense_cuda  # from apex
+import fused_dense_lib as fused_dense_cuda
+
+from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_fwd, sqrelu_bwd
+from flash_attn.utils.distributed import all_gather_raw, reduce_scatter_raw, all_reduce_raw
+from flash_attn.utils.distributed import reduce_scatter, all_reduce
+
+
+class FusedDenseFunc(torch.autograd.Function):
+    """Autograd function for a linear layer, with optional tensor-parallel communication.
+
+    The forward computes F.linear(x, weight, bias). The backward computes the weight and
+    bias gradients with fused_dense_cuda.linear_bias_wgrad and overlaps the collectives on x
+    and grad_input with the matmuls.
+    """
+
+    @staticmethod
+    @custom_fwd
+    def forward(ctx, x, weight, bias, return_residual=False, process_group=None,
+                sequence_parallel=True):
+        """
+        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
+        with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
+
+        Returns the linear output, or the tuple (output, x) if return_residual is True, where
+        x is the contiguous (and, under autocast, dtype-converted) input.
+        Raises RuntimeError if the smallest of batch_dim, the input feature dim and the weight
+        dims exceeds 65535 * 32.
+        """
+        # x only needs to be kept for the backward if the weight gradient will be computed
+        ctx.compute_weight_gradient = weight.requires_grad
+        ctx.return_residual = return_residual
+        ctx.process_group = process_group
+        ctx.sequence_parallel = sequence_parallel
+
+        if torch.is_autocast_enabled():
+            x = x.to(dtype=torch.get_autocast_gpu_dtype())
+        x = x.contiguous()
+        if process_group is not None and sequence_parallel:
+            # We want to kick off the all_gather early, before weight dtype conversion
+            total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
+        else:
+            total_x = x
+
+        if torch.is_autocast_enabled():
+            weight = weight.to(dtype=torch.get_autocast_gpu_dtype())
+            bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None
+        weight = weight.contiguous()
+        if process_group is not None and sequence_parallel:
+            # The gathered input must be complete before it is used in the matmul
+            handle_x.wait()
+        batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
+        batch_dim = batch_shape.numel()
+        # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
+        if min(batch_dim, n, *weight.shape) > 65535 * 32:
+            raise RuntimeError('fused_dense only supports matrix dims <= 2M')
+        output = F.linear(total_x, weight, bias)
+        # Save the local (un-gathered) x; the backward re-gathers it when needed
+        if ctx.compute_weight_gradient:
+            ctx.save_for_backward(x, weight)
+        else:
+            ctx.save_for_backward(weight)
+        return output if not return_residual else (output, x)
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, grad_output, *args):
+        """Compute gradients for (x, weight, bias); the three non-tensor inputs get None.
+
+        When return_residual was set in the forward, args holds the gradient flowing into the
+        returned residual x, which is accumulated into grad_input.
+        """
+        grad_output = grad_output.contiguous()
+        if ctx.return_residual:
+            grad_input, = args
+            grad_input = grad_input.contiguous()
+        process_group = ctx.process_group
+        sequence_parallel = ctx.sequence_parallel
+        if ctx.compute_weight_gradient:
+            x, weight = ctx.saved_tensors
+            if process_group is not None and sequence_parallel:
+                # Start re-gathering x now; it is only waited on right before the wgrad
+                total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
+            else:
+                total_x = x
+        else:
+            weight, = ctx.saved_tensors
+            total_x = None
+        batch_shape = grad_output.shape[:-1]
+        batch_dim = batch_shape.numel()
+        # Flatten all leading dims so the gradients are plain 2D matmuls
+        grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
+        if ctx.needs_input_grad[0]:
+            if not ctx.return_residual:
+                grad_input = F.linear(grad_output, weight.t())
+            else:
+                # Fuse the residual gradient addition with the matmul
+                grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]),
+                                         grad_output, weight)
+            grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
+            if process_group is not None:
+                # Launch the reduction asynchronously; it is waited on just before returning
+                reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
+                grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True)
+        else:
+            grad_input = None
+        if ctx.needs_input_grad[1]:
+            assert ctx.compute_weight_gradient
+            if process_group is not None and sequence_parallel:
+                handle_x.wait()
+            # The last argument tells the kernel whether the bias gradient is needed as well
+            grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad(
+                total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2]
+            )
+        else:
+            grad_weight = None
+            # NOTE(review): grad_output is returned un-summed here; presumably autograd reduces
+            # it to the bias shape -- confirm.
+            grad_bias = grad_output if ctx.needs_input_grad[2] else None
+        if process_group is not None and ctx.needs_input_grad[0]:
+            handle_grad_input.wait()
+        return grad_input, grad_weight, grad_bias, None, None, None
+
+
+def fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None,
+                     return_residual: bool = False, process_group: Optional[ProcessGroup] = None,
+                     sequence_parallel: bool = True):
+    """Linear layer that dispatches to FusedDenseFunc when the inputs are eligible.
+
+    The fused path is taken when x, weight and bias (if given) are CUDA tensors and x is
+    fp16/bf16, or fp32 with autocast enabled. Otherwise F.linear is used, which requires
+    process_group to be None.
+
+    Returns the output, or the tuple (output, x) if return_residual is True.
+    """
+    half_precision = x.dtype in [torch.float16, torch.bfloat16]
+    autocast_fp32 = x.dtype == torch.float32 and torch.is_autocast_enabled()
+    all_on_cuda = x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda)
+    if not (all_on_cuda and (half_precision or autocast_fp32)):
+        # Plain fallback: no communication is implemented on this path
+        assert process_group is None
+        out = F.linear(x, weight, bias)
+        return (out, x) if return_residual else out
+    return FusedDenseFunc.apply(x, weight, bias, return_residual, process_group,
+                                sequence_parallel)
+
+
+class FusedDense(nn.Linear):
+    """nn.Linear whose forward pass is routed through fused_dense_func."""
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True,
+                 return_residual: bool = False, device=None, dtype=None) -> None:
+        """
+        return_residual: if True, forward returns the tuple (output, x) instead of just output.
+        The remaining arguments are forwarded to nn.Linear.
+        """
+        super().__init__(in_features, out_features, bias=bias, device=device, dtype=dtype)
+        self.return_residual = return_residual
+
+    def forward(self, x, process_group=None):
+        """
+        If process_group is not None, we're doing Tensor Parallel with sequence parallelism:
+        we do an all_gather of x before doing the matmul.
+        """
+        return fused_dense_func(
+            x,
+            self.weight,
+            self.bias,
+            return_residual=self.return_residual,
+            process_group=process_group,
+        )
+
+
+class ColumnParallelLinear(nn.Linear):
+    """Linear layer holding out_features // world_size of the output features on each rank."""
+
+    def __init__(self, in_features: int, out_features: int, process_group: ProcessGroup,
+                 bias: bool = True, sequence_parallel=True, device=None, dtype=None) -> None:
+        """
+        Raises ValueError if out_features is not divisible by the world size of process_group.
+        """
+        world_size = torch.distributed.get_world_size(process_group)
+        if out_features % world_size:
+            raise ValueError(f'out_features ({out_features}) must be divisible by '
+                             f'world_size ({world_size})')
+        local_out_features = out_features // world_size
+        super().__init__(in_features, local_out_features, bias=bias,
+                         device=device, dtype=dtype)
+        self.process_group = process_group
+        self.sequence_parallel = sequence_parallel
+
+    def forward(self, x):
+        """
+        If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
+        we do an all_gather of x before doing the matmul.
+        If not, then the input is already gathered.
+        """
+        return fused_dense_func(
+            x,
+            self.weight,
+            self.bias,
+            process_group=self.process_group,
+            sequence_parallel=self.sequence_parallel,
+        )
+
+
+class RowParallelLinear(nn.Linear):
+    """Linear layer holding in_features // world_size of the input features on each rank."""
+
+    def __init__(self, in_features: int, out_features: int, process_group: ProcessGroup,
+                 bias: bool = True, sequence_parallel=True, device=None, dtype=None) -> None:
+        """
+        Raises ValueError if in_features is not divisible by the world size of process_group.
+        """
+        world_size = torch.distributed.get_world_size(process_group)
+        rank = torch.distributed.get_rank(process_group)
+        if in_features % world_size:
+            raise ValueError(f'in_features ({in_features}) must be divisible by '
+                             f'world_size ({world_size})')
+        # Only rank 0 will have bias
+        has_bias = bias and rank == 0
+        super().__init__(in_features // world_size, out_features, bias=has_bias,
+                         device=device, dtype=dtype)
+        self.process_group = process_group
+        self.sequence_parallel = sequence_parallel
+
+    def forward(self, x):
+        """
+        We do the local matmul, then combine the results across the process group:
+        a reduce_scatter if self.sequence_parallel is True, an all_reduce otherwise.
+        """
+        local_out = fused_dense_func(x, self.weight, self.bias)
+        if self.sequence_parallel:
+            return reduce_scatter(local_out, self.process_group)
+        return all_reduce(local_out, self.process_group)
+
+
+class FusedMLPFunc(torch.autograd.Function):
+    """Autograd function for linear -> activation -> linear with optional recomputation.
+
+    Depending on heuristic, the first linear and the activation are either run as separate
+    kernels (heuristic == -1) or through fused_dense_cuda.linear_act_forward.
+    """
+
+    @staticmethod
+    @custom_fwd
+    def forward(ctx, x, weight1, bias1, weight2, bias2, activation='gelu_approx', save_pre_act=True,
+                return_residual=False, checkpoint_lvl=0, heuristic=0, process_group=None,
+                sequence_parallel=True):
+        """
+        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
+        with sequence parallelism: we do an all_gather of x before doing the matmul.
+        If sequence_parallel=False, then the input is already gathered.
+
+        checkpoint_lvl:
+        0: no recomputation in the bwd
+        1: recompute gelu_out / relu_out in the bwd
+        2: recompute pre_act and gelu_out / relu_out in the bwd
+
+        Returns the MLP output, or the tuple (output, x) if return_residual is True.
+        Raises RuntimeError if the smallest of the matrix dimensions exceeds 65535 * 32.
+        """
+        assert -1 <= heuristic <= 4
+        assert activation in ['gelu_approx', 'relu', 'sqrelu']
+        if activation == 'sqrelu':
+            assert heuristic == -1
+        if not save_pre_act:
+            # Without a saved pre-activation everything has to be recomputed in the bwd
+            checkpoint_lvl = 2
+        assert checkpoint_lvl in [0, 1, 2]
+        ctx.return_residual = return_residual
+        ctx.process_group = process_group
+        ctx.sequence_parallel = sequence_parallel
+        ctx.checkpoint_lvl = checkpoint_lvl
+        ctx.activation = activation
+        ctx.heuristic = heuristic
+
+        if torch.is_autocast_enabled():
+            x = x.to(dtype=torch.get_autocast_gpu_dtype())
+        x = x.contiguous()
+        if process_group is not None and sequence_parallel:
+            # We want to kick off the all_gather early, before weight dtype conversion
+            total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
+        else:
+            total_x = x
+
+        if torch.is_autocast_enabled():
+            dtype = torch.get_autocast_gpu_dtype()
+            weight1, weight2 = [a.to(dtype=dtype) for a in [weight1, weight2]]
+            bias1 = bias1.to(dtype=dtype) if bias1 is not None else None
+            bias2 = bias2.to(dtype=dtype) if bias2 is not None else None
+        weight1 = weight1.contiguous()
+        bias1 = bias1.contiguous() if bias1 is not None else None
+        weight2 = weight2.contiguous()
+        bias2 = bias2.contiguous() if bias2 is not None else None
+        if process_group is not None and sequence_parallel:
+            # The gathered input must be complete before it is used in the matmul
+            handle_x.wait()
+        batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
+        batch_dim = batch_shape.numel()
+        # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
+        if min(batch_dim, n, *weight1.shape, *weight2.shape) > 65535 * 32:
+            raise RuntimeError('fused_dense only supports matrix dims <= 2M')
+        if heuristic == -1:
+            pre_act = F.linear(total_x, weight1, bias1)
+            activation_fn = (partial(F.gelu, approximate='tanh') if activation == 'gelu_approx'
+                             else (sqrelu_fwd if activation == 'sqrelu' else F.relu))
+            with torch.jit.fuser('fuser2'):
+                output1 = activation_fn(pre_act)
+            # This is before adding bias1
+            # pre_act = F.linear(total_x.reshape(batch_dim, n), weight1)
+            # with torch.jit.fuser('fuser2'):
+            #     output1 = bias_gelu(pre_act, bias1)
+        else:
+            is_gelu = activation == 'gelu_approx'
+            output1, *rest = fused_dense_cuda.linear_act_forward(
+                total_x.reshape(batch_dim, n), weight1, bias1, is_gelu, save_pre_act, heuristic
+            )
+            if save_pre_act:
+                pre_act = rest[0]
+        output2 = F.linear(output1, weight2, bias2)
+        # What is saved depends on how much the backward is allowed to recompute
+        if checkpoint_lvl == 0 or (checkpoint_lvl == 1 and activation == 'relu'):
+            # For RELU the pre_act is very small (just a bit-mask) so we just save it
+            ctx.save_for_backward(x, weight1, weight2, pre_act, output1)
+        elif checkpoint_lvl == 1:
+            ctx.save_for_backward(x, weight1, weight2, pre_act)
+        elif checkpoint_lvl == 2:
+            ctx.save_for_backward(x, weight1, weight2, bias1)
+        output2 = output2.reshape(*batch_shape, output2.shape[-1])
+        return output2 if not return_residual else (output2, x)
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, grad_output, *args):
+        """Compute gradients for (x, weight1, bias1, weight2, bias2); other inputs get None.
+
+        When return_residual was set in the forward, args holds the gradient flowing into the
+        returned residual x, which is accumulated into grad_input.
+        """
+        grad_output = grad_output.contiguous()
+        checkpoint_lvl = ctx.checkpoint_lvl
+        activation = ctx.activation
+        activation_fn = (partial(F.gelu, approximate='tanh') if activation == 'gelu_approx'
+                         else (sqrelu_fwd if activation == 'sqrelu' else F.relu))
+        if ctx.return_residual:
+            grad_input, = args
+            grad_input = grad_input.contiguous()
+        process_group = ctx.process_group
+        sequence_parallel = ctx.sequence_parallel
+        x, weight1, weight2, *rest = ctx.saved_tensors
+        # Handle of the async all_gather of x. It stays None when no async gather is launched
+        # (no sequence parallelism, or checkpoint_lvl == 2 where the gather's handle is
+        # discarded), so the wait sites below must not assume it exists.
+        handle_x = None
+        if process_group is None or not sequence_parallel:
+            total_x = x
+        batch_shape = grad_output.shape[:-1]
+        batch_dim = batch_shape.numel()
+        if checkpoint_lvl in [0, 1]:
+            if process_group is not None and sequence_parallel:
+                # Start re-gathering x now; it is only waited on right before the weight1 grad
+                total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
+            if checkpoint_lvl == 0 or (checkpoint_lvl == 1 and activation == 'relu'):
+                pre_act, output1 = rest
+            elif checkpoint_lvl == 1:
+                pre_act, = rest
+                with torch.jit.fuser('fuser2'):
+                    output1 = activation_fn(pre_act)
+        elif checkpoint_lvl == 2:
+            bias1, = rest
+            if process_group is not None and sequence_parallel:
+                # total_x is needed immediately to recompute pre_act, so no async_op is
+                # requested and the returned handle is discarded
+                total_x, _ = all_gather_raw(x, process_group)
+            if ctx.heuristic == -1:
+                pre_act = F.linear(total_x, weight1, bias1)
+                with torch.jit.fuser('fuser2'):
+                    output1 = activation_fn(pre_act)
+            else:
+                output1, pre_act = fused_dense_cuda.linear_act_forward(
+                    total_x.reshape(batch_dim, total_x.shape[-1]), weight1, bias1,
+                    activation == 'gelu_approx', True, ctx.heuristic
+                )
+
+        # Flatten all leading dims so the gradients are plain 2D matmuls
+        grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
+        output1 = output1.reshape(batch_dim, output1.shape[-1])
+        pre_act = pre_act.reshape(batch_dim, pre_act.shape[-1])
+        if ctx.needs_input_grad[3]:
+            grad_weight2, grad_bias2 = fused_dense_cuda.linear_bias_wgrad(
+                output1, grad_output, ctx.needs_input_grad[4]
+            )
+        else:
+            grad_weight2 = None
+            grad_bias2 = grad_output if ctx.needs_input_grad[4] else None
+        if ctx.heuristic == -1:
+            # grad_pre_act = matmul_dgelu(grad_output, weight2, pre_act)
+            grad_output1 = F.linear(grad_output, weight2.t())
+            activation_grad_fn = (gelu_bwd if activation == 'gelu_approx'
+                                  else (sqrelu_bwd if activation == 'sqrelu' else relu_bwd))
+            with torch.jit.fuser('fuser2'):
+                grad_pre_act = activation_grad_fn(grad_output1, pre_act)
+        else:
+            # The cublasLt epilogue has to compute both gelu/relu grad and bias grad, we can't
+            # just compute gelu/relu grad
+            grad_pre_act, grad_bias1 = fused_dense_cuda.bias_act_linear_dgrad_bgrad(
+                weight2, grad_output, pre_act, activation == 'gelu_approx', ctx.heuristic
+            )
+            if not ctx.needs_input_grad[2]:
+                grad_bias1 = None
+        if ctx.needs_input_grad[0]:
+            if not ctx.return_residual:
+                grad_input = F.linear(grad_pre_act, weight1.t())
+            else:
+                # Fuse the residual gradient addition with the matmul
+                grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]),
+                                         grad_pre_act, weight1)
+            grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
+            if process_group is not None:
+                # Launch the reduction asynchronously; it is waited on just before returning
+                reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
+                grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True)
+        else:
+            grad_input = None
+        if ctx.heuristic == -1:
+            if ctx.needs_input_grad[1]:
+                if handle_x is not None:
+                    handle_x.wait()
+                grad_weight1, grad_bias1 = fused_dense_cuda.linear_bias_wgrad(
+                    total_x.reshape(batch_dim, total_x.shape[-1]), grad_pre_act,
+                    ctx.needs_input_grad[2]
+                )
+            else:
+                grad_weight1 = None
+                grad_bias1 = grad_pre_act if ctx.needs_input_grad[2] else None
+        else:
+            if ctx.needs_input_grad[1]:
+                if handle_x is not None:
+                    handle_x.wait()
+                # grad_bias1 was already produced by the fused dgrad/bgrad call above
+                grad_weight1 = F.linear(grad_pre_act.t(),
+                                        total_x.reshape(batch_dim, total_x.shape[-1]).t())
+            else:
+                grad_weight1 = None
+        if process_group is not None and ctx.needs_input_grad[0]:
+            handle_grad_input.wait()
+        return (grad_input, grad_weight1, grad_bias1, grad_weight2, grad_bias2,
+                None, None, None, None, None, None, None)
+
+
+def fused_mlp_func(
+    x: Tensor, weight1: Tensor, weight2: Tensor, bias1: Optional[Tensor] = None,
+    bias2: Optional[Tensor] = None, activation: str = 'gelu_approx',
+    save_pre_act: bool = True, return_residual: bool = False,
+    checkpoint_lvl: int = 0, heuristic: int = 0,
+    process_group: Optional[ProcessGroup] = None,
+    sequence_parallel: bool = True
+):
+    """Two-layer MLP (linear -> activation -> linear), fused when the inputs are eligible.
+
+    The fused FusedMLPFunc path is taken when all tensors are on CUDA, x is fp16/bf16 (or
+    fp32 under autocast) and the last dim of x satisfies the divisibility requirement below.
+    Otherwise the MLP is computed with plain PyTorch ops, which requires process_group to
+    be None.
+
+    Returns the output, or the tuple (output, x) if return_residual is True.
+    """
+    assert activation in ['gelu_approx', 'relu', 'sqrelu']
+    dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16]
+                      or (x.dtype == torch.float32 and torch.is_autocast_enabled()))
+    # If we save pre-activation, dimension must be divisible by 128 (relu) or 8 (gelu)
+    dim_eligible = not save_pre_act or (x.shape[-1] % (128 if activation == 'relu' else 8) == 0)
+    if (x.is_cuda and weight1.is_cuda and weight2.is_cuda and (bias1 is None or bias1.is_cuda)
+        and (bias2 is None or bias2.is_cuda) and dtype_eligible and dim_eligible):
+        return FusedMLPFunc.apply(
+            x, weight1, bias1, weight2, bias2, activation, save_pre_act, return_residual,
+            checkpoint_lvl, heuristic, process_group, sequence_parallel
+        )
+    else:
+        assert process_group is None
+        pre_act = F.linear(x, weight1, bias1)
+        if activation == 'gelu_approx':
+            activation_fn = partial(F.gelu, approximate='tanh')
+        elif activation == 'sqrelu':
+            # Squared ReLU must not fall through to plain ReLU on the fallback path
+            activation_fn = sqrelu_fwd
+        else:
+            activation_fn = partial(F.relu, inplace=True)
+        output1 = activation_fn(pre_act)
+        output2 = F.linear(output1, weight2, bias2)
+        return output2 if not return_residual else (output2, x)
+
+
+class FusedMLP(nn.Module):
+    """Two-layer MLP (fc1 -> activation -> fc2) evaluated through fused_mlp_func."""
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, bias1=True,
+                 bias2=True, activation='gelu_approx', return_residual=False,
+                 checkpoint_lvl=0, heuristic='auto', device=None, dtype=None):
+        """
+        hidden_features defaults to 4 * in_features and out_features to in_features.
+
+        checkpoint_lvl (increasing lvl means slower but more memory saving):
+            0: no recomputation in the bwd
+            1: recompute gelu_out in the bwd
+            2: recompute pre_act and gelu_out in the bwd
+        heuristic:
+            -1: don't fuse gemm + gelu (separate kernel)
+            0..4: use this heuristic for the algo section in the fused gemm + gelu
+            'auto': heuristic will be picked automatically:
+                For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf.
+                For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16.
+                For H100, we set heuristic=-1 for both fp16 and bf16 as the fused cuBlasLt implementation
+                is slower than the unfused version.
+            For activation='sqrelu' the heuristic is always forced to -1.
+        return_residual: whether to return the input x along with the output. This is for
+            performance reason: for post-norm architecture, returning the input allows us
+            to fuse the backward of nn.Linear with the residual connection.
+        """
+        assert checkpoint_lvl in [0, 1, 2]
+        assert activation in ['gelu_approx', 'relu', 'sqrelu']
+        super().__init__()
+        out_features = out_features if out_features else in_features
+        hidden_features = hidden_features if hidden_features else in_features * 4
+        self.activation = activation
+        self.return_residual = return_residual
+        self.checkpoint_lvl = checkpoint_lvl
+        self.heuristic = -1 if activation == 'sqrelu' else heuristic
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1,
+                             device=device, dtype=dtype)
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2,
+                             device=device, dtype=dtype)
+
+    def forward(self, x, process_group=None):
+        """
+        If process_group is not None, we're doing Tensor Parallel with sequence parallelism:
+        we do an all_gather of x before doing the matmul, gelu, then matmul.
+        Finally we do a reduce_scatter of the output.
+
+        Returns the output, or the tuple (output, x) if self.return_residual is True.
+        """
+        if torch.is_autocast_enabled():
+            dtype = torch.get_autocast_gpu_dtype()
+        else:
+            dtype = x.dtype
+        heuristic = self.heuristic
+        if heuristic == 'auto':
+            if self.activation != 'gelu_approx':
+                heuristic = 0
+            elif torch.cuda.get_device_capability('cuda') == (9, 0):
+                heuristic = -1
+            else:
+                cuda_ver = tuple(map(int, torch.version.cuda.split('.')))
+                if cuda_ver >= (11, 8):
+                    heuristic = 0
+                elif dtype == torch.float16:
+                    heuristic = 1
+                else:
+                    heuristic = -1
+        result = fused_mlp_func(
+            x, self.fc1.weight, self.fc2.weight, self.fc1.bias, self.fc2.bias,
+            activation=self.activation, save_pre_act=self.training,
+            return_residual=self.return_residual, checkpoint_lvl=self.checkpoint_lvl,
+            heuristic=heuristic, process_group=process_group
+        )
+        if self.return_residual:
+            out, x = result
+        else:
+            out = result
+        if process_group is not None:
+            out = reduce_scatter(out, process_group)
+        if self.return_residual:
+            return out, x
+        return out
+
+
+class ParallelFusedMLP(nn.Module):
+    """Tensor-parallel two-layer MLP: column-parallel fc1, row-parallel fc2."""
+
+    def __init__(self, in_features, hidden_features=None, out_features=None,
+                 activation='gelu_approx', process_group: ProcessGroup = None,
+                 bias1=True, bias2=True, sequence_parallel=True, checkpoint_lvl=0, heuristic='auto',
+                 device=None, dtype=None):
+        """
+        process_group is required. We're doing Tensor Parallel with sequence parallelism:
+        we do an all_gather of x before doing the matmul, gelu, then matmul.
+        Finally we do a reduce_scatter of the output.
+
+        hidden_features defaults to 4 * in_features and out_features to in_features.
+
+        checkpoint_lvl (increasing lvl means slower but more memory saving):
+            0: no recomputation in the bwd
+            1: recompute gelu_out in the bwd
+            2: recompute pre_act and gelu_out in the bwd
+        heuristic:
+            -1: don't fuse gemm + gelu (separate kernel)
+            0..4: use this heuristic for the algo section in the fused gemm + gelu
+            'auto': heuristic will be picked automatically:
+                For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf.
+                For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16.
+            For activation='sqrelu' the heuristic is always forced to -1.
+        """
+        assert checkpoint_lvl in [0, 1, 2]
+        assert activation in ['gelu_approx', 'relu', 'sqrelu']
+        assert process_group is not None
+        super().__init__()
+        out_features = out_features if out_features else in_features
+        hidden_features = hidden_features if hidden_features else in_features * 4
+        self.activation = activation
+        self.process_group = process_group
+        self.sequence_parallel = sequence_parallel
+        self.checkpoint_lvl = checkpoint_lvl
+        self.heuristic = -1 if activation == 'sqrelu' else heuristic
+        self.fc1 = ColumnParallelLinear(in_features, hidden_features, process_group,
+                                        bias=bias1, device=device, dtype=dtype)
+        self.fc2 = RowParallelLinear(hidden_features, out_features, process_group,
+                                     bias=bias2, device=device, dtype=dtype)
+
+    def forward(self, x):
+        """Run the fused MLP on the local weight shards, then reduce across the process group.
+
+        The reduction is a reduce_scatter if self.sequence_parallel is True, an all_reduce
+        otherwise.
+        """
+        if torch.is_autocast_enabled():
+            dtype = torch.get_autocast_gpu_dtype()
+        else:
+            dtype = x.dtype
+        heuristic = self.heuristic
+        if heuristic == 'auto':
+            if self.activation != 'gelu_approx':
+                heuristic = 0
+            else:
+                cuda_ver = tuple(map(int, torch.version.cuda.split('.')))
+                if cuda_ver >= (11, 8):
+                    heuristic = 0
+                elif dtype == torch.float16:
+                    heuristic = 1
+                else:
+                    heuristic = -1
+        local_out = fused_mlp_func(
+            x, self.fc1.weight, self.fc2.weight, self.fc1.bias, self.fc2.bias,
+            activation=self.activation, save_pre_act=self.training,
+            checkpoint_lvl=self.checkpoint_lvl, heuristic=heuristic,
+            process_group=self.process_group,
+            sequence_parallel=self.sequence_parallel
+        )
+        if self.sequence_parallel:
+            return reduce_scatter(local_out, self.process_group)
+        return all_reduce(local_out, self.process_group)
--- a/pkgs/xformers/_flash_attn/ops/layer_norm.py
+++ b/pkgs/xformers/_flash_attn/ops/layer_norm.py
@@ -0,0 +1,375 @@
+# Copyright (c) 2022, Tri Dao.
+# Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/layer_norm/layer_norm.py
+
+import torch
+from torch.nn import init
+
+import dropout_layer_norm
+
+
+def maybe_align(x, alignment_in_bytes=16):
+    """Return ``x`` itself when its data pointer is a multiple of ``alignment_in_bytes``,
+    otherwise return ``x.clone()``.
+
+    Assume that x already has last dim divisible by alignment_in_bytes.
+    """
+    # TD [2023-07-04] I'm not 100% sure that clone will align the memory
+    # https://discuss.pytorch.org/t/how-to-ensure-that-tensor-data-ptr-is-aligned-to-16-bytes/183440
+    if x.data_ptr() % alignment_in_bytes != 0:
+        return x.clone()
+    return x
+
+
+def _dropout_add_layer_norm_forward(x0, residual, gamma, beta, rowscale, colscale, dropout_p,
+                                    epsilon, residual_in_fp32=False, is_rms_norm=False):
+    """Flatten the inputs to 2D and call ``dropout_layer_norm.dropout_add_ln_fwd``.
+
+    Assume that arguments are contiguous and aligned to 16 bytes.
+
+    Returns ``(zmat, xmat, dmask, mu, rsigma)``. When the extension returns None for
+    ``xmat``, the 2D view of ``x0`` is returned in its place.
+    """
+    ncols = gamma.numel()
+    x0_2d = x0.view((-1, ncols))
+    residual_2d = None if residual is None else residual.view((-1, ncols))
+    rowscale_1d = None if rowscale is None else rowscale.view(-1)
+    # The fixed None, None / 1.0, 0 arguments sit in the positions where the subset
+    # variant passes x0_subset, out_subset / rowscale_const, out_numrows.
+    outputs = dropout_layer_norm.dropout_add_ln_fwd(
+        x0_2d, residual_2d, gamma, beta, rowscale_1d, colscale, None, None, dropout_p, epsilon,
+        1.0, 0, None, residual_in_fp32, is_rms_norm
+    )
+    zmat, xmat, dmask, mu, rsigma = outputs
+    # dmask is None if dropout_p == 0.0
+    # xmat is None if dropout_p == 0.0 and residual is None and residual_dtype != input_dtype
+    if xmat is None:
+        xmat = x0_2d
+    return zmat, xmat, dmask, mu, rsigma
+
+
+def _dropout_add_layer_norm_backward(dz, dx, x, x0, dmask, mu, rsigma, gamma, rowscale, colscale,
+                                     dropout_p, has_residual, is_rms_norm=False):
+    """Flatten the inputs to 2D and call ``dropout_layer_norm.dropout_add_ln_bwd``.
+
+    Assume that arguments are contiguous and aligned to 16 bytes.
+    dx == None means that it was a post-norm architecture
+    (x = drop(x0) + residual was not returned in the fwd).
+    x0 must not be None if we have colscale.
+
+    Returns ``(dx0mat, dresidualmat, dgamma, dbeta)``, with ``dcolscale`` appended
+    when ``colscale`` is given.
+    """
+    ncols = gamma.numel()
+    x_2d = x.view((-1, ncols))
+    dz_2d = dz.view(x_2d.shape)
+    dx_2d = None if dx is None else dx.view(x_2d.shape)
+    x0_2d = None if x0 is None else x0.view((-1, ncols))
+    rowscale_1d = None if rowscale is None else rowscale.view(-1)
+    has_colscale = colscale is not None
+    if has_colscale:
+        assert x0 is not None, 'x0 is required to compute the gradient of colscale'
+    grads = dropout_layer_norm.dropout_add_ln_bwd(
+        dz_2d, dx_2d, x_2d, x0_2d, dmask, mu, rsigma, gamma, rowscale_1d, colscale, None, None,
+        dropout_p, 1.0, 0, has_residual, is_rms_norm
+    )
+    # The fifth and sixth outputs are not used here; anything after them lands in `extra`.
+    dx0mat, dresidualmat, dgamma, dbeta, _, _, *extra = grads
+    # dresidualmat is None if not has_residual
+    if has_colscale:
+        return dx0mat, dresidualmat, dgamma, dbeta, extra[0]
+    return dx0mat, dresidualmat, dgamma, dbeta
+
+
+def _dropout_add_layer_norm_subset_forward(x0, residual, gamma, beta, colscale, x0_subset,
+                                           out_subset, dropout_p, epsilon, rowscale_const,
+                                           out_numrows, residual_in_fp32=False, is_rms_norm=False):
+    """Subset variant of the forward helper around ``dropout_layer_norm.dropout_add_ln_fwd``.
+
+    Assume that arguments are contiguous and aligned to 16 bytes.
+
+    ``x0_subset`` and ``out_subset`` are flattened to 1D when given and forwarded together
+    with ``rowscale_const`` and ``out_numrows``; no ``rowscale`` tensor is passed.
+    Returns ``(zmat, xmat, dmask, mu, rsigma)``, substituting the 2D view of ``x0`` when
+    the extension returns None for ``xmat``.
+    """
+    ncols = gamma.numel()
+    x0_2d = x0.view((-1, ncols))
+    residual_2d = None if residual is None else residual.view((-1, ncols))
+    x0_rows = None if x0_subset is None else x0_subset.view(-1)
+    out_rows = None if out_subset is None else out_subset.view(-1)
+    outputs = dropout_layer_norm.dropout_add_ln_fwd(
+        x0_2d, residual_2d, gamma, beta, None, colscale, x0_rows, out_rows, dropout_p, epsilon,
+        rowscale_const, out_numrows, None, residual_in_fp32, is_rms_norm
+    )
+    zmat, xmat, dmask, mu, rsigma = outputs
+    # dmask is None if dropout_p == 0.0
+    # xmat is None if dropout_p == 0.0 and residual is None and residual_dtype != input_dtype
+    if xmat is None:
+        xmat = x0_2d
+    return zmat, xmat, dmask, mu, rsigma
+
+
+def _dropout_add_layer_norm_subset_backward(dz, dx, x, x0, dmask, mu, rsigma, gamma, colscale,
+                                            x0_subset, out_subset, dropout_p, rowscale_const,
+                                            x0_numrows, has_residual, is_rms_norm=False):
+    """Subset variant of the backward helper around ``dropout_layer_norm.dropout_add_ln_bwd``.
+
+    Assume that arguments are contiguous and aligned to 16 bytes.
+    dx == None means that it was a post-norm architecture
+    (x = drop(x0) + residual was not returned in the fwd).
+    x0 must not be None if we have colscale.
+
+    Returns ``(dx0mat, dresidualmat, dgamma, dbeta)``, with ``dcolscale`` appended
+    when ``colscale`` is given.
+    """
+    ncols = gamma.numel()
+    x_2d = x.view((-1, ncols))
+    # dz is flattened on its own row count rather than being forced to x's shape.
+    dz_2d = dz.view(-1, ncols)
+    dx_2d = None if dx is None else dx.view(x_2d.shape)
+    x0_2d = None if x0 is None else x0.view((-1, ncols))
+    x0_rows = None if x0_subset is None else x0_subset.view(-1)
+    out_rows = None if out_subset is None else out_subset.view(-1)
+    has_colscale = colscale is not None
+    if has_colscale:
+        assert x0 is not None, 'x0 is required to compute the gradient of colscale'
+    grads = dropout_layer_norm.dropout_add_ln_bwd(
+        dz_2d, dx_2d, x_2d, x0_2d, dmask, mu, rsigma, gamma, None, colscale, x0_rows, out_rows,
+        dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm
+    )
+    # The fifth and sixth outputs are not used here; anything after them lands in `extra`.
+    dx0mat, dresidualmat, dgamma, dbeta, _, _, *extra = grads
+    # dresidualmat is None if not has_residual
+    if has_colscale:
+        return dx0mat, dresidualmat, dgamma, dbeta, extra[0]
+    return dx0mat, dresidualmat, dgamma, dbeta
+
+
+def _dropout_add_layer_norm_parallel_residual_forward(
+    x0, x1, residual, gamma0, beta0, gamma1, beta1, dropout_p,
+    epsilon, residual_in_fp32=False, is_rms_norm=False
+):
+    """Flatten the inputs to 2D and call
+    ``dropout_layer_norm.dropout_add_ln_parallel_residual_fwd``.
+
+    Assume that arguments are contiguous and aligned to 16 bytes.
+
+    Returns ``(z0mat, z1mat, xmat, dmask0, dmask1, mu, rsigma)``, substituting the 2D
+    view of ``x0`` when the extension returns None for ``xmat``.
+    """
+    ncols = gamma0.numel()
+    x0_2d = x0.view((-1, ncols))
+    x1_2d = None if x1 is None else x1.view((-1, ncols))
+    residual_2d = None if residual is None else residual.view((-1, ncols))
+    outputs = dropout_layer_norm.dropout_add_ln_parallel_residual_fwd(
+        x0_2d, x1_2d, residual_2d, gamma0, beta0, gamma1, beta1, dropout_p, epsilon,
+        None, residual_in_fp32, is_rms_norm
+    )
+    z0mat, z1mat, xmat, dmask0, dmask1, mu, rsigma = outputs
+    # dmask0 and dmask1 are None if dropout_p == 0.0
+    # xmat is None if dropout_p == 0.0 and residual is None and residual_dtype != input_dtype
+    if xmat is None:
+        xmat = x0_2d
+    return z0mat, z1mat, xmat, dmask0, dmask1, mu, rsigma
+
+
+def _dropout_add_layer_norm_parallel_residual_backward(
+    dz0, dz1, dx, x, dmask0, dmask1, mu, rsigma, gamma0, gamma1,
+    dropout_p, has_x1, has_residual, is_rms_norm=False
+):
+    """Flatten the inputs to 2D and call
+    ``dropout_layer_norm.dropout_add_ln_parallel_residual_bwd``.
+
+    Assume that arguments are contiguous and aligned to 16 bytes.
+    dx == None means that it was a post-norm architecture
+    (x = drop(x0) + residual was not returned in the fwd).
+
+    Returns ``(dx0mat, dx1mat, dresidualmat, dgamma0, dbeta0, dgamma1, dbeta1)``.
+    """
+    ncols = gamma0.numel()
+    x_2d = x.view((-1, ncols))
+    dz0_2d = dz0.view(x_2d.shape)
+    dz1_2d = None if dz1 is None else dz1.view(x_2d.shape)
+    dx_2d = None if dx is None else dx.view(x_2d.shape)
+    grads = dropout_layer_norm.dropout_add_ln_parallel_residual_bwd(
+        dz0_2d, dz1_2d, dx_2d, x_2d, dmask0, dmask1, mu, rsigma, gamma0, gamma1,
+        dropout_p, has_x1, has_residual, is_rms_norm
+    )
+    # Only the first seven outputs are used; any further ones are discarded.
+    dx0mat, dx1mat, dresidualmat, dgamma0, dbeta0, dgamma1, dbeta1, *_ = grads
+    # dresidualmat is None if not has_residual
+    return dx0mat, dx1mat, dresidualmat, dgamma0, dbeta0, dgamma1, dbeta1
+
+
+class DropoutAddLayerNormFn(torch.autograd.Function):
+    """Autograd function built on ``_dropout_add_layer_norm_forward`` and
+    ``_dropout_add_layer_norm_backward``.
+
+    ``forward`` returns ``z``; with ``prenorm`` it also returns ``x``, and with
+    ``return_dmask`` it additionally returns the dropout mask as the last element.
+    """
+    @staticmethod
+    def forward(ctx, x0, residual, gamma, beta, rowscale, colscale, dropout_p, epsilon,
+                residual_in_fp32=False, prenorm=False, is_rms_norm=False, return_dmask=False):
+        # The helper assumes contiguous, 16-byte aligned tensors; optional inputs stay None.
+        x0 = maybe_align(x0.contiguous(), 16)
+        residual = maybe_align(residual.contiguous(), 16) if residual is not None else None
+        gamma = maybe_align(gamma.contiguous(), 16)
+        beta = maybe_align(beta.contiguous(), 16) if beta is not None else None
+        rowscale = maybe_align(rowscale.contiguous(), 16) if rowscale is not None else None
+        colscale = maybe_align(colscale.contiguous(), 16) if colscale is not None else None
+        zmat, xmat, dmask, mu, rsigma = _dropout_add_layer_norm_forward(
+            x0, residual, gamma, beta, rowscale, colscale, dropout_p, epsilon,
+            residual_in_fp32, is_rms_norm
+        )
+        # Only need to save x0 if we need to compute gradient wrt colscale
+        x0_saved = x0 if colscale is not None else None
+        ctx.save_for_backward(xmat.view(x0.shape), x0_saved, dmask, gamma, mu, rsigma, rowscale, colscale)
+        # Non-tensor state that backward needs.
+        ctx.prenorm = prenorm
+        ctx.dropout_p = dropout_p
+        ctx.has_residual = residual is not None
+        ctx.is_rms_norm = is_rms_norm
+        ctx.has_beta = beta is not None
+        if not return_dmask:
+            return (zmat.view(x0.shape) if not prenorm
+                    else (zmat.view(x0.shape), xmat.view(x0.shape)))
+        else:
+            # Without dropout there is no mask from the kernel, so an all-ones uint8 mask
+            # is returned instead.
+            dmask = (dmask.view(x0.shape) if dropout_p > 0.
+                     else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device))
+            ctx.mark_non_differentiable(dmask)
+            return ((zmat.view(x0.shape), dmask) if not prenorm
+                    else (zmat.view(x0.shape), xmat.view(x0.shape), dmask))
+
+    @staticmethod
+    def backward(ctx, dz, *args):
+        """Return one gradient slot per ``forward`` input (12 in total); only x0, residual,
+        gamma, beta and colscale can receive a non-None gradient.
+        """
+        # assert dz.is_contiguous()
+        dz = maybe_align(dz.contiguous(), 16)  # this happens!
+        # With prenorm, forward also returned x, so its incoming gradient is args[0].
+        dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None
+        x, x0, dmask, gamma, mu, rsigma, rowscale, colscale = ctx.saved_tensors
+        # x0 is None if colscale is None
+        dropout_p = ctx.dropout_p
+        has_residual = ctx.has_residual
+        dx0mat, dresidualmat, dgamma, dbeta, *rest = _dropout_add_layer_norm_backward(
+            dz, dx, x, x0, dmask, mu, rsigma, gamma, rowscale, colscale, dropout_p, has_residual,
+            ctx.is_rms_norm
+        )
+        dx0 = dx0mat.view(x.shape)
+        dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None
+        dcolscale = rest[0] if colscale is not None else None
+        # dbeta is dropped when forward received beta=None.
+        return (dx0, dresidual, dgamma, dbeta if ctx.has_beta else None, None, dcolscale, None,
+                None, None, None, None, None)
+
+
+class DropoutAddLayerNormSubsetFn(torch.autograd.Function):
+    """Autograd function built on ``_dropout_add_layer_norm_subset_forward`` and
+    ``_dropout_add_layer_norm_subset_backward``.
+
+    ``forward`` returns ``z``; with ``prenorm`` it also returns ``x``, and with
+    ``return_dmask`` it additionally returns the dropout mask as the last element.
+    ``z`` and ``x`` are viewed with a free leading dimension (``-1``) followed by the
+    trailing dimensions of ``x0``.
+    """
+    @staticmethod
+    def forward(ctx, x0, residual, gamma, beta, colscale, x0_subset, out_subset, dropout_p, epsilon,
+                rowscale_const, out_numrows, residual_in_fp32=False,
+                prenorm=False, is_rms_norm=False, return_dmask=False):
+        # The helper assumes contiguous, 16-byte aligned tensors; optional inputs stay None.
+        x0 = maybe_align(x0.contiguous(), 16)
+        residual = maybe_align(residual.contiguous(), 16) if residual is not None else None
+        gamma = maybe_align(gamma.contiguous(), 16)
+        beta = maybe_align(beta.contiguous(), 16) if beta is not None else None
+        colscale = maybe_align(colscale.contiguous(), 16) if colscale is not None else None
+        zmat, xmat, dmask, mu, rsigma = _dropout_add_layer_norm_subset_forward(
+            x0, residual, gamma, beta, colscale, x0_subset, out_subset, dropout_p, epsilon,
+            rowscale_const, out_numrows, residual_in_fp32, is_rms_norm
+        )
+        # Only need to save x0 if we need to compute gradient wrt colscale
+        x0_saved = x0 if colscale is not None else None
+        # x and z are not forced to x0's leading size: only the trailing dims are fixed.
+        x_shape = (-1, *x0.shape[1:])
+        ctx.save_for_backward(xmat.view(x_shape), x0_saved, dmask, gamma, mu, rsigma, colscale,
+                              x0_subset, out_subset)
+        # Non-tensor state that backward needs.
+        ctx.prenorm = prenorm
+        ctx.dropout_p = dropout_p
+        ctx.rowscale_const = rowscale_const
+        ctx.x0_numrows = x0.shape[:-1].numel()
+        ctx.has_residual = residual is not None
+        ctx.is_rms_norm = is_rms_norm
+        ctx.has_beta = beta is not None
+        z_shape = (-1, *x0.shape[1:])
+        if not return_dmask:
+            # x is viewed with x_shape here as well, matching the saved tensor and the
+            # return_dmask branch; x0.shape only fits when x has as many rows as x0.
+            return (zmat.view(z_shape) if not prenorm
+                    else (zmat.view(z_shape), xmat.view(x_shape)))
+        else:
+            z = zmat.view(z_shape)
+            # Without dropout there is no mask from the kernel, so an all-ones uint8 mask
+            # is returned instead.
+            dmask = (dmask.view(x0.shape) if dropout_p > 0.
+                     else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device))
+            ctx.mark_non_differentiable(dmask)
+            return ((z, dmask) if not prenorm else (z, xmat.view(x_shape), dmask))
+
+    @staticmethod
+    def backward(ctx, dz, *args):
+        """Return one gradient slot per ``forward`` input (15 in total); only x0, residual,
+        gamma, beta and colscale can receive a non-None gradient.
+        """
+        # assert dz.is_contiguous()
+        dz = maybe_align(dz.contiguous(), 16)  # this happens!
+        # With prenorm, forward also returned x, so its incoming gradient is args[0].
+        dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None
+        x, x0, dmask, gamma, mu, rsigma, colscale, x0_subset, out_subset = ctx.saved_tensors
+        # x0 is None if colscale is None
+        dropout_p = ctx.dropout_p
+        has_residual = ctx.has_residual
+        dx0mat, dresidualmat, dgamma, dbeta, *rest = _dropout_add_layer_norm_subset_backward(
+            dz, dx, x, x0, dmask, mu, rsigma, gamma, colscale, x0_subset, out_subset, dropout_p,
+            ctx.rowscale_const, ctx.x0_numrows, has_residual, ctx.is_rms_norm
+        )
+        dx0 = dx0mat.view(-1, *x.shape[1:])
+        dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None
+        dcolscale = rest[0] if colscale is not None else None
+        # dbeta is dropped when forward received beta=None.
+        return (dx0, dresidual, dgamma, dbeta if ctx.has_beta else None, dcolscale, None, None,
+                None, None, None, None, None, None, None, None)
+
+
+class DropoutAddLayerNormParallelResidualFn(torch.autograd.Function):
+    """Autograd function built on ``_dropout_add_layer_norm_parallel_residual_forward`` and
+    ``_dropout_add_layer_norm_parallel_residual_backward``.
+
+    ``forward`` returns ``(z0, z1)`` (``z1`` is None when the kernel returns no second
+    output); with ``prenorm`` it appends ``x``, and with ``return_dmask`` it appends
+    ``dmask0`` and ``dmask1``.
+    """
+    @staticmethod
+    def forward(ctx, x0, x1, residual, gamma0, beta0, gamma1, beta1, dropout_p, epsilon,
+                residual_in_fp32=False, prenorm=False, is_rms_norm=False, return_dmask=False):
+        # The helper assumes contiguous, 16-byte aligned tensors; optional inputs stay None.
+        x0 = maybe_align(x0.contiguous(), 16)
+        x1 = maybe_align(x1.contiguous(), 16) if x1 is not None else None
+        residual = maybe_align(residual.contiguous(), 16) if residual is not None else None
+        gamma0 = maybe_align(gamma0.contiguous(), 16)
+        beta0 = maybe_align(beta0.contiguous(), 16) if beta0 is not None else None
+        gamma1 = maybe_align(gamma1.contiguous(), 16) if gamma1 is not None else None
+        beta1 = maybe_align(beta1.contiguous(), 16) if beta1 is not None else None
+        z0mat, z1mat, xmat, dmask0, dmask1, mu, rsigma = _dropout_add_layer_norm_parallel_residual_forward(
+            x0, x1, residual, gamma0, beta0, gamma1, beta1, dropout_p, epsilon,
+            residual_in_fp32, is_rms_norm
+        )
+        ctx.save_for_backward(xmat.view(x0.shape), dmask0, dmask1, gamma0, gamma1, mu, rsigma)
+        # Non-tensor state that backward needs.
+        ctx.prenorm = prenorm
+        ctx.dropout_p = dropout_p
+        ctx.has_x1 = x1 is not None
+        ctx.has_residual = residual is not None
+        ctx.is_rms_norm = is_rms_norm
+        # NOTE(review): has_beta is derived from beta0 only, yet backward also uses it to
+        # gate dbeta1 -- confirm beta0 and beta1 are always both given or both None.
+        ctx.has_beta = beta0 is not None
+        z = (z0mat.view(x0.shape), z1mat.view(x0.shape) if z1mat is not None else None)
+        if not return_dmask:
+            return z if not prenorm else (*z, xmat.view(x0.shape))
+        else:
+            # Where the kernel mask is not used (no dropout, or no x1 for dmask1), an
+            # all-ones uint8 mask is returned instead.
+            dmask0 = (dmask0.view(x0.shape) if dropout_p > 0.
+                      else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device))
+            dmask1 = (dmask1.view(x0.shape) if dropout_p > 0. and x1 is not None
+                      else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device))
+            ctx.mark_non_differentiable(dmask0)
+            ctx.mark_non_differentiable(dmask1)
+            return (*z, dmask0, dmask1) if not prenorm else (*z, xmat.view(x0.shape), dmask0, dmask1)
+
+    @staticmethod
+    def backward(ctx, dz0, dz1, *args):
+        """Return one gradient slot per ``forward`` input (13 in total); only x0, x1,
+        residual, gamma0, beta0, gamma1 and beta1 can receive a non-None gradient.
+        """
+        dz0 = maybe_align(dz0.contiguous(), 16)  # this happens!
+        dz1 = maybe_align(dz1.contiguous(), 16) if dz1 is not None else None
+        # With prenorm, forward also returned x, so its incoming gradient is args[0].
+        dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None
+        x, dmask0, dmask1, gamma0, gamma1, mu, rsigma = ctx.saved_tensors
+        dropout_p = ctx.dropout_p
+        has_x1 = ctx.has_x1
+        has_residual = ctx.has_residual
+        dx0mat, dx1mat, dresidualmat, dgamma0, dbeta0, dgamma1, dbeta1 = _dropout_add_layer_norm_parallel_residual_backward(
+            dz0, dz1, dx, x, dmask0, dmask1, mu, rsigma, gamma0, gamma1, dropout_p, has_x1,
+            has_residual, ctx.is_rms_norm
+        )
+        dx0 = dx0mat.view(x.shape)
+        dx1 = dx1mat.view(x.shape) if dx1mat is not None else None
+        dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None
+        return (dx0, dx1, dresidual, dgamma0, dbeta0 if ctx.has_beta else None, dgamma1,
+                dbeta1 if ctx.has_beta else None, None, None, None, None, None, None)
+
+
+def layer_norm(x, weight, bias, epsilon):
+    """Apply ``DropoutAddLayerNormFn`` to ``x`` with no residual, no row/column scale,
+    dropout probability 0.0 and ``residual_in_fp32=False``.
+    """
+    residual = rowscale = colscale = None
+    dropout_p, residual_in_fp32 = 0.0, False
+    return DropoutAddLayerNormFn.apply(
+        x, residual, weight, bias, rowscale, colscale, dropout_p, epsilon, residual_in_fp32
+    )
+
+
+def dropout_add_layer_norm(x0, residual, weight, bias, dropout_p, epsilon, rowscale=None,
+                           layerscale=None, prenorm=False, residual_in_fp32=False,
+                           return_dropout_mask=False):
+    """Functional entry point for ``DropoutAddLayerNormFn`` with ``is_rms_norm=False``.
+
+    ``layerscale`` is passed in the function's ``colscale`` position.
+    residual_in_fp32 only has an effect if residual is None.
+    Otherwise residual dtype is residual.dtype.
+    """
+    is_rms_norm = False
+    fn_args = (
+        x0, residual, weight, bias, rowscale, layerscale, dropout_p, epsilon,
+        residual_in_fp32, prenorm, is_rms_norm, return_dropout_mask,
+    )
+    return DropoutAddLayerNormFn.apply(*fn_args)
+
+
+def dropout_add_layer_norm_subset(x0, residual, weight, bias, dropout_p, epsilon, layerscale=None,
+                                  x0_subset=None, out_subset=None, rowscale_const=1.0,
+                                  out_numrows=0, prenorm=False, residual_in_fp32=False,
+                                  return_dropout_mask=False):
+    """Functional entry point for ``DropoutAddLayerNormSubsetFn`` with ``is_rms_norm=False``.
+
+    ``layerscale`` is passed in the function's ``colscale`` position.
+    residual_in_fp32 only has an effect if residual is None.
+    Otherwise residual dtype is residual.dtype.
+    """
+    is_rms_norm = False
+    fn_args = (
+        x0, residual, weight, bias, layerscale, x0_subset, out_subset, dropout_p, epsilon,
+        rowscale_const, out_numrows, residual_in_fp32, prenorm, is_rms_norm,
+        return_dropout_mask,
+    )
+    return DropoutAddLayerNormSubsetFn.apply(*fn_args)
+
+
+def dropout_add_layer_norm_parallel_residual(
+    x0, x1, residual, weight0, bias0, weight1, bias1, dropout_p, epsilon, prenorm=False,
+    residual_in_fp32=False, return_dropout_mask=False
+):
+    """Functional entry point for ``DropoutAddLayerNormParallelResidualFn`` with
+    ``is_rms_norm=False``.
+
+    residual_in_fp32 only has an effect if residual is None.
+    Otherwise residual dtype is residual.dtype.
+    """
+    is_rms_norm = False
+    fn_args = (
+        x0, x1, residual, weight0, bias0, weight1, bias1, dropout_p, epsilon,
+        residual_in_fp32, prenorm, is_rms_norm, return_dropout_mask,
+    )
+    return DropoutAddLayerNormParallelResidualFn.apply(*fn_args)
+
+
+class DropoutAddLayerNorm(torch.nn.Module):
+    """Module wrapper around ``dropout_add_layer_norm`` with a learnable weight and bias.
+
+    The weight is initialised to ones and the bias to zeros. Dropout probability ``p``
+    is only applied while the module is in training mode.
+    """
+
+    def __init__(self, hidden_size, prenorm=False, p=0.0, eps=1e-5, residual_in_fp32=False,
+                 device=None, dtype=None):
+        super().__init__()
+        self.prenorm = prenorm
+        self.p = p
+        self.eps = eps
+        self.residual_in_fp32 = residual_in_fp32
+        # weight is registered before bias, which fixes the parameter order.
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype))
+        self.bias = torch.nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype))
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Reset weight to ones and bias to zeros."""
+        init.ones_(self.weight)
+        init.zeros_(self.bias)
+
+    def forward(self, x0, residual=None):
+        """Call ``dropout_add_layer_norm`` with this module's parameters and settings."""
+        # Dropout is disabled outside of training.
+        dropout_p = self.p if self.training else 0.0
+        return dropout_add_layer_norm(
+            x0, residual, self.weight, self.bias, dropout_p, self.eps,
+            prenorm=self.prenorm, residual_in_fp32=self.residual_in_fp32,
+        )
--- a/pkgs/xformers/_flash_attn/ops/rms_norm.py
+++ b/pkgs/xformers/_flash_attn/ops/rms_norm.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2022, Tri Dao.
+# Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/layer_norm/layer_norm.py
+
+import torch
+from torch.nn import init
+
+from flash_attn.ops.layer_norm import DropoutAddLayerNormFn, DropoutAddLayerNormSubsetFn
+from flash_attn.ops.layer_norm import DropoutAddLayerNormParallelResidualFn
+
+
+def rms_norm(x, weight, epsilon):
+    """Apply ``DropoutAddLayerNormFn`` to ``x`` with ``is_rms_norm=True``, no residual,
+    no bias, no row/column scale, dropout probability 0.0 and ``prenorm=False``.
+    """
+    residual = bias = rowscale = colscale = None
+    dropout_p = 0.0
+    residual_in_fp32 = prenorm = False
+    is_rms_norm = True
+    return DropoutAddLayerNormFn.apply(
+        x, residual, weight, bias, rowscale, colscale, dropout_p, epsilon,
+        residual_in_fp32, prenorm, is_rms_norm
+    )
+
+
+def dropout_add_rms_norm(x0, residual, weight, bias, dropout_p, epsilon, rowscale=None,
+                         layerscale=None, prenorm=False, residual_in_fp32=False,
+                         return_dropout_mask=False):
+    """Functional entry point for ``DropoutAddLayerNormFn`` with ``is_rms_norm=True``.
+
+    ``layerscale`` is passed in the function's ``colscale`` position.
+    residual_in_fp32 only has an effect if residual is None.
+    Otherwise residual dtype is residual.dtype.
+    """
+    is_rms_norm = True
+    fn_args = (
+        x0, residual, weight, bias, rowscale, layerscale, dropout_p, epsilon,
+        residual_in_fp32, prenorm, is_rms_norm, return_dropout_mask,
+    )
+    return DropoutAddLayerNormFn.apply(*fn_args)
+
+
+def dropout_add_rms_norm_subset(x0, residual, weight, bias, dropout_p, epsilon, layerscale=None,
+                                x0_subset=None, out_subset=None, rowscale_const=1.0,
+                                out_numrows=0, prenorm=False, residual_in_fp32=False,
+                                return_dropout_mask=False):
+    """Functional entry point for ``DropoutAddLayerNormSubsetFn`` with ``is_rms_norm=True``.
+
+    ``layerscale`` is passed in the function's ``colscale`` position.
+    residual_in_fp32 only has an effect if residual is None.
+    Otherwise residual dtype is residual.dtype.
+    """
+    is_rms_norm = True
+    fn_args = (
+        x0, residual, weight, bias, layerscale, x0_subset, out_subset, dropout_p, epsilon,
+        rowscale_const, out_numrows, residual_in_fp32, prenorm, is_rms_norm,
+        return_dropout_mask,
+    )
+    return DropoutAddLayerNormSubsetFn.apply(*fn_args)
+
+
+def dropout_add_rms_norm_parallel_residual(
+    x0, x1, residual, weight0, bias0, weight1, bias1,
+    dropout_p, epsilon, prenorm=False, residual_in_fp32=False, return_dropout_mask=False
+):
+    """Functional entry point for ``DropoutAddLayerNormParallelResidualFn`` with
+    ``is_rms_norm=True``.
+
+    residual_in_fp32 only has an effect if residual is None.
+    Otherwise residual dtype is residual.dtype.
+    """
+    is_rms_norm = True
+    fn_args = (
+        x0, x1, residual, weight0, bias0, weight1, bias1, dropout_p, epsilon,
+        residual_in_fp32, prenorm, is_rms_norm, return_dropout_mask,
+    )
+    return DropoutAddLayerNormParallelResidualFn.apply(*fn_args)
+
+
+class RMSNorm(torch.nn.Module):
+    """Module wrapper around ``rms_norm`` with a learnable weight and no bias.
+
+    The weight is initialised to ones; ``bias`` is registered as None.
+    """
+
+    def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None):
+        super().__init__()
+        self.eps = eps
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype))
+        # Registered as None so that `self.bias` exists but holds no parameter.
+        self.register_parameter('bias', None)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Reset weight to ones."""
+        init.ones_(self.weight)
+
+    def forward(self, x):
+        """Call ``rms_norm`` on ``x`` with this module's weight and eps."""
+        weight, eps = self.weight, self.eps
+        return rms_norm(x, weight, eps)
+
+
+class DropoutAddRMSNorm(torch.nn.Module):
+    """Module wrapper around ``dropout_add_rms_norm`` with a learnable weight and no bias.
+
+    The weight is initialised to ones; ``bias`` is registered as None. Dropout
+    probability ``p`` is only applied while the module is in training mode.
+    """
+
+    def __init__(self, hidden_size, prenorm=False, p=0.0, eps=1e-5, residual_in_fp32=False,
+                 device=None, dtype=None):
+        super().__init__()
+        self.prenorm = prenorm
+        self.p = p
+        self.eps = eps
+        self.residual_in_fp32 = residual_in_fp32
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype))
+        # Registered as None so that `self.bias` exists but holds no parameter.
+        self.register_parameter('bias', None)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Reset weight to ones."""
+        init.ones_(self.weight)
+
+    def forward(self, x0, residual=None):
+        """Call ``dropout_add_rms_norm`` with this module's weight and settings, and no bias."""
+        # Dropout is disabled outside of training.
+        dropout_p = self.p if self.training else 0.0
+        return dropout_add_rms_norm(
+            x0, residual, self.weight, None, dropout_p, self.eps,
+            prenorm=self.prenorm, residual_in_fp32=self.residual_in_fp32,
+        )