# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Conv Layer Class."""

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from vllm.model_executor.custom_op import CustomOp
from vllm.utils.torch_utils import is_torch_equal


class ConvLayerBase(CustomOp):
    """Conv layer base class.

    Stores the convolution hyper-parameters and allocates the ``weight``
    (and optional ``bias``) parameters. Subclasses set ``num_dim`` to the
    number of spatial dimensions and provide the forward implementations.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Kernel size; an int is repeated ``num_dim`` times.
        stride: Stride; an int is repeated ``num_dim`` times.
        padding: Padding; an int is repeated ``num_dim`` times.
        dilation: Dilation; an int is repeated ``num_dim`` times.
        groups: Number of channel groups.
        bias: Whether to allocate a bias parameter.
        padding_mode: Stored on the instance; the forward paths in this
            file do not read it.
        params_dtype: Dtype of the parameters. Defaults to
            ``torch.get_default_dtype()``.
    """

    # Number of spatial dimensions (2 for Conv2d, 3 for Conv3d).
    num_dim: int

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int | tuple[int, ...],
        stride: int | tuple[int, ...] = 1,
        padding: int | tuple[int, ...] = 0,
        dilation: int | tuple[int, ...] = 1,
        groups: int = 1,
        bias: bool = True,
        padding_mode: str = "zeros",
        *,
        params_dtype: torch.dtype | None = None,
    ) -> None:
        super().__init__()

        if params_dtype is None:
            params_dtype = torch.get_default_dtype()

        # Broadcast scalar hyper-parameters to one value per spatial dim.
        kernel_size = (
            (kernel_size,) * self.num_dim
            if isinstance(kernel_size, int)
            else kernel_size
        )
        stride = (stride,) * self.num_dim if isinstance(stride, int) else stride
        padding = (padding,) * self.num_dim if isinstance(padding, int) else padding
        dilation = (dilation,) * self.num_dim if isinstance(dilation, int) else dilation

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.padding_mode = padding_mode

        # The unfold + linear path is only equivalent to a convolution when
        # the patches are non-overlapping (kernel == stride), unpadded,
        # ungrouped AND undilated. A dilated kernel samples non-contiguous
        # input positions, which contiguous unfold patches cannot reproduce,
        # so dilation != 1 must fall back to the real convolution.
        self.enable_linear = (
            (self.kernel_size == self.stride)
            and not any(self.padding)
            and self.groups == 1
            and all(d == 1 for d in self.dilation)
        )
        # Length of one flattened input patch: channels * prod(kernel dims).
        self.input_size = in_channels * math.prod(self.kernel_size)

        # Parameters are allocated uninitialised (torch.empty); presumably
        # they are filled by weight loading elsewhere.
        self.weight = nn.Parameter(
            torch.empty(
                out_channels,
                in_channels // groups,
                *kernel_size,
                dtype=params_dtype,
            ),
        )

        if bias:
            self.bias = nn.Parameter(torch.empty(self.out_channels, dtype=params_dtype))
        else:
            self.register_parameter("bias", None)

    def extra_repr(self) -> str:
        """Return the hyper-parameter summary used in the module repr."""
        s = f"in_channels={self.in_channels}, "
        s += f"out_channels={self.out_channels}, "
        s += f"kernel_size={self.kernel_size}, "
        s += f"stride={self.stride}, "
        s += f"padding={self.padding}, "
        s += f"bias={self.bias is not None}"
        return s


@CustomOp.register("conv2d")
class Conv2dLayer(ConvLayerBase):
    """Conv layer with Conv2d."""

    num_dim = 2

    def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the convolution as a matmul over non-overlapping patches.

        Only valid when ``self.enable_linear`` is true. Input is
        ``(B, C, H, W)``; output is ``(B, out_channels, H // K1, W // K2)``.
        """
        assert x.dim() == 4
        B, _, H, W = x.shape
        K1, K2 = self.kernel_size
        # Output spatial size: number of whole patches along each dim.
        H, W = H // K1, W // K2
        # (B, C, H', W', K1, K2): one K1 x K2 patch per output location.
        x = x.unfold(2, K1, K1).unfold(3, K2, K2)
        # Flatten every patch to a row of length C * K1 * K2.
        x = x.permute(0, 2, 3, 1, 4, 5).reshape(-1, self.input_size)
        x = F.linear(
            x,
            self.weight.view(self.out_channels, self.input_size),
            self.bias,
        )
        # Back to channels-first layout: (B, out_channels, H', W').
        x = x.view(B, H, W, self.out_channels).permute(0, 3, 1, 2)
        return x

    def _forward_conv(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the convolution with ``F.conv2d``."""
        assert x.dim() == 4
        x = F.conv2d(
            x,
            self.weight,
            self.bias,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            groups=self.groups,
        )
        return x

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """Expected input shape: (batch_size, in_channels, height, width)"""
        assert x.dim() == 4
        if self.enable_linear:
            return self._forward_mulmat(x)
        else:
            return self._forward_conv(x)

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        # By default, we use CUDNN's convolution ops with optimization.
        return self._forward_conv(x)


class CausalConv2dLayer(Conv2dLayer):
    """
    A causal version of nn.Conv2d.

    Instead of symmetric padding, ``forward`` pads only the last (width)
    dimension of the input with ``kernel_size - 1`` on the left and
    ``stride - 1`` on the right before running the convolution.
    NOTE(review): the height dimension is not padded here - confirm that
    this matches the intended causality.

    All arguments are the same as nn.Conv2d except padding, which must be
    left as None.

    Raises:
        ValueError: If ``padding`` is not None.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        # Defaults to None: the previous default of 0 made the constructor
        # raise unless the caller explicitly passed padding=None.
        padding: int | None = None,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        padding_mode: str = "zeros",
        *,
        params_dtype: torch.dtype | None = None,
    ) -> None:
        if padding is not None:
            raise ValueError(
                "Argument padding should be set to None for CausalConv2dLayer."
            )
        self._left_padding: int = kernel_size - 1
        self._right_padding: int = stride - 1
        # The underlying convolution itself is unpadded; all padding is
        # applied explicitly in forward().
        padding = 0

        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            dilation,
            groups,
            bias,
            padding_mode,
            params_dtype=params_dtype,
        )

    def forward(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """Pad the width dimension asymmetrically, then run the convolution."""
        # F.pad order is (last-dim left, last-dim right, 2nd-last top, bottom).
        x = F.pad(x, pad=(self._left_padding, self._right_padding, 0, 0))
        x = super().forward(x)
        return x


@CustomOp.register("conv3d")
class Conv3dLayer(ConvLayerBase):
    """Conv layer with Conv3d."""

    num_dim = 3

    def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the convolution as a matmul over non-overlapping patches.

        Only valid when ``self.enable_linear`` is true. Input is
        ``(B, C, T, H, W)``; output is
        ``(B, out_channels, T // K1, H // K2, W // K3)``.
        """
        assert x.dim() == 5
        B, _, T, H, W = x.shape
        K1, K2, K3 = self.kernel_size
        # Output size: number of whole patches along each dim.
        T, H, W = T // K1, H // K2, W // K3
        # (B, C, T', H', W', K1, K2, K3): one patch per output location.
        x = x.unfold(2, K1, K1).unfold(3, K2, K2).unfold(4, K3, K3)
        # Flatten every patch to a row of length C * K1 * K2 * K3.
        x = x.permute(0, 2, 3, 4, 1, 5, 6, 7).reshape(-1, self.input_size)
        x = F.linear(
            x,
            self.weight.view(self.out_channels, self.input_size),
            self.bias,
        )
        # Back to channels-first layout: (B, out_channels, T', H', W').
        x = x.view(B, T, H, W, self.out_channels).permute(0, 4, 1, 2, 3)
        return x

    def _forward_conv(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the convolution with ``F.conv3d``."""
        assert x.dim() == 5
        x = F.conv3d(
            x,
            self.weight,
            self.bias,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            groups=self.groups,
        )
        return x

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """Expected input shape: (batch_size, in_channels, time, height, width)"""
        if self.enable_linear:
            return self._forward_mulmat(x)
        else:
            return self._forward_conv(x)

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        # PyTorch2.9.0 disabled CUDNN's Conv3D, which caused a
        # significant performance regression.
        # See: https://github.com/vllm-project/vllm/issues/27406
        # and https://github.com/pytorch/pytorch/issues/166122
        # On that version, use the matmul path whenever it is applicable.
        if self.enable_linear and is_torch_equal("2.9.0"):
            return self._forward_mulmat(x)
        # By default, we use CUDNN's convolution ops with optimization.
        return self._forward_conv(x)