# enginex-mthreads-vllm/vllm/model_executor/models/dbrx.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Iterable
from itertools import islice

import torch
import torch.nn as nn
from transformers import DbrxConfig

from vllm.attention.layer import Attention
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (
    get_pp_group,
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
)
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import (
    QKVParallelLinear,
    ReplicatedLinear,
    RowParallelLinear,
)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader,
    maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors

from .interfaces import SupportsPP
from .utils import (
    AutoWeightsLoader,
    is_pp_missing_parameter,
    make_empty_intermediate_tensors_factory,
    make_layers,
    maybe_prefix,
)


class DbrxRouter(nn.Module):
    """Per-token expert scorer for DBRX.

    Maps every hidden state to one logit per expert through a bias-free
    ``ReplicatedLinear`` layer that is built with quantization disabled.
    """

    def __init__(
        self,
        config: DbrxConfig,
        params_dtype: torch.dtype | None = None,
    ):
        """Create the routing projection.

        Args:
            config: DBRX configuration; ``d_model`` and
                ``ffn_config.moe_num_experts`` size the projection.
            params_dtype: Dtype forwarded to the linear layer.
        """
        super().__init__()
        self.tp_size = get_tensor_model_parallel_world_size()

        expert_count = config.ffn_config.moe_num_experts
        hidden_dim = config.d_model
        self.num_total_experts = expert_count
        self.d_model = hidden_dim

        # d_model -> one score per expert; no bias, no quantization.
        self.layer = ReplicatedLinear(
            hidden_dim,
            expert_count,
            bias=False,
            params_dtype=params_dtype,
            quant_config=None,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return the router logits computed from ``hidden_states``."""
        # The linear layer yields a pair; only its first element is needed.
        logits, _unused = self.layer(hidden_states)
        return logits


class DbrxExperts(FusedMoE):
    """Fused MoE expert bank for DBRX with a DBRX-specific weight loader.

    DBRX checkpoints store each expert's GLU as three tensors (``w1``, ``v1``
    and ``w2``). The loader below reshapes them, slices out this rank's
    shard and writes ``w1``/``v1`` into the two halves of the fused ``w13``
    parameter and ``w2`` into the ``w2`` parameter.
    """

    def __init__(
        self,
        config: DbrxConfig,
        quant_config: QuantizationConfig | None = None,
        params_dtype: torch.dtype | None = None,
        prefix: str = "",
    ):
        """Configure the fused MoE layer from the DBRX FFN configuration.

        Args:
            config: DBRX configuration (``d_model`` and ``ffn_config``).
            quant_config: Optional quantization configuration.
            params_dtype: Dtype for the expert parameters.
            prefix: Module name prefix forwarded to ``FusedMoE``.
        """
        super().__init__(
            num_experts=config.ffn_config.moe_num_experts,
            top_k=config.ffn_config.moe_top_k,
            hidden_size=config.d_model,
            intermediate_size=config.ffn_config.ffn_hidden_size,
            params_dtype=params_dtype,
            reduce_results=True,
            renormalize=True,
            quant_config=quant_config,
            tp_size=get_tensor_model_parallel_world_size(),
            prefix=prefix,
        )
        self.config = config
        self.d_model = config.d_model
        # Per-rank slice of the FFN hidden dimension; the loader uses it as
        # the shard size. ``self.tp_size`` is not assigned in this class --
        # presumably it is set by the FusedMoE initializer (TODO confirm).
        self.intermediate_size = self.config.ffn_config.ffn_hidden_size // self.tp_size

    # Define custom weight loader for dbrx model
    def weight_loader(
        self,
        param: nn.Parameter,
        loaded_weight: torch.Tensor,
        weight_name: str,
        param_name: str,
    ) -> None:
        """Copy one checkpoint tensor into ``param`` for this TP rank.

        Args:
            param: Destination parameter; written in place through
                ``param.data``.
            loaded_weight: Tensor read from the checkpoint.
            weight_name: Checkpoint-side name; its suffix (``w1``, ``v1`` or
                ``w2``) selects the destination region.
            param_name: Model-side parameter name; its suffix (``weight``,
                ``weight_scale`` or anything else) selects how the tensor is
                written.
        """
        tp_rank = get_tensor_model_parallel_rank()
        param_data = param.data
        shard_size = self.intermediate_size
        # Rows of the full intermediate dimension owned by this rank.
        shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
        # DBRX uses GLU for each experts.
        # GLU has 3 linear layers: w1, v1 and w2.
        if weight_name.endswith("w1"):
            if param_name.endswith("weight"):
                # View as (-1, full_intermediate, d_model), then keep this
                # rank's rows in the first half of the fused parameter.
                loaded_weight = torch.reshape(
                    loaded_weight,
                    [-1, self.intermediate_size * self.tp_size, self.d_model],
                )
                param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :]
            elif param_name.endswith("weight_scale"):
                param_data[:, 0] = loaded_weight
            else:
                # Copy in place. A plain ``param_data = loaded_weight`` would
                # only rebind the local name and leave the parameter
                # untouched; the v1 and w2 branches already copy this way.
                param_data[:] = loaded_weight
        if weight_name.endswith("v1"):
            if param_name.endswith("weight"):
                # Same layout as w1, written to the second half.
                loaded_weight = torch.reshape(
                    loaded_weight,
                    [-1, self.intermediate_size * self.tp_size, self.d_model],
                )
                param_data[:, shard_size : 2 * shard_size, :] = loaded_weight[
                    :, shard, :
                ]
            elif param_name.endswith("weight_scale"):
                param_data[:, 1] = loaded_weight
            else:
                param_data[:] = loaded_weight
        if weight_name.endswith("w2"):
            if param_name.endswith("weight"):
                # Swap the last two dims after reshaping so the intermediate
                # dimension is last, then take this rank's slice of it.
                loaded_weight = torch.reshape(
                    loaded_weight,
                    [-1, self.intermediate_size * self.tp_size, self.d_model],
                ).transpose(1, 2)
                param_data[:] = loaded_weight[:, :, shard]
            else:
                param_data[:] = loaded_weight


class DbrxMoE(nn.Module):
    """DBRX mixture-of-experts feed-forward block.

    A ``DbrxRouter`` scores each token against every expert and a
    ``DbrxExperts`` module (built with ``reduce_results=True``) consumes the
    tokens together with those scores to produce the output.
    """

    def __init__(
        self,
        config: DbrxConfig,
        quant_config: QuantizationConfig | None = None,
        params_dtype: torch.dtype | None = None,
        prefix: str = "",
    ):
        """Build the router and the expert bank.

        Args:
            config: DBRX configuration.
            quant_config: Optional quantization configuration for the experts.
            params_dtype: Parameter dtype; the torch default dtype is used
                when this is ``None``.
            prefix: Module name prefix; the experts live under
                ``{prefix}.experts``.
        """
        super().__init__()
        self.d_model = config.d_model
        # Resolve the dtype once so router and experts agree on it.
        self.params_dtype = (
            torch.get_default_dtype() if params_dtype is None else params_dtype
        )

        self.router = DbrxRouter(config, self.params_dtype)

        expert_kwargs = {
            "config": config,
            "quant_config": quant_config,
            "params_dtype": self.params_dtype,
            "prefix": f"{prefix}.experts",
        }
        self.experts = DbrxExperts(**expert_kwargs)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Run the MoE on ``hidden_states`` and return a same-shaped tensor."""
        input_shape = hidden_states.shape
        # Flatten to (num_tokens, d_model) for routing.
        tokens = hidden_states.view(-1, self.d_model)
        # scores: (num_tokens, n_experts)
        scores = self.router(tokens)
        mixed = self.experts(tokens, scores)
        return mixed.view(input_shape)


class DbrxAttention(nn.Module):
    """DBRX self-attention layer.

    Pipeline: fused QKV projection, optional clamping of the QKV values,
    rotary position embedding on Q and K, attention, output projection.
    """

    def __init__(
        self,
        config: DbrxConfig,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        """Build the projections, rotary embedding and attention backend.

        Args:
            config: DBRX configuration (``d_model``, ``n_heads``,
                ``max_seq_len`` and ``attn_config``).
            cache_config: Optional cache configuration for ``Attention``.
            quant_config: Optional quantization configuration.
            prefix: Module name prefix for the sub-layers.
        """
        super().__init__()
        attn_cfg = config.attn_config

        self.d_model = config.d_model
        self.total_num_heads = config.n_heads
        self.head_dim = self.d_model // self.total_num_heads
        self.total_num_kv_heads = attn_cfg.kv_n_heads
        self.clip_qkv = attn_cfg.clip_qkv
        rope_theta = int(attn_cfg.rope_theta)
        self.max_position = config.max_seq_len

        # pylint: disable=invalid-name
        self.Wqkv = QKVParallelLinear(
            self.d_model,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.Wqkv",
        )
        self.out_proj = RowParallelLinear(
            self.d_model,
            self.d_model,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.out_proj",
        )
        self.rotary_emb = get_rope(
            self.head_dim,
            max_position=self.max_position,
            rope_parameters={"rope_type": "default", "rope_theta": rope_theta},
            is_neox_style=True,
        )

        world_size = get_tensor_model_parallel_world_size()
        self.tp_size = world_size
        assert self.total_num_heads % world_size == 0
        self.num_heads = self.total_num_heads // world_size
        if self.total_num_kv_heads < world_size:
            # Fewer KV heads than TP ranks: KV heads are replicated, so the
            # TP size has to be a multiple of the KV head count.
            assert world_size % self.total_num_kv_heads == 0
        else:
            # At least one KV head per TP rank: KV heads are partitioned and
            # must divide evenly.
            assert self.total_num_kv_heads % world_size == 0
        kv_heads_per_rank = self.total_num_kv_heads // world_size
        self.num_kv_heads = max(1, kv_heads_per_rank)

        # Per-rank widths of the Q and K/V slices of the fused projection.
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
        )

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Apply self-attention to ``hidden_states``.

        Args:
            position_ids: Positions passed to the rotary embedding.
            hidden_states: Input activations.

        Returns:
            The output of the output projection.
        """
        fused_qkv, _ = self.Wqkv(hidden_states)
        clip = self.clip_qkv
        if clip is not None:
            # In-place clamp to [-clip, clip].
            fused_qkv.clamp_(min=-clip, max=clip)
        section_sizes = [self.q_size, self.kv_size, self.kv_size]
        query, key, value = fused_qkv.split(section_sizes, dim=-1)
        query, key = self.rotary_emb(position_ids, query, key)
        context = self.attn(query, key, value)
        projected, _ = self.out_proj(context)
        return projected


class DbrxFusedNormAttention(nn.Module):
    """Pre-norm attention sub-block followed by the pre-FFN norm.

    Computes ``h = x + attn(norm_1(x))`` and returns both ``norm_2(h)`` and
    ``h``, so the caller can feed the former to the FFN and use the latter as
    the residual.
    """

    def __init__(
        self,
        config: DbrxConfig,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        """Build the attention layer and the two LayerNorms.

        Args:
            config: DBRX configuration; ``d_model`` sizes the norms.
            cache_config: Optional cache configuration for the attention.
            quant_config: Optional quantization configuration.
            prefix: Module name prefix; attention lives under
                ``{prefix}.attn``.
        """
        super().__init__()
        self.d_model = config.d_model
        self.attn = DbrxAttention(
            config, cache_config, quant_config, prefix=f"{prefix}.attn"
        )
        self.norm_1 = nn.LayerNorm(self.d_model)
        self.norm_2 = nn.LayerNorm(self.d_model)

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Apply norm -> attention -> residual add -> norm.

        Args:
            position_ids: Positions forwarded to the attention layer.
            hidden_states: Input activations.

        Returns:
            A pair ``(normed, residual)``: ``residual`` is the input plus the
            attention output, and ``normed`` is ``norm_2(residual)``.
        """
        residual = hidden_states
        hidden_states = self.norm_1(hidden_states)
        x = self.attn(
            position_ids=position_ids,
            hidden_states=hidden_states,
        )
        # First residual connection, around the attention.
        hidden_states = residual + x
        # Keep the un-normalized sum; the caller adds it back after the FFN.
        residual = hidden_states
        hidden_states = self.norm_2(hidden_states)
        return hidden_states, residual


class DbrxBlock(nn.Module):
    """One DBRX decoder layer: normed attention followed by the MoE FFN."""

    def __init__(
        self,
        config: DbrxConfig,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        """Build the attention sub-block and the MoE feed-forward.

        Args:
            config: DBRX configuration.
            cache_config: Optional cache configuration for the attention.
            quant_config: Optional quantization configuration.
            prefix: Module name prefix for the two sub-modules.
        """
        super().__init__()
        attn_prefix = f"{prefix}.norm_attn_norm"
        ffn_prefix = f"{prefix}.ffn"
        self.norm_attn_norm = DbrxFusedNormAttention(
            config, cache_config, quant_config, prefix=attn_prefix
        )
        self.ffn = DbrxMoE(config, quant_config, prefix=ffn_prefix)

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Run the layer and return the updated hidden states."""
        normed, skip = self.norm_attn_norm(
            position_ids=position_ids,
            hidden_states=hidden_states,
        )
        # Second residual connection, around the feed-forward.
        ffn_out = self.ffn(normed)
        return ffn_out + skip


class DbrxModel(nn.Module):
    """DBRX transformer stack: token embedding, decoder blocks, final norm.

    Also implements checkpoint loading, including the remapping of the DBRX
    expert tensors (``mlp.w1`` / ``mlp.v1`` / ``mlp.w2``) onto the ``w13`` and
    ``w2`` parameters.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the embedding, the decoder blocks and the final LayerNorm.

        Args:
            vllm_config: Source of the HF model config, the cache config and
                the quantization config used to build the layers.
            prefix: Name prefix; the blocks are created under
                ``{prefix}.blocks``.
        """
        super().__init__()

        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        # Kept on the instance because load_weights consults it.
        self.quant_config = quant_config
        self.wte = VocabParallelEmbedding(
            config.vocab_size,
            config.d_model,
        )
        # make_layers returns the layer index range used by forward() plus
        # the block container itself.
        self.start_layer, self.end_layer, self.blocks = make_layers(
            config.n_layers,
            lambda prefix: DbrxBlock(config, cache_config, quant_config, prefix=prefix),
            prefix=f"{prefix}.blocks",
        )
        self.norm_f = nn.LayerNorm(config.d_model, eps=1e-5)
        # Runs after all sub-modules exist so that every module created above
        # is visited.
        for module in self.modules():
            if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
                # Remove the bias term in Linear and LayerNorm.
                module.register_parameter("bias", None)
        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
            ["hidden_states"], config.d_model
        )

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up the token embeddings for ``input_ids``."""
        return self.wte(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
        """Run this pipeline stage's share of the decoder blocks.

        Args:
            input_ids: Token ids; embedded on the first pipeline rank when
                ``inputs_embeds`` is not given.
            position_ids: Positions passed to every block.
            intermediate_tensors: Carries ``"hidden_states"`` from the
                previous stage; required on every rank but the first.
            inputs_embeds: Optional precomputed embeddings used instead of
                ``input_ids`` on the first rank.

        Returns:
            On the last pipeline rank, the hidden states after ``norm_f``;
            otherwise an ``IntermediateTensors`` holding ``"hidden_states"``.
        """
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.embed_input_ids(input_ids)
        else:
            assert intermediate_tensors
            hidden_states = intermediate_tensors["hidden_states"]
        # Only the blocks in [start_layer, end_layer) are executed here.
        for block in islice(self.blocks, self.start_layer, self.end_layer):
            hidden_states = block(position_ids, hidden_states)
        if not get_pp_group().is_last_rank:
            return IntermediateTensors({"hidden_states": hidden_states})
        hidden_states = self.norm_f(hidden_states)
        return hidden_states

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint tensors into this module's parameters.

        Args:
            weights: Iterable of ``(checkpoint_name, tensor)`` pairs.

        Returns:
            The set of parameter names that were recorded as loaded.
        """
        # (param_name, weight_name) pairs: ("w13", "mlp.w1"),
        # ("w13", "mlp.v1") and ("w2", "mlp.w2").
        expert_params_mapping = [
            (
                "w13" if weight_name in ["w1", "v1"] else "w2",
                f"mlp.{weight_name}",
            )
            for weight_name in ["w1", "v1", "w2"]
        ]
        params_dict = dict(self.named_parameters(remove_duplicate=False))
        loaded_params: set[str] = set()

        for name, loaded_weight in weights:
            if self.quant_config is not None and (
                scale_name := self.quant_config.get_cache_scale(name)
            ):
                # Loading kv cache quantization scales
                param = params_dict[scale_name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                # A non-scalar scale tensor is reduced to its first element.
                loaded_weight = (
                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
                )
                weight_loader(param, loaded_weight)
                loaded_params.add(scale_name)
                continue

            # Expert tensors named "...w1" / "...w2" / "...v1" get a
            # "_weight" suffix appended before the name mapping below.
            if name.endswith(("w1", "w2", "v1")):
                name = name + "_weight"
            for param_name, weight_name in expert_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # NOTE(review): this `continue` advances the inner mapping
                # loop (with `name` already rewritten); the for-else below
                # then re-checks the name and skips the tensor.
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                # Expert loaders take the checkpoint-side and model-side
                # names in addition to the parameter and tensor.
                weight_loader(param, loaded_weight, weight_name, name)
                break

            else:
                # Reached only when no mapping entry hit `break` above.
                if is_pp_missing_parameter(name, self):
                    continue
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)
            # Runs after an expert `break` or after the else branch finishes
            # without hitting `continue`.
            loaded_params.add(name)
        return loaded_params


class DbrxForCausalLM(nn.Module, SupportsPP):
    """DBRX causal language model: ``DbrxModel`` backbone plus an LM head."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the backbone, the LM head and the logits processor.

        Args:
            vllm_config: Source of the HF model config and the quantization
                config.
            prefix: Name prefix for the sub-modules.

        Raises:
            ValueError: If the config sets ``tie_word_embeddings``.
        """
        super().__init__()
        hf_config = vllm_config.model_config.hf_config
        quantization = vllm_config.quant_config
        self.config = hf_config
        if hf_config.tie_word_embeddings:
            raise ValueError("tie_word_embeddings is not supported for Dbrx models.")
        self.quant_config = quantization

        backbone_prefix = maybe_prefix(prefix, "transformer")
        self.transformer = DbrxModel(vllm_config=vllm_config, prefix=backbone_prefix)
        self.lm_head = ParallelLMHead(
            hf_config.vocab_size,
            hf_config.d_model,
            quant_config=quantization,
            prefix=maybe_prefix(prefix, "lm_head"),
        )
        self.logits_processor = LogitsProcessor(hf_config.vocab_size)
        # Re-export the backbone's factory under the same attribute name.
        backbone = self.transformer
        self.make_empty_intermediate_tensors = backbone.make_empty_intermediate_tensors

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Delegate the embedding lookup to the backbone."""
        backbone = self.transformer
        return backbone.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
        """Run the backbone and return whatever it produces."""
        return self.transformer(
            input_ids, positions, intermediate_tensors, inputs_embeds
        )

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
        """Turn hidden states into logits through the LM head."""
        return self.logits_processor(self.lm_head, hidden_states)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load ``weights`` with an ``AutoWeightsLoader`` bound to this model."""
        return AutoWeightsLoader(self).load_weights(weights)
Sync from v0.13 2026-01-19 10:38:50 +08:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`

			`from collections.abc import Iterable`
			`from itertools import islice`
init 2026-01-09 13:34:11 +08:00
			`import torch`
			`import torch.nn as nn`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`from transformers import DbrxConfig`

			`from vllm.attention.layer import Attention`
			`from vllm.config import CacheConfig, VllmConfig`
			`from vllm.distributed import (`
			`get_pp_group,`
			`get_tensor_model_parallel_rank,`
			`get_tensor_model_parallel_world_size,`
			`)`
			`from vllm.model_executor.layers.fused_moe import FusedMoE`
			`from vllm.model_executor.layers.linear import (`
			`QKVParallelLinear,`
			`ReplicatedLinear,`
			`RowParallelLinear,`
			`)`
init 2026-01-09 13:34:11 +08:00			`from vllm.model_executor.layers.logits_processor import LogitsProcessor`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`from vllm.model_executor.layers.quantization import QuantizationConfig`
init 2026-01-09 13:34:11 +08:00			`from vllm.model_executor.layers.rotary_embedding import get_rope`
			`from vllm.model_executor.layers.vocab_parallel_embedding import (`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`ParallelLMHead,`
			`VocabParallelEmbedding,`
			`)`
			`from vllm.model_executor.model_loader.weight_utils import (`
			`default_weight_loader,`
			`maybe_remap_kv_scale_name,`
			`)`
			`from vllm.sequence import IntermediateTensors`

			`from .interfaces import SupportsPP`
			`from .utils import (`
			`AutoWeightsLoader,`
			`is_pp_missing_parameter,`
			`make_empty_intermediate_tensors_factory,`
			`make_layers,`
			`maybe_prefix,`
			`)`
init 2026-01-09 13:34:11 +08:00

			`class DbrxRouter(nn.Module):`
			`"""A Router implementation for DBRX that returns logits for each expert`
			`per token.`
			`"""`

			`def __init__(`
			`self,`
			`config: DbrxConfig,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`params_dtype: torch.dtype \| None = None,`
init 2026-01-09 13:34:11 +08:00			`):`
			`super().__init__()`
			`self.tp_size = get_tensor_model_parallel_world_size()`
			`self.num_total_experts = config.ffn_config.moe_num_experts`
			`self.d_model = config.d_model`
			`self.layer = ReplicatedLinear(`
			`self.d_model,`
			`self.num_total_experts,`
			`bias=False,`
			`params_dtype=params_dtype,`
			`quant_config=None,`
			`)`

			`def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:`
			`router_logits, _ = self.layer(hidden_states)`
			`return router_logits`


Sync from v0.13 2026-01-19 10:38:50 +08:00			`class DbrxExperts(FusedMoE):`
			`def __init__(`
			`self,`
			`config: DbrxConfig,`
			`quant_config: QuantizationConfig \| None = None,`
			`params_dtype: torch.dtype \| None = None,`
			`prefix: str = "",`
			`):`
			`super().__init__(`
			`num_experts=config.ffn_config.moe_num_experts,`
			`top_k=config.ffn_config.moe_top_k,`
			`hidden_size=config.d_model,`
			`intermediate_size=config.ffn_config.ffn_hidden_size,`
			`params_dtype=params_dtype,`
			`reduce_results=True,`
			`renormalize=True,`
			`quant_config=quant_config,`
			`tp_size=get_tensor_model_parallel_world_size(),`
			`prefix=prefix,`
			`)`
			`self.config = config`
			`self.d_model = config.d_model`
			`self.intermediate_size = self.config.ffn_config.ffn_hidden_size // self.tp_size`

			`# Define custom weight loader for dbrx model`
			`def weight_loader(`
			`self,`
			`param: nn.Parameter,`
			`loaded_weight: torch.Tensor,`
			`weight_name: str,`
			`param_name: str,`
			`):`
			`tp_rank = get_tensor_model_parallel_rank()`
			`param_data = param.data`
			`shard_size = self.intermediate_size`
			`shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)`
			`# DBRX uses GLU for each experts.`
			`# GLU has 3 linear layers: w1, v1 and w2.`
			`if weight_name.endswith("w1"):`
			`if param_name.endswith("weight"):`
			`loaded_weight = torch.reshape(`
			`loaded_weight,`
			`[-1, self.intermediate_size * self.tp_size, self.d_model],`
			`)`
			`param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :]`
			`elif param_name.endswith("weight_scale"):`
			`param_data[:, 0] = loaded_weight`
			`else:`
			`param_data = loaded_weight`
			`if weight_name.endswith("v1"):`
			`if param_name.endswith("weight"):`
			`loaded_weight = torch.reshape(`
			`loaded_weight,`
			`[-1, self.intermediate_size * self.tp_size, self.d_model],`
			`)`
			`param_data[:, shard_size : 2 * shard_size, :] = loaded_weight[`
			`:, shard, :`
			`]`
			`elif param_name.endswith("weight_scale"):`
			`param_data[:, 1] = loaded_weight`
			`else:`
			`param_data[:] = loaded_weight`
			`if weight_name.endswith("w2"):`
			`if param_name.endswith("weight"):`
			`loaded_weight = torch.reshape(`
			`loaded_weight,`
			`[-1, self.intermediate_size * self.tp_size, self.d_model],`
			`).transpose(1, 2)`
			`param_data[:] = loaded_weight[:, :, shard]`
			`else:`
			`param_data[:] = loaded_weight`


			`class DbrxMoE(nn.Module):`
init 2026-01-09 13:34:11 +08:00			`"""A tensor-parallel MoE implementation for DBRX.`

			`Each expert's weights are sharded across all ranks and a fused MoE`
			`kernel is used for the forward pass, and finally we reduce the outputs`
			`across ranks.`
			`"""`

			`def __init__(`
			`self,`
			`config: DbrxConfig,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`quant_config: QuantizationConfig \| None = None,`
			`params_dtype: torch.dtype \| None = None,`
			`prefix: str = "",`
init 2026-01-09 13:34:11 +08:00			`):`
			`super().__init__()`
			`self.d_model = config.d_model`
			`if params_dtype is None:`
			`params_dtype = torch.get_default_dtype()`
			`self.params_dtype = params_dtype`

			`self.router = DbrxRouter(config, self.params_dtype)`

Sync from v0.13 2026-01-19 10:38:50 +08:00			`self.experts = DbrxExperts(`
			`config=config,`
			`quant_config=quant_config,`
			`params_dtype=self.params_dtype,`
			`prefix=f"{prefix}.experts",`
			`)`
init 2026-01-09 13:34:11 +08:00
			`def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`orig_shape = hidden_states.shape`
init 2026-01-09 13:34:11 +08:00			`hidden_states = hidden_states.view(-1, self.d_model)`
			`# router_logits: (num_tokens, n_experts)`
			`router_logits = self.router(hidden_states)`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`final_hidden_states = self.experts(hidden_states, router_logits)`
			`return final_hidden_states.view(orig_shape)`
init 2026-01-09 13:34:11 +08:00

			`class DbrxAttention(nn.Module):`
			`def __init__(`
			`self,`
			`config: DbrxConfig,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`cache_config: CacheConfig \| None = None,`
			`quant_config: QuantizationConfig \| None = None,`
			`prefix: str = "",`
init 2026-01-09 13:34:11 +08:00			`):`
			`super().__init__()`
			`self.d_model = config.d_model`
			`self.total_num_heads = config.n_heads`
			`self.head_dim = self.d_model // self.total_num_heads`
			`self.total_num_kv_heads = config.attn_config.kv_n_heads`
			`self.clip_qkv = config.attn_config.clip_qkv`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`rope_parameters = {`
			`"rope_type": "default",`
			`"rope_theta": int(config.attn_config.rope_theta),`
			`}`
init 2026-01-09 13:34:11 +08:00			`self.max_position = config.max_seq_len`

			`# pylint: disable=invalid-name`
			`self.Wqkv = QKVParallelLinear(`
			`self.d_model,`
			`self.head_dim,`
			`self.total_num_heads,`
			`self.total_num_kv_heads,`
			`bias=False,`
			`quant_config=quant_config,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`prefix=f"{prefix}.Wqkv",`
init 2026-01-09 13:34:11 +08:00			`)`
			`self.out_proj = RowParallelLinear(`
			`self.d_model,`
			`self.d_model,`
			`bias=False,`
			`quant_config=quant_config,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`prefix=f"{prefix}.out_proj",`
init 2026-01-09 13:34:11 +08:00			`)`
			`self.rotary_emb = get_rope(`
			`self.head_dim,`
			`max_position=self.max_position,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`rope_parameters=rope_parameters,`
init 2026-01-09 13:34:11 +08:00			`is_neox_style=True,`
			`)`

			`tp_world_size = get_tensor_model_parallel_world_size()`
			`self.tp_size = tp_world_size`
			`assert self.total_num_heads % tp_world_size == 0`
			`self.num_heads = self.total_num_heads // tp_world_size`
			`if self.total_num_kv_heads >= tp_world_size:`
			`# Number of KV heads is greater than TP size, so we partition`
			`# the KV heads across multiple tensor parallel GPUs.`
			`assert self.total_num_kv_heads % tp_world_size == 0`
			`else:`
			`# Number of KV heads is less than TP size, so we replicate`
			`# the KV heads across multiple tensor parallel GPUs.`
			`assert tp_world_size % self.total_num_kv_heads == 0`
			`self.num_kv_heads = max(1, self.total_num_kv_heads // tp_world_size)`
			`self.q_size = self.num_heads * self.head_dim`
			`self.kv_size = self.num_kv_heads * self.head_dim`
			`self.scaling = self.head_dim**-0.5`
			`self.attn = Attention(`
			`self.num_heads,`
			`self.head_dim,`
			`self.scaling,`
			`num_kv_heads=self.num_kv_heads,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`cache_config=cache_config,`
			`quant_config=quant_config,`
			`prefix=f"{prefix}.attn",`
init 2026-01-09 13:34:11 +08:00			`)`

			`def forward(`
			`self,`
			`position_ids: torch.Tensor,`
			`hidden_states: torch.Tensor,`
			`) -> torch.Tensor:`
			`qkv, _ = self.Wqkv(hidden_states)`
			`if self.clip_qkv is not None:`
			`qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)`
			`q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)`
			`q, k = self.rotary_emb(position_ids, q, k)`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`attn_output = self.attn(q, k, v)`
init 2026-01-09 13:34:11 +08:00			`hidden_states, _ = self.out_proj(attn_output)`
			`return hidden_states`


			`class DbrxFusedNormAttention(nn.Module):`
			`def __init__(`
			`self,`
			`config: DbrxConfig,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`cache_config: CacheConfig \| None = None,`
			`quant_config: QuantizationConfig \| None = None,`
			`prefix: str = "",`
init 2026-01-09 13:34:11 +08:00			`):`
			`super().__init__()`
			`self.d_model = config.d_model`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`self.attn = DbrxAttention(`
			`config, cache_config, quant_config, prefix=f"{prefix}.attn"`
			`)`
init 2026-01-09 13:34:11 +08:00			`self.norm_1 = nn.LayerNorm(self.d_model)`
			`self.norm_2 = nn.LayerNorm(self.d_model)`

			`def forward(`
			`self,`
			`position_ids: torch.Tensor,`
			`hidden_states: torch.Tensor,`
			`) -> torch.Tensor:`
			`residual = hidden_states`
			`hidden_states = self.norm_1(hidden_states)`
			`x = self.attn(`
			`position_ids=position_ids,`
			`hidden_states=hidden_states,`
			`)`
			`hidden_states = residual + x`
			`residual = hidden_states`
			`hidden_states = self.norm_2(hidden_states)`
			`return hidden_states, residual`


			`class DbrxBlock(nn.Module):`
			`def __init__(`
			`self,`
			`config: DbrxConfig,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`cache_config: CacheConfig \| None = None,`
			`quant_config: QuantizationConfig \| None = None,`
			`prefix: str = "",`
init 2026-01-09 13:34:11 +08:00			`):`
			`super().__init__()`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`self.norm_attn_norm = DbrxFusedNormAttention(`
			`config, cache_config, quant_config, prefix=f"{prefix}.norm_attn_norm"`
			`)`
			`self.ffn = DbrxMoE(config, quant_config, prefix=f"{prefix}.ffn")`
init 2026-01-09 13:34:11 +08:00
			`def forward(`
			`self,`
			`position_ids: torch.Tensor,`
			`hidden_states: torch.Tensor,`
			`) -> torch.Tensor:`
			`hidden_states, residual = self.norm_attn_norm(`
			`position_ids=position_ids,`
			`hidden_states=hidden_states,`
			`)`
			`hidden_states = self.ffn(hidden_states)`
			`hidden_states = hidden_states + residual`
			`return hidden_states`


			`class DbrxModel(nn.Module):`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):`
init 2026-01-09 13:34:11 +08:00			`super().__init__()`
Sync from v0.13 2026-01-19 10:38:50 +08:00
			`config = vllm_config.model_config.hf_config`
			`cache_config = vllm_config.cache_config`
			`quant_config = vllm_config.quant_config`

			`self.quant_config = quant_config`
init 2026-01-09 13:34:11 +08:00			`self.wte = VocabParallelEmbedding(`
			`config.vocab_size,`
			`config.d_model,`
			`)`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`self.start_layer, self.end_layer, self.blocks = make_layers(`
			`config.n_layers,`
			`lambda prefix: DbrxBlock(config, cache_config, quant_config, prefix=prefix),`
			`prefix=f"{prefix}.blocks",`
			`)`
init 2026-01-09 13:34:11 +08:00			`self.norm_f = nn.LayerNorm(config.d_model, eps=1e-5)`
			`for module in self.modules():`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):`
init 2026-01-09 13:34:11 +08:00			`# Remove the bias term in Linear and LayerNorm.`
			`module.register_parameter("bias", None)`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(`
			`["hidden_states"], config.d_model`
			`)`

			`def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:`
			`return self.wte(input_ids)`
init 2026-01-09 13:34:11 +08:00
			`def forward(`
			`self,`
			`input_ids: torch.Tensor,`
			`position_ids: torch.Tensor,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`intermediate_tensors: IntermediateTensors \| None,`
			`inputs_embeds: torch.Tensor \| None = None,`
			`) -> torch.Tensor \| IntermediateTensors:`
			`if get_pp_group().is_first_rank:`
			`if inputs_embeds is not None:`
			`hidden_states = inputs_embeds`
			`else:`
			`hidden_states = self.embed_input_ids(input_ids)`
			`else:`
			`assert intermediate_tensors`
			`hidden_states = intermediate_tensors["hidden_states"]`
			`for block in islice(self.blocks, self.start_layer, self.end_layer):`
			`hidden_states = block(position_ids, hidden_states)`
			`if not get_pp_group().is_last_rank:`
			`return IntermediateTensors({"hidden_states": hidden_states})`
init 2026-01-09 13:34:11 +08:00			`hidden_states = self.norm_f(hidden_states)`
			`return hidden_states`

Sync from v0.13 2026-01-19 10:38:50 +08:00			`def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:`
			`expert_params_mapping = [`
			`(`
			`"w13" if weight_name in ["w1", "v1"] else "w2",`
			`f"mlp.{weight_name}",`
			`)`
			`for weight_name in ["w1", "v1", "w2"]`
			`]`
			`params_dict = dict(self.named_parameters(remove_duplicate=False))`
			`loaded_params: set[str] = set()`

			`for name, loaded_weight in weights:`
			`if self.quant_config is not None and (`
			`scale_name := self.quant_config.get_cache_scale(name)`
			`):`
			`# Loading kv cache quantization scales`
			`param = params_dict[scale_name]`
			`weight_loader = getattr(param, "weight_loader", default_weight_loader)`
			`loaded_weight = (`
			`loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]`
			`)`
			`weight_loader(param, loaded_weight)`
			`loaded_params.add(scale_name)`
			`continue`
init 2026-01-09 13:34:11 +08:00
Sync from v0.13 2026-01-19 10:38:50 +08:00			`if name.endswith(("w1", "w2", "v1")):`
			`name = name + "_weight"`
			`for param_name, weight_name in expert_params_mapping:`
			`if weight_name not in name:`
			`continue`
			`name = name.replace(weight_name, param_name)`
			`if is_pp_missing_parameter(name, self):`
			`continue`
			`param = params_dict[name]`
			`weight_loader = param.weight_loader`
			`weight_loader(param, loaded_weight, weight_name, name)`
			`break`
init 2026-01-09 13:34:11 +08:00
Sync from v0.13 2026-01-19 10:38:50 +08:00			`else:`
			`if is_pp_missing_parameter(name, self):`
			`continue`
			`# Remapping the name of FP8 kv-scale.`
			`name = maybe_remap_kv_scale_name(name, params_dict)`
			`if name is None:`
			`continue`
			`param = params_dict[name]`
			`weight_loader = getattr(param, "weight_loader", default_weight_loader)`
			`weight_loader(param, loaded_weight)`
			`loaded_params.add(name)`
			`return loaded_params`


			`class DbrxForCausalLM(nn.Module, SupportsPP):`
			`def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):`
init 2026-01-09 13:34:11 +08:00			`super().__init__()`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`config = vllm_config.model_config.hf_config`
			`quant_config = vllm_config.quant_config`
init 2026-01-09 13:34:11 +08:00			`self.config = config`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`if config.tie_word_embeddings:`
			`raise ValueError("tie_word_embeddings is not supported for Dbrx models.")`
init 2026-01-09 13:34:11 +08:00			`self.quant_config = quant_config`
Sync from v0.13 2026-01-19 10:38:50 +08:00
			`self.transformer = DbrxModel(`
			`vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")`
			`)`
init 2026-01-09 13:34:11 +08:00			`self.lm_head = ParallelLMHead(`
			`config.vocab_size,`
			`config.d_model,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`quant_config=quant_config,`
			`prefix=maybe_prefix(prefix, "lm_head"),`
init 2026-01-09 13:34:11 +08:00			`)`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`self.logits_processor = LogitsProcessor(config.vocab_size)`
			`self.make_empty_intermediate_tensors = (`
			`self.transformer.make_empty_intermediate_tensors`
			`)`

			`def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:`
			`return self.transformer.embed_input_ids(input_ids)`
init 2026-01-09 13:34:11 +08:00
			`def forward(`
			`self,`
			`input_ids: torch.Tensor,`
			`positions: torch.Tensor,`
Sync from v0.13 2026-01-19 10:38:50 +08:00			`intermediate_tensors: IntermediateTensors \| None = None,`
			`inputs_embeds: torch.Tensor \| None = None,`
			`) -> torch.Tensor \| IntermediateTensors:`
			`hidden_states = self.transformer(`
			`input_ids, positions, intermediate_tensors, inputs_embeds`
			`)`
init 2026-01-09 13:34:11 +08:00			`return hidden_states`

Sync from v0.13 2026-01-19 10:38:50 +08:00			`def compute_logits(`
			`self,`
			`hidden_states: torch.Tensor,`
			`) -> torch.Tensor \| None:`
			`logits = self.logits_processor(self.lm_head, hidden_states)`
init 2026-01-09 13:34:11 +08:00			`return logits`

Sync from v0.13 2026-01-19 10:38:50 +08:00			`def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:`
			`loader = AutoWeightsLoader(self)`
			`return loader.load_weights(weights)`