[FEAT] Add transformers backend support (#5929)
@@ -63,6 +63,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
| `kv_cache_dtype` | Dtype of the kv cache. | `auto` |
| `context_length` | The model's maximum context length. Defaults to None (will use the value from the model's config.json instead). Note that extending the default might lead to strange behavior. | None |
| `device` | The device to put the model on. | None |
| `impl` | The implementation of the model to use. Defaults to the SGLang implementation and falls back to Transformers if needed. | `auto` |
| `served_model_name` | Override the model name returned by the v1/models endpoint in OpenAI API server. | None |
| `is_embedding` | Set to `true` to perform [embedding](./openai_api_embeddings.ipynb) / [encode](https://docs.sglang.ai/backend/native_api#Encode-(embedding-model)) and [reward](https://docs.sglang.ai/backend/native_api#Classify-(reward-model)) tasks. | `False` |
| `revision` | Adjust if a specific version of the model should be used. | None |
@@ -47,6 +47,7 @@ The core features include:
   supported_models/embedding_models.md
   supported_models/reward_models.md
   supported_models/support_new_models.md
   supported_models/transformers_fallback.md

.. toctree::
   :maxdepth: 1

docs/supported_models/transformers_fallback.md (new file, 58 lines)
@@ -0,0 +1,58 @@
# Transformers fallback in SGLang

`sglang` can fall back to using models that are available in `transformers`. This works for most decoder-style language models, and support for vision-language models is coming soon!

## Example Launch Command

By default, SGLang uses its native implementation if one is available and otherwise falls back to the Transformers implementation. You can force the Transformers implementation by setting `impl` to `transformers`.
```shell
python3 -m sglang.launch_server \
  --model-path meta-llama/Llama-3.2-1B-Instruct \
  --host 0.0.0.0 \
  --port 30000 \
  --impl transformers
```
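
Once the server is running, it can be queried as usual. The snippet below is a minimal sketch against the native `/generate` endpoint, assuming the host and port from the command above:

```python
import requests

# Minimal sanity check against the server launched above
# (assumes it is reachable at localhost:30000).
response = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"max_new_tokens": 16, "temperature": 0},
    },
)
print(response.json()["text"])
```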

## Supported features

### Quantization

The Transformers fallback supports most of the quantization methods available in SGLang (except GGUF). See the [Quantization page](https://docs.sglang.ai/backend/quantization.html) for more information about the quantization methods supported in SGLang.
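
As a sketch, a quantization method can also be combined with the Transformers fallback through the offline engine; the model and the `fp8` choice below are illustrative assumptions, not requirements:

```python
import sglang as sgl

# Sketch: offline engine using the Transformers fallback together with a
# quantization method ("fp8" is only an illustrative choice).
llm = sgl.Engine(
    model_path="meta-llama/Llama-3.2-1B-Instruct",
    impl="transformers",
    quantization="fp8",
)
print(llm.generate("The capital of France is", {"max_new_tokens": 16}))
```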

### Remote code

This fallback also means that any model on the Hub that can be used in `transformers` with `trust_remote_code=True` and that correctly implements attention can be used in production!

A model just needs the following two things:

```python
from transformers import PreTrainedModel
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from torch import nn


class MyAttention(nn.Module):

    def forward(self, hidden_states, **kwargs):  # <- kwargs are required
        ...
        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            **kwargs,
        )
        ...


class MyModel(PreTrainedModel):
    _supports_attention_backend = True
```

Here is what happens in the background:

1. The config is loaded.
2. The `MyModel` Python class is loaded from the `auto_map`, and we check that the model sets `_supports_attention_backend`.
3. The `TransformersForCausalLM` backend is used. See `python/sglang/srt/models/transformers.py`, which leverages `self.config._attn_implementation = "sglang"`, thus the need to use `ALL_ATTENTION_FUNCTIONS`.

That's it!
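
For a custom remote-code model, end-to-end usage is then just a matter of pointing SGLang at the repository. The snippet below is a sketch with a placeholder repository name:

```python
import sglang as sgl

# Sketch: serve a custom Hub model that ships its own modeling code.
# "your-org/your-custom-model" is a placeholder; the model must define
# `_supports_attention_backend = True` as shown above.
llm = sgl.Engine(
    model_path="your-org/your-custom-model",
    impl="transformers",
    trust_remote_code=True,
)
```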
@@ -16,7 +16,7 @@ import json
import logging
import math
import os
from enum import IntEnum, auto
from enum import Enum, IntEnum, auto
from typing import List, Optional, Set, Union

import torch
@@ -39,6 +39,12 @@ class AttentionArch(IntEnum):
    MHA = auto()


class ModelImpl(str, Enum):
    AUTO = "auto"
    SGLANG = "sglang"
    TRANSFORMERS = "transformers"


class ModelConfig:
    def __init__(
        self,
@@ -53,11 +59,13 @@ class ModelConfig:
        quantization: Optional[str] = None,
        override_config_file: Optional[str] = None,
        is_draft_model: bool = False,
        impl: Union[str, ModelImpl] = ModelImpl.AUTO,
    ) -> None:
        self.model_path = model_path
        self.revision = revision
        self.quantization = quantization
        self.impl = impl

        # Parse args
        self.maybe_pull_model_tokenizer_from_remote()
@@ -256,6 +264,7 @@ class ModelConfig:
            enable_multimodal=server_args.enable_multimodal,
            dtype=server_args.dtype,
            quantization=server_args.quantization,
            impl=server_args.impl,
            **kwargs,
        )
@@ -2,12 +2,17 @@
"""Utilities for selecting and loading models."""

import contextlib
import logging
from typing import Tuple, Type

import torch
import transformers
from torch import nn
from transformers.dynamic_module_utils import get_class_from_dynamic_module

from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.configs.model_config import ModelConfig, ModelImpl

logger = logging.getLogger(__name__)


@contextlib.contextmanager
@@ -19,6 +24,61 @@ def set_default_torch_dtype(dtype: torch.dtype):
    torch.set_default_dtype(old_dtype)


def resolve_transformers_arch(model_config: ModelConfig, architectures: list[str]):
    for i, arch in enumerate(architectures):
        if arch == "TransformersForCausalLM":
            continue
        auto_map: dict[str, str] = (
            getattr(model_config.hf_config, "auto_map", None) or dict()
        )
        # Make sure that the config class is always initialized before the model
        # class, otherwise the model class won't be able to access the config
        # class. The expected auto_map should have the correct order, like:
        # "auto_map": {
        #     "AutoConfig": "<your-repo-name>--<config-name>",
        #     "AutoModel": "<your-repo-name>--<config-name>",
        #     "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
        # },
        auto_modules = {
            name: get_class_from_dynamic_module(
                module, model_config.model_path, revision=model_config.revision
            )
            for name, module in sorted(auto_map.items(), key=lambda x: x[0])
        }
        model_module = getattr(transformers, arch, None)
        if model_module is None:
            if "AutoModel" not in auto_map:
                raise ValueError(
                    f"Cannot find model module. '{arch}' is not a registered "
                    "model in the Transformers library (only relevant if the "
                    "model is meant to be in Transformers) and 'AutoModel' is "
                    "not present in the model config's 'auto_map' (relevant "
                    "if the model is custom)."
                )
            model_module = auto_modules["AutoModel"]
        if model_config.impl == ModelImpl.TRANSFORMERS:
            if not model_module.is_backend_compatible():
                raise ValueError(
                    f"The Transformers implementation of {arch} is not "
                    "compatible with SGLang."
                )
            architectures[i] = "TransformersForCausalLM"
        if model_config.impl == ModelImpl.AUTO:
            if not model_module.is_backend_compatible():
                raise ValueError(
                    f"{arch} has no SGLang implementation and the Transformers "
                    "implementation is not compatible with SGLang."
                )
            logger.warning(
                "%s has no SGLang implementation, falling back to Transformers "
                "implementation. Some features may not be supported and "
                "performance may not be optimal.",
                arch,
            )
            architectures[i] = "TransformersForCausalLM"
    return architectures


def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
    from sglang.srt.models.registry import ModelRegistry

@@ -34,6 +94,12 @@ def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module],
    ):
        architectures = ["QuantMixtralForCausalLM"]

    supported_archs = ModelRegistry.get_supported_archs()
    is_native_supported = any(arch in supported_archs for arch in architectures)

    if not is_native_supported or model_config.impl == ModelImpl.TRANSFORMERS:
        architectures = resolve_transformers_arch(model_config, architectures)

    return ModelRegistry.resolve_model_cls(architectures)
@@ -49,7 +49,15 @@ class _ModelRegistry:
        if not architectures:
            logger.warning("No model architectures are specified")

        return architectures
        # filter out supported architectures
        normalized_arch = list(
            filter(lambda model: model in self.models, architectures)
        )

        # make sure the Transformers backend is put last, as a fallback
        if len(normalized_arch) != len(architectures):
            normalized_arch.append("TransformersForCausalLM")
        return normalized_arch

    def resolve_model_cls(
        self,
python/sglang/srt/models/transformers.py (new file, 291 lines)
@@ -0,0 +1,291 @@
# Copyright 2025 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Adapted from
# https://github.com/vllm-project/vllm/blob/a1a2aaadb9122f05667140e39cf67e5736c8b6d6/vllm/model_executor/models/transformers.py
"""Wrapper around `transformers` models"""
import logging
import re
from typing import Iterable, Literal, Optional, Tuple, Union

import torch
from torch import nn
from transformers import AutoModel, PretrainedConfig, PreTrainedModel
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

from sglang.srt.distributed import divide, get_tensor_model_parallel_world_size
from sglang.srt.layers.linear import (
    ColumnParallelLinear,
    ReplicatedLinear,
    RowParallelLinear,
)
from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader

logger = logging.getLogger(__name__)


def maybe_prefix(prefix: str, name: str) -> str:
    """Add a prefix to a name if the prefix is non-empty.

    Args:
        prefix: The prefix to add. If empty, no prefix will be added.
        name: The name to potentially prefix.

    Returns:
        The string "prefix.name" if prefix was non-empty, otherwise just "name".
    """
    return name if not prefix else f"{prefix}.{name}"


def sglang_flash_attention_forward(
    # Transformers args
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor,
    # sglang kwargs
    forward_batch: ForwardBatch,
    # Transformers kwargs
    scaling: float = None,
    attention_instances: list[RadixAttention] = None,
    **kwargs,
):
    self_attn: RadixAttention = attention_instances[module.layer_idx]
    if scaling is not None:
        self_attn.scaling = float(scaling)
    # Transformers passes (batch, num_heads, seq_len, head_dim) tensors;
    # RadixAttention expects flattened (num_tokens, num_heads * head_dim).
    hidden = query.shape[-2]
    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
    query, key, value = (x.reshape(hidden, -1) for x in (query, key, value))
    return self_attn.forward(query, key, value, forward_batch=forward_batch), None


ALL_ATTENTION_FUNCTIONS["sglang"] = sglang_flash_attention_forward


class HFColumnParallelLinear(ColumnParallelLinear):

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return super().forward(input)[0]


class HFRowParallelLinear(RowParallelLinear):

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return super().forward(input)[0]


def replace_linear_class(
    linear: nn.Linear,
    style: Literal["colwise", "rowwise"],
    quant_config: QuantizationConfig,
) -> Union[ColumnParallelLinear, RowParallelLinear]:
    """
    Replace nn.Linear with one of SGLang's tensor parallel linear classes.

    Args:
        linear (nn.Linear): `nn.Linear` to be replaced.
        style (str): Tensor parallel style of the new linear, e.g. "colwise".
        quant_config (QuantConfig): Quantization config for the new linear.
    Returns:
        Union[ColumnParallelLinear, RowParallelLinear]: The new linear.
    """

    if not isinstance(style, str):
        raise ValueError(f"Unsupported parallel style type {type(style)}, expected str")

    sglang_linear_cls = {
        "colwise": ColumnParallelLinear,
        "rowwise": RowParallelLinear,
    }.get(style, ReplicatedLinear)

    class HFCompatibleLinear(sglang_linear_cls):
        """
        Wrapper class that removes `output_bias` from returned output.
        """

        @property
        def parent_cls(self) -> type:
            return sglang_linear_cls

        def forward(self, input: torch.Tensor) -> torch.Tensor:
            return super().forward(input)[0]

    return HFCompatibleLinear(
        input_size=linear.in_features,
        output_size=linear.out_features,
        bias=linear.bias is not None,
        quant_config=quant_config,
    )


class TransformersForCausalLM(nn.Module):

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        logger.info("Using Transformers backend.")

        self.quant_config = quant_config
        self.config = config
        self.vocab_size = config.vocab_size
        self.unpadded_vocab_size = config.vocab_size

        # model is loaded under set_default_torch_dtype(model_config.dtype)
        self.model: PreTrainedModel = AutoModel.from_config(
            self.config,
            torch_dtype=torch.get_default_dtype(),
            attn_implementation="sglang",
            trust_remote_code=True,
        )

        # Attention modifications (assumes 1 attention op per hidden layer)
        tp_size = get_tensor_model_parallel_world_size()

        # MLP modifications
        self.tensor_parallel(tp_size)

        head_dim = (
            (config.hidden_size // config.num_attention_heads)
            if not hasattr(config, "head_dim")
            else config.head_dim
        )
        self.attention_instances = [
            RadixAttention(
                num_heads=divide(config.num_attention_heads, tp_size),
                head_dim=head_dim,
                # NOTE: We use Llama scale as default, if it's set by
                # Transformers, it's updated in sglang_flash_attention_forward
                scaling=head_dim**-0.5,
                num_kv_heads=divide(config.num_key_value_heads, tp_size),
                layer_id=i,
                quant_config=self.quant_config,
                prefix=f"{i}.attn",
            )
            for i in range(config.num_hidden_layers)
        ]

        # Model modifications
        self.replace_vocab_embed_class(self.model)

        # ForCausalLM modifications
        self.lm_head = ParallelLMHead(
            config.vocab_size,
            config.hidden_size,
            quant_config=self.quant_config,
            prefix=maybe_prefix(prefix, "lm_head"),
        )
        if config.tie_word_embeddings:
            self.lm_head.weight = self.model.get_input_embeddings().weight

        self.logits_processor = LogitsProcessor(config)

    def log_replacement(self, name: str, old_module: nn.Module, new_module: nn.Module):
        logger.debug("%s: %s -> %s", name, old_module, new_module)

    def tensor_parallel(self, tp_size: int):
        """
        Apply the model's tensor parallelization plan.
        Currently only supports linear layers.
        """
        if not self.model.supports_tp_plan:
            if tp_size <= 1:
                return

            raise ValueError(
                f"{type(self.model)} does not support tensor parallel yet!"
            )

        tp_plan = self.model._tp_plan

        def _tensor_parallel(module: nn.Module, prefix: str = ""):
            for child_name, child_module in module.named_children():
                qual_name = maybe_prefix(prefix, child_name)
                for pattern, style in tp_plan.items():
                    if re.match(pattern, qual_name) and isinstance(
                        child_module, nn.Linear
                    ):
                        new_module = replace_linear_class(
                            child_module, style, self.quant_config
                        )
                        setattr(module, child_name, new_module)
                        self.log_replacement(qual_name, child_module, new_module)
                else:
                    _tensor_parallel(child_module, prefix=qual_name)

        _tensor_parallel(self.model)

    def replace_vocab_embed_class(self, module: nn.Module):
        # Use native set input embeddings
        new_module = VocabParallelEmbedding(
            self.vocab_size,
            self.config.hidden_size,
            org_num_embeddings=self.config.vocab_size,
            quant_config=None,
        )
        self.log_replacement(
            "input embedding", self.model.get_input_embeddings(), new_module
        )
        self.model.set_input_embeddings(new_module)

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: torch.Tensor = None,
        get_embedding: bool = False,
    ) -> LogitsProcessorOutput:
        assert get_embedding is False, "embedding is not supported yet"
        aux_hidden_states = None
        hidden_states = self.model(
            input_ids[None, ...],
            use_cache=False,
            position_ids=positions[None, ...],
            forward_batch=forward_batch,
            attention_instances=self.attention_instances,
            return_dict=False,
        )[0][
            0, ...
        ]  # we remove batch dimension for now

        return self.logits_processor(
            input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
        )

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            if name not in params_dict:
                name = f"{self.model.base_model_prefix}.{name}"
            if name in params_dict:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)


EntryClass = [TransformersForCausalLM]
@@ -61,6 +61,7 @@ class ServerArgs:
    is_embedding: bool = False
    enable_multimodal: Optional[bool] = None
    revision: Optional[str] = None
    impl: str = "auto"

    # Port for the HTTP server
    host: str = "127.0.0.1"
@@ -726,6 +727,18 @@ class ServerArgs:
            default=ServerArgs.page_size,
            help="The number of tokens in a page.",
        )
        parser.add_argument(
            "--impl",
            type=str,
            default=ServerArgs.impl,
            help="Which implementation of the model to use.\n\n"
            '* "auto" will try to use the SGLang implementation if it exists '
            "and fall back to the Transformers implementation if no SGLang "
            "implementation is available.\n"
            '* "sglang" will use the SGLang model implementation.\n'
            '* "transformers" will use the Transformers model '
            "implementation.\n",
        )

        # Other runtime options
        parser.add_argument(
||||
@@ -455,6 +455,7 @@ class SRTRunner:
        torch_dtype: torch.dtype,
        model_type: str,
        tp_size: int = 1,
        impl: str = "auto",
        port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
        lora_paths: List[str] = None,
        max_loras_per_batch: int = 4,
@@ -475,6 +476,7 @@ class SRTRunner:
        speculative_num_draft_tokens: Optional[int] = None,
        disable_overlap_schedule: bool = False,
        disable_custom_all_reduce: bool = False,
        torchao_config: Optional[str] = None,
    ):
        self.model_type = model_type
        self.is_generation = model_type == "generation"
@@ -493,6 +495,8 @@ class SRTRunner:
            tp_size=tp_size,
            dtype=get_dtype_str(torch_dtype),
            port=port,
            impl=impl,
            torchao_config=torchao_config,
            mem_fraction_static=mem_fraction_static,
            trust_remote_code=trust_remote_code,
            is_embedding=not self.is_generation,

test/srt/models/test_transformers_models.py (new file, 181 lines)
@@ -0,0 +1,181 @@
import dataclasses
import multiprocessing as mp
import unittest
from types import SimpleNamespace
from typing import List

import torch

from sglang.srt.utils import kill_process_tree
from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner, check_close_model_outputs
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
)


class TestTransformersFallbackEndpoint(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--impl", "transformers"],
        )
        cls.mmlu_lower_bound = 0.65
        cls.gsm8k_lower_bound = 0.65

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
        )
        from sglang.test.run_eval import run_eval

        metrics = run_eval(args)
        self.assertGreaterEqual(metrics["score"], self.mmlu_lower_bound)

    def test_gsm8k(self):
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        from sglang.test.few_shot_gsm8k import run_eval

        metrics = run_eval(args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], self.gsm8k_lower_bound)


class TestTransformersFallbackTorchAO(TestTransformersFallbackEndpoint):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--impl",
                "transformers",
                "--torchao-config",
                "int4wo-128",
            ],
        )
        cls.mmlu_lower_bound = 0.65
        cls.gsm8k_lower_bound = 0.65


@dataclasses.dataclass
class ModelCase:
    model_path: str
    tp_size: int = 1
    prefill_tolerance: float = 5e-2
    decode_tolerance: float = 5e-2
    rouge_l_tolerance: float = 1
    skip_long_prompt: bool = False
    trust_remote_code: bool = False
    torchao_config: str = None
    torch_dtype: torch.dtype = torch.float16


# Popular models that run on the CI
CI_MODELS = [
    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST),
]

ALL_OTHER_MODELS = [
    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST, tp_size=2),
]


class TestTransformersFallbackEngine(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        mp.set_start_method("spawn", force=True)

    def assert_close_logits_and_output_strs(
        self,
        prompts: List[str],
        model_case: ModelCase,
    ) -> None:
        model_path = model_case.model_path
        max_new_tokens = 32
        # force to use transformers impl
        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=model_case.torch_dtype,
            model_type="generation",
            impl="transformers",
            trust_remote_code=model_case.trust_remote_code,
            torchao_config=model_case.torchao_config,
        ) as srt_runner:
            srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)

        # default impl ("auto", i.e. the native SGLang implementation) for comparison
        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=model_case.torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
            torchao_config=model_case.torchao_config,
        ) as srt_runner:
            srt_transformers_outputs = srt_runner.forward(
                prompts, max_new_tokens=max_new_tokens
            )

        check_close_model_outputs(
            hf_outputs=srt_transformers_outputs,
            srt_outputs=srt_outputs,
            prefill_tolerance=model_case.prefill_tolerance,
            decode_tolerance=model_case.decode_tolerance,
            rouge_l_tolerance=model_case.rouge_l_tolerance,
            debug_text=f"model_path={model_path} prompts={prompts}",
        )

    def test_ci_models(self):
        for model_case in CI_MODELS:
            # Skip long prompts for models that do not have a long context
            prompts = DEFAULT_PROMPTS
            if model_case.skip_long_prompt:
                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
            # Assert the logits and output strs are close
            self.assert_close_logits_and_output_strs(prompts, model_case)

    def test_others(self):
        if is_in_ci():
            return

        # Skip long prompts for models that do not have a long context
        prompts = DEFAULT_PROMPTS
        for model_case in ALL_OTHER_MODELS:
            if model_case.skip_long_prompt:
                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]

            # Assert the logits and output strs are close
            self.assert_close_logits_and_output_strs(prompts, model_case)


if __name__ == "__main__":
    unittest.main()
@@ -26,6 +26,7 @@ suites = {
        TestFile("models/test_qwen_models.py", 82),
        TestFile("models/test_reward_models.py", 132),
        TestFile("models/test_vlm_models.py", 437),
        TestFile("models/test_transformers_models.py", 320),
        TestFile("test_abort.py", 51),
        TestFile("test_block_int8.py", 22),
        TestFile("test_create_kvindices.py", 2),