Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/models/quantization/init.py
+++ b/tests/models/quantization/init.py
--- a/tests/models/quantization/test_awq.py
+++ b/tests/models/quantization/test_awq.py
@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+import torch
+
+from vllm.multimodal.image import rescale_image_size
+
+from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
+from ..utils import check_logprobs_close
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
+        "cherry_blossom": "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
+    }
+)
+
+
+def run_awq_test(
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+    source_model: str,
+    quant_model: str,
+    *,
+    size_factors: list[float],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: str | None = None,
+):
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [
+        (
+            [prompt for _ in size_factors],
+            [rescale_image_size(image, factor) for factor in size_factors],
+        )
+        for image, prompt in zip(images, HF_IMAGE_PROMPTS)
+    ]
+
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(
+        source_model,
+        max_model_len=4096,
+        dtype=dtype,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=True,
+        default_torch_num_threads=1,
+    ) as vllm_model:
+        source_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs=num_logprobs, images=images
+            )
+            for prompts, images in inputs_per_image
+        ]
+
+    with vllm_runner(
+        quant_model,
+        quantization="awq",
+        max_model_len=4096,
+        dtype=dtype,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=True,
+        default_torch_num_threads=1,
+    ) as vllm_model:
+        quant_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs=num_logprobs, images=images
+            )
+            for prompts, images in inputs_per_image
+        ]
+
+    for source_outputs, quant_outputs in zip(
+        source_outputs_per_image, quant_outputs_per_image
+    ):
+        # TODO: Check whether using original CLIPVisionModel can improve
+        # consistency against HF
+        check_logprobs_close(
+            outputs_0_lst=source_outputs,
+            outputs_1_lst=quant_outputs,
+            name_0="source",
+            name_1="awq",
+        )
+
+
+@pytest.mark.parametrize(
+    ("source_model", "quant_model"),
+    [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
+)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@torch.inference_mode()
+def test_awq_models(
+    vllm_runner,
+    image_assets,
+    source_model,
+    quant_model,
+    size_factors,
+    dtype,
+    max_tokens,
+    num_logprobs,
+) -> None:
+    run_awq_test(
+        vllm_runner,
+        image_assets,
+        source_model,
+        quant_model,
+        size_factors=size_factors,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
--- a/tests/models/quantization/test_bitblas.py
+++ b/tests/models/quantization/test_bitblas.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Compare the outputs of a GPTQ model to a bitblas model.
+
+Note: GPTQ and bitblas do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+bitblas/GPTQ models are in the top 3 selections of each other.
+
+Note: bitblas internally uses locks to synchronize the threads. This can
+result in very slight nondeterminism for bitblas. As a result, we re-run the
+test up to 3 times to see if we pass.
+"""
+
+from dataclasses import dataclass
+
+import pytest
+
+from ..utils import check_logprobs_close
+
+
+@dataclass
+class ModelPair:
+    model_bitblas: str
+    model_gptq: str
+
+
+model_pairs = [
+    ModelPair(
+        model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
+        model_gptq="hxbgsyxh/opt-125m-4bit-128g",
+    ),
+]
+
+
+@pytest.mark.flaky(reruns=2)
+@pytest.mark.skipif(True, reason="BitBLAS takes too much time for tuning.")
+@pytest.mark.parametrize("model_pair", model_pairs)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model_pair: ModelPair,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with vllm_runner(
+        model_pair.model_bitblas, dtype=dtype, quantization="bitblas"
+    ) as bitblas_model:
+        bitblas_outputs = bitblas_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="gptq"
+    ) as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    check_logprobs_close(
+        outputs_0_lst=gptq_outputs,
+        outputs_1_lst=bitblas_outputs,
+        name_0="gptq",
+        name_1="bitblas",
+    )
--- a/tests/models/quantization/test_bitsandbytes.py
+++ b/tests/models/quantization/test_bitsandbytes.py
@@ -0,0 +1,290 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests whether bitsandbytes computation is enabled correctly.
+
+Run `pytest tests/quantization/test_bitsandbytes.py`.
+"""
+
+import pytest
+from transformers import BitsAndBytesConfig
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm.platforms import current_platform
+
+from ...utils import compare_two_settings, multi_gpu_test
+from ..utils import check_embeddings_close, check_logprobs_close
+
+if current_platform.is_rocm():
+    from vllm.platforms.rocm import on_gfx9
+
+    pytestmark = pytest.mark.skipif(
+        on_gfx9(),
+        reason="bitsandbytes not supported on gfx9 (warp size 64 limitation)",
+    )
+
+models_4bit_to_test = [
+    ("facebook/opt-125m", "quantize opt model inflight"),
+    (
+        "mistralai/Mistral-7B-Instruct-v0.3",
+        "quantize inflight model with both HF and Mistral format weights",
+    ),
+]
+
+models_4bit_to_embedding_test = [
+    ("intfloat/e5-mistral-7b-instruct", "quantize embedding model inflight"),
+]
+
+models_4bit_to_moe_test = [
+    ("allenai/OLMoE-1B-7B-0125-Instruct", "quantize moe model inflight"),
+]
+
+models_pre_qaunt_4bit_to_test = [
+    (
+        "PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed",
+        "read pre-quantized 4-bit FP4 model",
+    ),
+    ("poedator/opt-125m-bnb-4bit", "read pre-quantized 4-bit NF4 opt model"),
+]
+
+models_pre_quant_8bit_to_test = [
+    ("meta-llama/Llama-Guard-3-8B-INT8", "read pre-quantized llama 8-bit model"),
+    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
+]
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
+def test_load_4bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
+    validate_generated_texts(
+        hf_runner, vllm_runner, example_prompts[:1], model_name, False, hf_model_kwargs
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_pre_qaunt_4bit_to_test)
+def test_load_pre_quant_4bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    validate_generated_texts(
+        hf_runner, vllm_runner, example_prompts[:1], model_name, True
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_pre_quant_8bit_to_test)
+def test_load_8bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    validate_generated_texts(
+        hf_runner, vllm_runner, example_prompts[:1], model_name, True
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
+@multi_gpu_test(num_gpus=2)
+def test_load_tp_4bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
+    validate_generated_texts(
+        hf_runner,
+        vllm_runner,
+        example_prompts[:1],
+        model_name,
+        False,
+        hf_model_kwargs,
+        vllm_tp_size=2,
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
+@multi_gpu_test(num_gpus=2)
+def test_load_pp_4bit_bnb_model(model_name, description) -> None:
+    common_args = [
+        "--disable-log-stats",
+        "--dtype",
+        "bfloat16",
+        "--enable-prefix-caching",
+        "--quantization",
+        "bitsandbytes",
+        "--gpu-memory-utilization",
+        "0.7",
+    ]
+    pp_args = [
+        *common_args,
+        "--pipeline-parallel-size",
+        "2",
+    ]
+    compare_two_settings(model_name, common_args, pp_args)
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
+def test_4bit_bnb_moe_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    hf_model_kwargs = dict(
+        quantization_config=BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+        )
+    )
+    with vllm_runner(
+        model_name,
+        quantization="bitsandbytes",
+        enforce_eager=False,
+        default_torch_num_threads=1,
+    ) as llm:
+        vllm_outputs = llm.generate_greedy_logprobs(
+            example_prompts, max_tokens=32, num_logprobs=5
+        )
+
+    with hf_runner(
+        model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
+    ) as llm:
+        transformers_outputs = llm.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens=32, num_logprobs=5
+        )
+    check_logprobs_close(
+        outputs_0_lst=transformers_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="transformers",
+        name_1="vllm",
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_4bit_to_embedding_test)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_4bit_bnb_embedding_model(
+    model_name,
+    description,
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    dtype: str,
+) -> None:
+    # The example_prompts has ending "\n", for example:
+    # "Write a short story about a robot that dreams for the first time.\n"
+    # sentence_transformers will strip the input texts, see:
+    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
+    # This makes the input_ids different between hf_model and vllm_model.
+    # So we need to strip the input texts to avoid test failing.
+    example_prompts = [str(s).strip() for s in example_prompts]
+
+    # Inflight 4bit quantization
+    with vllm_runner(
+        model_name,
+        runner="pooling",
+        dtype=dtype,
+        gpu_memory_utilization=0.5,
+        quantization="bitsandbytes",
+        default_torch_num_threads=1,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.embed(example_prompts)
+
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
+    with hf_runner(
+        model_name,
+        dtype=dtype,
+        model_kwargs=hf_model_kwargs,
+        is_sentence_transformer=True,
+        default_torch_num_threads=1,
+    ) as hf_model:
+        hf_outputs = hf_model.encode(example_prompts)
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+        tol=5e-2,
+    )
+
+
+def log_generated_texts(prompts, outputs, runner_name):
+    logged_texts = []
+    for i, (_, generated_text) in enumerate(outputs):
+        log_entry = {
+            "prompt": prompts[i],
+            "runner_name": runner_name,
+            "generated_text": generated_text,
+        }
+        logged_texts.append(log_entry)
+    return logged_texts
+
+
+def validate_generated_texts(
+    hf_runner,
+    vllm_runner,
+    prompts,
+    model_name,
+    pre_quant=False,
+    hf_model_kwargs=None,
+    vllm_tp_size=1,
+    max_tokens=8,
+):
+    # NOTE: run vLLM first, as it requires a clean process
+    # when using distributed inference
+    with vllm_runner(
+        model_name,
+        quantization=None if pre_quant else "bitsandbytes",
+        tensor_parallel_size=vllm_tp_size,
+        enforce_eager=False,
+        default_torch_num_threads=1,
+        tokenizer_mode="hf",
+        load_format="hf",
+        config_format="hf",
+    ) as llm:
+        vllm_outputs = llm.generate_greedy(prompts, max_tokens)
+        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
+
+    if hf_model_kwargs is None:
+        hf_model_kwargs = {}
+
+    # Run with HF runner
+    with hf_runner(
+        model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
+    ) as llm:
+        hf_outputs = llm.generate_greedy(prompts, max_tokens)
+        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
+
+    # Compare the generated strings
+    for hf_log, vllm_log in zip(hf_logs, vllm_logs):
+        hf_str = hf_log["generated_text"]
+        vllm_str = vllm_log["generated_text"]
+        prompt = hf_log["prompt"]
+        assert hf_str == vllm_str, (
+            f"Model: {model_name}"
+            f"Mismatch between HF and vLLM outputs:\n"
+            f"Prompt: {prompt}\n"
+            f"HF Output: '{hf_str}'\n"
+            f"vLLM Output: '{vllm_str}'"
+        )
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# flake8: noqa
+"""Tests fp8 models against ground truth generation
+Note: these tests will only pass on L4 GPU.
+"""
+
+import pytest
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
+from vllm.platforms import current_platform
+from ..utils import check_logprobs_close
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="fp8 is not supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "kv_cache_dtype,base_model,test_model",
+    [
+        # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
+        (
+            "fp8_e4m3",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "nm-testing/Llama-3.2-1B-Instruct-FP8-KV",
+        ),
+        # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
+        (
+            "fp8_e5m2",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "meta-llama/Llama-3.2-1B-Instruct",
+        ),
+        # Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+        (
+            "fp8_e4m3",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "meta-llama/Llama-3.2-1B-Instruct",
+        ),
+    ],
+)
+# Due to low-precision numerical divergence, we only test logprob of 4 tokens
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("enforce_eager", [True])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
+# NOTE: Increasing this in this suite will fail CI because we currently cannot
+# reset distributed env properly. Use a value > 1 just when you test.
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    kv_cache_dtype: str,
+    base_model: str,
+    test_model: str,
+    max_tokens: int,
+    enforce_eager: bool,
+    backend: str,
+    tensor_parallel_size: int,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """
+    Only checks log probs match to cover the discrepancy in
+    numerical sensitive kernels.
+    """
+
+    if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
+        pytest.skip(f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
+
+    if not flash_attn_supports_fp8():
+        pytest.skip(
+            f"{kv_cache_dtype} is not supported on this GPU type with {backend} attention."
+        )
+
+    with monkeypatch.context() as m:
+        m.setenv("TOKENIZERS_PARALLELISM", "true")
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)
+
+        MAX_MODEL_LEN = 1024
+        NUM_LOG_PROBS = 8
+
+        with vllm_runner(
+            base_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype="auto",
+        ) as vllm_model:
+            baseline_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )
+
+        with vllm_runner(
+            test_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype=kv_cache_dtype,
+        ) as vllm_model:
+            test_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )
+
+        check_logprobs_close(
+            outputs_0_lst=baseline_outputs,
+            outputs_1_lst=test_outputs,
+            name_0="fp16_kv_cache",
+            name_1="fp8_kv_cache",
+        )
+
+
+@pytest.mark.cpu_model
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="test for the CPU backend.")
+@pytest.mark.parametrize(
+    "kv_cache_dtype,base_model,test_model",
+    [
+        # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
+        (
+            "fp8_e5m2",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "meta-llama/Llama-3.2-1B-Instruct",
+        ),
+    ],
+)
+# Due to low-precision numerical divergence, we only test logprob of 4 tokens
+@pytest.mark.parametrize("max_tokens", [4])
+def test_cpu_models(
+    vllm_runner,
+    example_prompts,
+    kv_cache_dtype: str,
+    base_model: str,
+    test_model: str,
+    max_tokens: int,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """
+    Only checks log probs match to cover the discrepancy in
+    numerical sensitive kernels.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("TOKENIZERS_PARALLELISM", "true")
+
+        MAX_MODEL_LEN = 1024
+        NUM_LOG_PROBS = 8
+
+        with vllm_runner(
+            base_model,
+            max_model_len=MAX_MODEL_LEN,
+            dtype="bfloat16",
+            kv_cache_dtype="auto",
+        ) as vllm_model:
+            baseline_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )
+
+        with vllm_runner(
+            test_model,
+            max_model_len=MAX_MODEL_LEN,
+            dtype="bfloat16",
+            kv_cache_dtype=kv_cache_dtype,
+        ) as vllm_model:
+            test_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )
+
+        check_logprobs_close(
+            outputs_0_lst=baseline_outputs,
+            outputs_1_lst=test_outputs,
+            name_0="bf16_kv_cache",
+            name_1="fp8_kv_cache",
+        )
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -0,0 +1,204 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests gguf models against unquantized models generations
+Note: To pass the test, quantization higher than Q4 should be used
+"""
+
+import os
+from typing import NamedTuple
+
+import pytest
+from huggingface_hub import hf_hub_download
+from pytest import MarkDecorator
+from transformers import AutoTokenizer
+
+from tests.quantization.utils import is_quant_method_supported
+
+from ...conftest import VllmRunner
+from ...utils import multi_gpu_test
+from ..utils import check_logprobs_close
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024
+
+
+class GGUFTestConfig(NamedTuple):
+    original_model: str
+    gguf_repo: str
+    gguf_filename: str
+    marks: list[MarkDecorator] = []
+
+    @property
+    def gguf_model(self):
+        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)
+
+
+LLAMA_CONFIG = GGUFTestConfig(
+    original_model="meta-llama/Llama-3.2-1B-Instruct",
+    gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
+    gguf_filename="Llama-3.2-1B-Instruct-Q6_K.gguf",
+)
+
+QWEN2_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen2.5-1.5B-Instruct",
+    gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
+    gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
+)
+
+QWEN3_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen3-0.6B",
+    gguf_repo="unsloth/Qwen3-0.6B-GGUF",
+    gguf_filename="Qwen3-0.6B-BF16.gguf",
+)
+
+PHI3_CONFIG = GGUFTestConfig(
+    original_model="microsoft/Phi-3.5-mini-instruct",
+    gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
+    gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf",
+)
+
+GPT2_CONFIG = GGUFTestConfig(
+    original_model="openai-community/gpt2-large",
+    gguf_repo="QuantFactory/gpt2-large-GGUF",
+    gguf_filename="gpt2-large.Q4_K_M.gguf",
+)
+
+STABLELM_CONFIG = GGUFTestConfig(
+    original_model="stabilityai/stablelm-3b-4e1t",
+    gguf_repo="afrideva/stablelm-3b-4e1t-GGUF",
+    gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf",
+)
+
+STARCODER_CONFIG = GGUFTestConfig(
+    original_model="bigcode/starcoder2-3b",
+    gguf_repo="QuantFactory/starcoder2-3b-GGUF",
+    gguf_filename="starcoder2-3b.Q6_K.gguf",
+)
+
+DOLPHIN_CONFIG = GGUFTestConfig(
+    # Test VocabParallelEmbedding sharding issue.
+    original_model="cognitivecomputations/TinyDolphin-2.8-1.1b",
+    gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF",
+    gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf",
+)
+
+GEMMA3_CONFIG = GGUFTestConfig(
+    original_model="google/gemma-3-270m-it",
+    gguf_repo="ggml-org/gemma-3-270m-it-qat-GGUF",
+    gguf_filename="gemma-3-270m-it-qat-Q4_0.gguf",
+)
+
+MODELS = [
+    # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
+    QWEN2_CONFIG,
+    QWEN3_CONFIG,
+    PHI3_CONFIG,
+    GPT2_CONFIG,
+    STABLELM_CONFIG,
+    DOLPHIN_CONFIG,
+    GEMMA3_CONFIG,
+    # STARCODER_CONFIG, # broken
+]
+
+
+def check_model_outputs(
+    vllm_runner: type[VllmRunner],
+    prompts: list[str],
+    model: GGUFTestConfig,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tp_size: int,
+):
+    tokenizer = AutoTokenizer.from_pretrained(model.original_model)
+    if tokenizer.chat_template is not None:
+        messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
+        prompts = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+    # Run gguf model.
+    with vllm_runner(
+        model_name=model.gguf_model,
+        enforce_eager=True,
+        tokenizer_name=model.original_model,
+        dtype=dtype,
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=tp_size,
+    ) as gguf_model:
+        gguf_outputs = gguf_model.generate_greedy_logprobs(
+            prompts[:-1], max_tokens, num_logprobs
+        )
+
+    # Run unquantized model.
+    # Should run with tp=1, otherwise the test will stuck at
+    # nccl initialization.
+    with vllm_runner(
+        model_name=model.original_model,
+        enforce_eager=True,  # faster tests
+        dtype=dtype,
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=1,
+    ) as original_model:
+        original_outputs = original_model.generate_greedy_logprobs(
+            prompts[:-1], max_tokens, num_logprobs
+        )
+
+    check_logprobs_close(
+        outputs_0_lst=original_outputs,
+        outputs_1_lst=gguf_outputs,
+        name_0="original",
+        name_1="gguf",
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("gguf"),
+    reason="gguf is not supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "model",
+    [pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
+)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tp_size", [1])
+def test_models(
+    vllm_runner: type[VllmRunner],
+    example_prompts: list[str],
+    model: GGUFTestConfig,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tp_size: int,
+) -> None:
+    check_model_outputs(
+        vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
+    )
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("gguf"),
+    reason="gguf is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model", [LLAMA_CONFIG])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tp_size", [2])
+@multi_gpu_test(num_gpus=2)
+def test_distributed(
+    vllm_runner: type[VllmRunner],
+    example_prompts: list[str],
+    model: GGUFTestConfig,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tp_size: int,
+) -> None:
+    check_model_outputs(
+        vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
+    )
--- a/tests/models/quantization/test_gpt_oss_attn_quantization.py
+++ b/tests/models/quantization/test_gpt_oss_attn_quantization.py
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Test attention quantization of gpt-oss model.
+The qkv_proj and o_proj in self_attention can be either quantized or excluded.
+
+Run `pytest tests/models/quantization/test_gpt_oss_attn_quantization.py`.
+
+"""
+
+import importlib
+import importlib.metadata
+from dataclasses import dataclass
+
+import huggingface_hub
+import lm_eval
+import pytest
+from packaging import version
+
+MODEL_NAMES = ["amd/gpt-oss-20b-customized-attention-quantization"]
+
+QUARK_MXFP4_AVAILABLE = importlib.util.find_spec("quark") is not None and version.parse(
+    importlib.metadata.version("amd-quark")
+) >= version.parse("0.8.99")
+
+
+def has_huggingface_access(repo):
+    try:
+        huggingface_hub.list_repo_refs(repo)
+        return True
+    except huggingface_hub.errors.RepositoryNotFoundError:
+        return False
+
+
+HF_HUB_AMD_ORG_ACCESS = all(
+    [has_huggingface_access(model_name) for model_name in MODEL_NAMES]
+)
+
+
+@dataclass
+class ModelCase:
+    model_id: str
+    tp: int
+
+
+@dataclass
+class EvaluationConfig:
+    model_name: str
+
+    def get_model_args(self) -> str:
+        return (
+            f"pretrained={self.model_name},"
+            "tensor_parallel_size=4,dtype=auto,gpu_memory_utilization=0.9,trust_remote_code=False"
+        )
+
+
+EXPECTED_ACCURACIES = {"arc_challenge": 0.20}
+
+
+@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
+@pytest.mark.skipif(
+    not HF_HUB_AMD_ORG_ACCESS,
+    reason="Read access to huggingface.co/amd is required for this test.",
+)
+@pytest.mark.parametrize("model_name", MODEL_NAMES)
+@pytest.mark.parametrize("task_name, expected_accuracy", EXPECTED_ACCURACIES.items())
+def test_gpt_oss_attention_quantization(
+    model_name: str, task_name: str, expected_accuracy: float
+):
+    measured_accuracy = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=EvaluationConfig(model_name).get_model_args(),
+        tasks=task_name,
+        batch_size="auto",
+    )["results"][task_name]["acc,none"]
+
+    rtol = 0.05
+    assert (
+        measured_accuracy - rtol < expected_accuracy
+        and measured_accuracy + rtol > expected_accuracy
+    ), f"Expected: {expected_accuracy} |  Measured: {measured_accuracy}"
--- a/tests/models/quantization/test_gptq_bitblas.py
+++ b/tests/models/quantization/test_gptq_bitblas.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Compare the outputs of a GPTQ model to a bitblas model.
+
+Note: GPTQ and bitblas do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+bitblas/GPTQ models are in the top 3 selections of each other.
+
+Note: bitblas internally uses locks to synchronize the threads. This can
+result in very slight nondeterminism for bitblas. As a result, we re-run the
+test up to 3 times to see if we pass.
+"""
+
+from dataclasses import dataclass
+
+import pytest
+
+from ..utils import check_logprobs_close
+
+
+@dataclass
+class ModelPair:
+    model_gptq: str
+
+
+model_pairs = [
+    ModelPair(model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
+]
+
+
+@pytest.mark.flaky(reruns=2)
+@pytest.mark.skipif(True, reason="BitBLAS takes too much time for tuning.")
+@pytest.mark.parametrize("model_pair", model_pairs)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model_pair: ModelPair,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="bitblas"
+    ) as bitblas_model:
+        bitblas_outputs = bitblas_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="gptq"
+    ) as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    check_logprobs_close(
+        outputs_0_lst=gptq_outputs,
+        outputs_1_lst=bitblas_outputs,
+        name_0="gptq",
+        name_1="gptq_bitblas",
+    )
--- a/tests/models/quantization/test_gptq_marlin.py
+++ b/tests/models/quantization/test_gptq_marlin.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Compares the outputs of gptq vs gptq_marlin.
+
+Note: GPTQ and Marlin do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+Marlin/GPTQ models are in the top 5 selections of each other.
+Note: Marlin internally uses locks to synchronize the threads. This can
+result in very slight nondeterminism for Marlin. As a result, we re-run the test
+up to 3 times to see if we pass.
+"""
+
+import os
+
+import pytest
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
+from vllm.platforms import current_platform
+
+from ..utils import check_logprobs_close
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024
+
+MODELS = [
+    # act_order==True, group_size=128
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
+    # 8-bit, act_order==True, group_size=channelwise
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
+    # 4-bit, act_order==True, group_size=128
+    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main"),
+]
+
+
+@pytest.mark.flaky(reruns=3)
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin")
+    or current_platform.is_rocm()
+    or not current_platform.is_cuda(),
+    reason="gptq_marlin is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    model_name, revision = model
+
+    # Run marlin.
+    with vllm_runner(
+        model_name=model_name,
+        revision=revision,
+        dtype=dtype,
+        quantization="marlin",
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=1,
+    ) as gptq_marlin_model:
+        gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
+            example_prompts[:-1], max_tokens, num_logprobs
+        )
+    _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error
+
+    # Run gptq.
+    # The naive gptq kernel doesn't support bf16 yet.
+    # Here we always compare fp16/bf16 gpt marlin kernel
+    # to fp16 gptq kernel.
+    with vllm_runner(
+        model_name=model_name,
+        revision=revision,
+        dtype="half",
+        quantization="gptq",
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=1,
+    ) as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts[:-1], max_tokens, num_logprobs
+        )
+
+    check_logprobs_close(
+        outputs_0_lst=gptq_outputs,
+        outputs_1_lst=gptq_marlin_outputs,
+        name_0="gptq",
+        name_1="gptq_marlin",
+    )
--- a/tests/models/quantization/test_gptq_marlin_24.py
+++ b/tests/models/quantization/test_gptq_marlin_24.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Compare the outputs of a GPTQ model to a Marlin_24 model.
+
+Note: GPTQ and Marlin_24 do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+Marlin/GPTQ models are in the top 3 selections of each other.
+"""
+
+from dataclasses import dataclass
+
+import pytest
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm.platforms import current_platform
+
+from ..utils import check_logprobs_close
+
+
+@dataclass
+class ModelPair:
+    model_marlin: str
+    model_gptq: str
+
+
+model_pairs = [
+    # 4-bit, group_size == 128
+    ModelPair(
+        model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
+        model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128",
+    ),
+    # # 4-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+    # 8-bit, group_size == 128
+    ModelPair(
+        model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
+        model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128",
+    ),
+    # # 8-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+]
+
+
+@pytest.mark.flaky(reruns=2)
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin_24")
+    or current_platform.is_rocm()
+    or not current_platform.is_cuda(),
+    reason="Marlin24 is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_pair", model_pairs)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model_pair: ModelPair,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with vllm_runner(
+        model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
+    ) as marlin_24_model:
+        marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="gptq"
+    ) as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    check_logprobs_close(
+        outputs_0_lst=gptq_outputs,
+        outputs_1_lst=marlin_24_outputs,
+        name_0="gptq",
+        name_1="marlin_24",
+    )
--- a/tests/models/quantization/test_modelopt.py
+++ b/tests/models/quantization/test_modelopt.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# flake8: noqa
+"""Tests Model Optimizer fp8 models against ground truth generation
+Note: these tests will only pass on H100
+"""
+
+import os
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024
+
+MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
+
+EXPECTED_STRS_MAP = {
+    "nvidia/Llama-3.1-8B-Instruct-FP8": [
+        "You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
+        "Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
+        "The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and",
+        'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
+        "**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir",
+        "The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to",
+        "The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
+        "Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる",
+    ]
+}
+
+
+# This test compares against golden strings for exact match since
+# there is no baseline implementation to compare against
+# and is unstable w.r.t specifics of the fp8 implementation or
+# the hardware being run on.
+# Disabled to prevent it from breaking the build
+@pytest.mark.skip(
+    reason="Prevent unstable test based on golden strings from breaking the build."
+)
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="fp8 is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(example_prompts, model_name) -> None:
+    llm = LLM(
+        model=model_name,
+        max_model_len=MAX_MODEL_LEN,
+        trust_remote_code=True,
+        enforce_eager=True,
+        quantization="modelopt",
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    formatted_prompts = [
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        for prompt in example_prompts
+    ]
+    params = SamplingParams(max_tokens=20, temperature=0)
+    generations: list[str] = []
+    # Note: these need to be run 1 at a time due to numerical precision,
+    # since the expected strs were generated this way.
+    for prompt in formatted_prompts:
+        outputs = llm.generate(prompt, params)
+        generations.append(outputs[0].outputs[0].text)
+    del llm
+
+    print(model_name, generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name]
+    for i in range(len(example_prompts)):
+        generated_str = generations[i]
+        expected_str = expected_strs[i]
+        assert expected_str == generated_str, (
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
+        )
--- a/tests/models/quantization/test_mxfp4.py
+++ b/tests/models/quantization/test_mxfp4.py
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# flake8: noqa
+"""Tests Quark mxfp4 models against ground truth generation"""
+
+import pytest
+
+from vllm import LLM, SamplingParams
+
+MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]
+
+EXPECTED_STRS_MAP = {
+    "amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
+        "\n### Key Features\n\n* **High-throughput Inference**: vLL",
+        "\nArtificial intelligence (AI) has evolved significantly since its inception in the 1",
+        "Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been",
+        "A neural network is a machine learning model inspired by the structure of the human brain. It consists of",
+        "\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol",
+        "\nThe COVID-19 pandemic has had a profound impact on global economic structures and business",
+        "The Mona Lisa painting, created by Leonardo da Vinci in the early 16th",
+        " everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
+    ]
+}
+
+
+@pytest.mark.skip(reason="Model to be released in the future")
+@pytest.mark.quant_model
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(example_prompts, model_name) -> None:
+    sampling_params = SamplingParams(max_tokens=20, temperature=0)
+    llm = LLM(
+        model=model_name,
+        kv_cache_dtype="fp8",
+        quantization="quark",
+    )
+    outputs = llm.generate(example_prompts, sampling_params)
+    for i, output in enumerate(outputs):
+        output_str = output.outputs[0].text
+        expected_str = EXPECTED_STRS_MAP[model_name][i]
+        assert expected_str == output_str, (
+            f"Expected: {expected_str!r}\nvLLM: {output_str!r}"
+        )
--- a/tests/models/quantization/test_nvfp4.py
+++ b/tests/models/quantization/test_nvfp4.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# flake8: noqa
+"""Tests Model Optimizer nvfp4 models against ground truth generation
+Note: these tests will only pass on B200
+"""
+
+import os
+from typing import List
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024
+
+MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]
+
+EXPECTED_STRS_MAP = {
+    "nvidia/Llama-3.3-70B-Instruct-FP4": [
+        "vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference",
+        "Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
+        "Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process",
+        "A neural network is a type of machine learning model inspired by the structure and function of the human brain",
+        "In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push",
+        "The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading",
+        "The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
+        "Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts",
+    ]
+}
+
+
+# This test compares against golden strings for exact match since
+# there is no baseline implementation to compare against
+# and is unstable w.r.t specifics of the fp4 implementation or
+# the hardware being run on.
+# Disabled to prevent it from breaking the build
+@pytest.mark.skip(
+    reason="Prevent unstable test based on golden strings from breaking the build "
+    " and test input model being too large and hanging the system."
+)
+@pytest.mark.skipif(
+    not is_quant_method_supported("modelopt_fp4"),
+    reason="modelopt_fp4 is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(example_prompts, model_name) -> None:
+    llm = LLM(
+        model=model_name,
+        max_model_len=MAX_MODEL_LEN,
+        trust_remote_code=True,
+        enforce_eager=True,
+        quantization="modelopt_fp4",
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    formatted_prompts = [
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        for prompt in example_prompts
+    ]
+    params = SamplingParams(max_tokens=20, temperature=0)
+    generations: List[str] = []
+    # Note: these need to be run 1 at a time due to numerical precision,
+    # since the expected strs were generated this way.
+    for prompt in formatted_prompts:
+        outputs = llm.generate(prompt, params)
+        generations.append(outputs[0].outputs[0].text)
+    del llm
+
+    print(model_name, generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name]
+    for i in range(len(example_prompts)):
+        generated_str = generations[i]
+        expected_str = expected_strs[i]
+        assert expected_str == generated_str, (
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
+        )