init

2026-01-09 13:34:11 +08:00
parent dfa6476b58
commit b2ef04d792
538 changed files with 105693 additions and 2 deletions
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -0,0 +1,90 @@
+# flake8: noqa
+"""Tests fp8 models against ground truth generation
+Note: these tests will only pass on an L4 GPU.
+"""
+import os
+
+import pytest
+import torch
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024  # passed as max_model_len to LLM() in test_models
+
+MODELS = [  # model names that test_models is parametrized over
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8",
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+]
+
+EXPECTED_STRS_MAP = {  # model name -> expected text, indexed like the prompts
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8": [
+        'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
+        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+        'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
+        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+        'Here are the translations:\n\n**Japanese:** (Haya tori, nemuri nemuri)\n\n**'
+    ],
+    "meta-llama/Meta-Llama-3-8B-Instruct": [
+        'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+        'In the year 2154, the robotics lab at NeuroSpark Industries was on the cusp of',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+        'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
+    ],
+}
+
+# Without CUDA, use capability (0, 0) so the test skips instead of erroring.
+_cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
+capability = _cc[0] * 10 + _cc[1]
+fp8_not_supported = capability < QUANTIZATION_METHODS["fp8"].get_min_capability()
+
+
+@pytest.mark.skipif(fp8_not_supported,
+                    reason="fp8 is not supported on this GPU type.")
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(example_prompts, model_name) -> None:
+    """Compare temperature-0 fp8 generations of ``model_name`` to pinned text.
+
+    Each prompt is chat-templated, then generated one at a time (see below).
+    """
+    model = LLM(model=model_name,
+                max_model_len=MAX_MODEL_LEN,
+                enforce_eager=True,
+                quantization="fp8")
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    formatted_prompts = [
+        tokenizer.apply_chat_template([{"role": "user", "content": prompt}],
+                                      tokenize=False,
+                                      add_generation_prompt=True)
+        for prompt in example_prompts
+    ]
+
+    params = SamplingParams(max_tokens=20, temperature=0)
+    generations = []
+    # Note: these need to be run 1 at a time due to numerical precision,
+    # since the expected strs were generated this way.
+    for prompt in formatted_prompts:
+        outputs = model.generate(prompt, params)
+        generations.append(outputs[0].outputs[0].text)
+    del model
+
+    print(generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name]
+    # Too few expected strs should fail clearly rather than with IndexError.
+    assert len(expected_strs) >= len(generations), (
+        f"{len(expected_strs)} expected strs for {len(generations)} prompts")
+    for i, (expected_str, generated_str) in enumerate(
+            zip(expected_strs, generations)):
+        assert expected_str == generated_str, (
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")