sglang/test/srt/models/test_generation_models.py

"""
Usage:

To test a specific model:
1. Add it to ALL_OTHER_MODELS
2. Run `ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels.test_others`
"""

"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import dataclasses
import multiprocessing as mp
import os
import unittest
from typing import List

import torch

from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
from sglang.test.test_utils import calculate_rouge_l, is_in_ci


@dataclasses.dataclass
class ModelCase:
    """Configuration of one model to compare between HFRunner and SRTRunner.

    Only ``model_path`` is required; every other field has a default that a
    specific model entry may override.
    """

    # Model identifier passed as the first argument to both HFRunner and SRTRunner.
    model_path: str
    # Forwarded to SRTRunner as ``tp_size``.
    tp_size: int = 1
    # Upper bound (exclusive) on |hf - srt| for the top input (prefill) logprobs.
    prefill_tolerance: float = 5e-2
    # Upper bound (exclusive) on |hf - srt| for the top output (decode) logprobs.
    decode_tolerance: float = 5e-2
    # Minimum ROUGE-L score required between the HF and SRT output strings.
    rouge_l_tolerance: float = 1
    # When True, prompts of 1000 or more characters are not sent to this model.
    skip_long_prompt: bool = False


# Popular models; these are the ones exercised by the CI test.
CI_MODELS = [
    ModelCase(model_path="meta-llama/Llama-3.1-8B-Instruct"),
    ModelCase(model_path="google/gemma-2-2b"),
]

# Every other supported model; these are not run on the CI.
ALL_OTHER_MODELS = [
    ModelCase(model_path="Qwen/Qwen2-1.5B"),
    ModelCase(model_path="Qwen/Qwen2.5-14B-Instruct"),
    ModelCase(
        model_path="HuggingFaceTB/SmolLM-135M-Instruct",
        skip_long_prompt=True,
    ),
    ModelCase(
        model_path="allenai/OLMo-1B-0724-hf",
        decode_tolerance=8e-2,
        skip_long_prompt=True,
    ),
    ModelCase(model_path="THUDM/glm-4-9b-chat"),
    ModelCase(model_path="openai-community/gpt2"),
]

# Data types each CI model is evaluated with.
TORCH_DTYPES = [torch.float16]


class TestGenerationModels(unittest.TestCase):
    """Check that SRTRunner generation agrees with HFRunner for each model case.

    For every model case the same prompts are run through ``HFRunner`` and
    ``SRTRunner``. The top input (prefill) logprobs, the top output (decode)
    logprobs and the generated strings are then compared using the tolerances
    stored on the ``ModelCase``.
    """

    # Prompts with at least this many characters are dropped for model cases
    # that set ``skip_long_prompt``.
    _LONG_PROMPT_MIN_CHARS = 1000
    # Logprob tolerances are only enforced for prompts whose input logprob
    # tensor has at most this many rows; longer prompts are only reported.
    # NOTE(review): presumably long prompts drift numerically too much to be
    # held to the same tolerance -- confirm.
    _MAX_INPUT_LEN_FOR_LOGPROB_CHECK = 100

    @classmethod
    def setUpClass(cls):
        """Force the ``spawn`` multiprocessing start method for this test class."""
        mp.set_start_method("spawn", force=True)

    def _select_prompts(self, model_case: ModelCase) -> List[str]:
        """Return the prompts to run for ``model_case``.

        All of ``DEFAULT_PROMPTS`` are used unless the case sets
        ``skip_long_prompt``, in which case long prompts are filtered out.
        """
        if not model_case.skip_long_prompt:
            return DEFAULT_PROMPTS
        return [p for p in DEFAULT_PROMPTS if len(p) < self._LONG_PROMPT_MIN_CHARS]

    def _assert_logprobs_close(
        self,
        stage: str,
        hf_logprobs: torch.Tensor,
        srt_logprobs: torch.Tensor,
        tolerance: float,
        enforce: bool,
        model_path: str,
        prompts: List[str],
    ) -> None:
        """Print the max absolute logprob difference and optionally assert on it.

        Args:
            stage: Label used in the output and failure message
                (``"prefill"`` or ``"decode"``).
            hf_logprobs: Reference logprobs from HFRunner.
            srt_logprobs: Logprobs from SRTRunner.
            tolerance: Exclusive upper bound on the element-wise absolute difference.
            enforce: When False the difference is only printed, never asserted.
            model_path: Model identifier, included in the failure message.
            prompts: Prompts of the current run, included in the failure message.

        Raises:
            AssertionError: If ``enforce`` is set and any element differs by
                ``tolerance`` or more.
        """
        # Computed once and reused for both the report and the assertion.
        max_diff = torch.max(torch.abs(hf_logprobs - srt_logprobs))
        print(f"{stage} logprobs max_diff", max_diff)
        if not enforce:
            return

        # "max < tolerance" is equivalent to "all elements < tolerance"; a NaN
        # propagates through torch.max and fails the comparison as before.
        self.assertLess(
            max_diff.item(),
            tolerance,
            f"{stage} logprobs are not all close with model_path={model_path} prompts={prompts} "
            f"{stage}_tolerance={tolerance}. "
            f"{hf_logprobs=}, {srt_logprobs=}",
        )

    def assert_close_logits_and_output_strs(
        self,
        prompts: List[str],
        model_case: ModelCase,
        torch_dtype: torch.dtype,
    ) -> None:
        """Run ``prompts`` through HF and SRT and assert that the results agree.

        Args:
            prompts: Prompts sent to both runners.
            model_case: Model path, tensor-parallel size and tolerances to use.
            torch_dtype: Data type both runners are created with.

        Raises:
            AssertionError: If prefill/decode logprobs differ by at least the
                configured tolerance (short prompts only), or if any ROUGE-L
                score between the output strings is below ``rouge_l_tolerance``.
        """
        model_path = model_case.model_path
        max_new_tokens = 32

        # The HF runner is closed before the SRT runner is created.
        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="generation",
        ) as hf_runner:
            hf_outputs = hf_runner.forward(prompts, max_new_tokens=max_new_tokens)

        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=torch_dtype,
            model_type="generation",
        ) as srt_runner:
            srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)

        for i in range(len(prompts)):
            hf_input_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])
            srt_input_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i])
            # The same length gate applies to both the prefill and decode checks.
            input_len = hf_input_logprobs.shape[0]
            enforce = input_len <= self._MAX_INPUT_LEN_FOR_LOGPROB_CHECK

            # Compare input logprobs
            self._assert_logprobs_close(
                "prefill",
                hf_input_logprobs,
                srt_input_logprobs,
                model_case.prefill_tolerance,
                enforce,
                model_path,
                prompts,
            )

            # Compare output logprobs
            self._assert_logprobs_close(
                "decode",
                torch.Tensor(hf_outputs.top_output_logprobs[i]),
                torch.Tensor(srt_outputs.top_output_logprobs[i]),
                model_case.decode_tolerance,
                enforce,
                model_path,
                prompts,
            )

        # Compare output strings
        print(f"{hf_outputs.output_strs=}")
        print(f"{srt_outputs.output_strs=}")
        rouge_l_scores = calculate_rouge_l(
            hf_outputs.output_strs, srt_outputs.output_strs
        )
        print(f"{rouge_l_scores=}")
        rouge_l_tolerance = model_case.rouge_l_tolerance
        self.assertTrue(
            all(score >= rouge_l_tolerance for score in rouge_l_scores),
            "Not all ROUGE-L scores are greater than or equal to "
            f"rouge_l_tolerance={rouge_l_tolerance}: {rouge_l_scores=}",
        )

    def test_ci_models(self):
        """Compare HF and SRT for every CI model and every configured dtype."""
        for model_case in CI_MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_logits_and_output_strs(
                    self._select_prompts(model_case), model_case, torch_dtype
                )

    def test_others(self):
        """Compare HF and SRT for the non-CI models; skipped when running on the CI.

        If the ``ONLY_RUN`` environment variable is set, only the model whose
        ``model_path`` equals its value is tested.
        """
        if is_in_ci():
            # Report a skip instead of a vacuous pass.
            self.skipTest("ALL_OTHER_MODELS are not run on the CI")

        only_run = os.environ.get("ONLY_RUN")
        model_cases = [
            model_case
            for model_case in ALL_OTHER_MODELS
            if only_run is None or model_case.model_path == only_run
        ]
        if only_run is not None and not model_cases:
            # Without this, a typo in ONLY_RUN would pass having tested nothing.
            self.skipTest(
                f"ONLY_RUN={only_run!r} does not match any model in ALL_OTHER_MODELS"
            )

        for model_case in model_cases:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_logits_and_output_strs(
                    self._select_prompts(model_case), model_case, torch_dtype
                )


# Run this module's tests through unittest's command-line runner when the file
# is executed directly.
if __name__ == "__main__":
    unittest.main()
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`"""`
			`Usage:`

			`To test a specific model:`
			`1. Add it to ALL_OTHER_MODELS`
			2. Run `ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels.test_others`
			`"""`

Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`"""`
			`Copyright 2023-2024 SGLang Team`
			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`
Add e5-mistral embedding model - step 3/3 (#988) 2024-08-08 16:31:19 -07:00
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`http://www.apache.org/licenses/LICENSE-2.0`
Add e5-mistral embedding model - step 3/3 (#988) 2024-08-08 16:31:19 -07:00
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License.`
			`"""`

Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`import dataclasses`
[CI] Fix the issue of unit test hanging (#1211) 2024-08-25 16:21:37 -07:00			`import multiprocessing as mp`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`import os`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`import unittest`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`from typing import List`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00
			`import torch`

			`from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner`
[Fix] Fix clean_up_tokenization_spaces in tokenizer (#1510) 2024-09-24 21:37:33 -07:00			`from sglang.test.test_utils import calculate_rouge_l, is_in_ci`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00
Support Alibaba-NLP/gte-Qwen2-7B-instruct embedding Model (#1186) Co-authored-by: Ying Sheng <sqy1415@gmail.com> 2024-08-26 01:29:12 +08:00
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`@dataclasses.dataclass`
			`class ModelCase:`
			`model_path: str`
			`tp_size: int = 1`
			`prefill_tolerance: float = 5e-2`
			`decode_tolerance: float = 5e-2`
			`rouge_l_tolerance: float = 1`
Fix failed ci tests on long prompts; Better error messages for embedding models (#1700) 2024-10-17 09:23:29 -07:00			`skip_long_prompt: bool = False`
Support Alibaba-NLP/gte-Qwen2-7B-instruct embedding Model (#1186) Co-authored-by: Ying Sheng <sqy1415@gmail.com> 2024-08-26 01:29:12 +08:00

Fix the correctness test in bench_latency.py when tp > 1 and test_generation_models.py (#1631) 2024-10-11 05:03:20 -07:00			`# Popular models that run on the CI`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`CI_MODELS = [`
[Fix] Fix all the Huggingface paths (#1553) 2024-10-02 10:12:07 -07:00			`ModelCase("meta-llama/Llama-3.1-8B-Instruct"),`
Fix sliding window attention and gemma-2 unit tests in CI (#1746) 2024-10-21 13:47:12 -07:00			`ModelCase("google/gemma-2-2b"),`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`]`
Support Alibaba-NLP/gte-Qwen2-7B-instruct embedding Model (#1186) Co-authored-by: Ying Sheng <sqy1415@gmail.com> 2024-08-26 01:29:12 +08:00
Fix the correctness test in bench_latency.py when tp > 1 and test_generation_models.py (#1631) 2024-10-11 05:03:20 -07:00			`# All other models that do not run on the CI`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`ALL_OTHER_MODELS = [`
			`ModelCase("Qwen/Qwen2-1.5B"),`
[Fix] Fix AttributeError in Qwen2.5 LoRA: 'Qwen2ForCausalLM' object has no attribute 'get_hidden_dim' (#1536) Co-authored-by: Ying Sheng <sqy1415@gmail.com> 2024-10-03 12:41:15 +09:00			`ModelCase("Qwen/Qwen2.5-14B-Instruct"),`
Fix failed ci tests on long prompts; Better error messages for embedding models (#1700) 2024-10-17 09:23:29 -07:00			`ModelCase("HuggingFaceTB/SmolLM-135M-Instruct", skip_long_prompt=True),`
			`ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2, skip_long_prompt=True),`
Add GLM-4 TextGeneration Model support for SGLang (#1736) 2024-10-21 12:08:30 +08:00			`ModelCase("THUDM/glm-4-9b-chat"),`
Add new model: Gpt2 (#1833) 2024-10-29 19:52:33 -05:00			`ModelCase("openai-community/gpt2")`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`]`
Support Alibaba-NLP/gte-Qwen2-7B-instruct embedding Model (#1186) Co-authored-by: Ying Sheng <sqy1415@gmail.com> 2024-08-26 01:29:12 +08:00
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`TORCH_DTYPES = [torch.float16]`
Support Alibaba-NLP/gte-Qwen2-7B-instruct embedding Model (#1186) Co-authored-by: Ying Sheng <sqy1415@gmail.com> 2024-08-26 01:29:12 +08:00

fix: Fix returned prefill logits and add output str test (#1046) 2024-08-11 23:13:45 -07:00			`class TestGenerationModels(unittest.TestCase):`
Let reward model take text inputs instead of message lists (#1907) Co-authored-by: Kyle Corbitt <kyle@corbt.com> 2024-11-03 13:27:12 -08:00
Fix the correctness test in bench_latency.py when tp > 1 and test_generation_models.py (#1631) 2024-10-11 05:03:20 -07:00			`@classmethod`
			`def setUpClass(cls):`
Let reward model take text inputs instead of message lists (#1907) Co-authored-by: Kyle Corbitt <kyle@corbt.com> 2024-11-03 13:27:12 -08:00			`mp.set_start_method("spawn", force=True)`
Fix the correctness test in bench_latency.py when tp > 1 and test_generation_models.py (#1631) 2024-10-11 05:03:20 -07:00
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`def assert_close_logits_and_output_strs(`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`self,`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`prompts: List[str],`
			`model_case: ModelCase,`
			`torch_dtype: torch.dtype,`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`) -> None:`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`model_path = model_case.model_path`
			`prefill_tolerance, decode_tolerance, rouge_l_tolerance = (`
			`model_case.prefill_tolerance,`
			`model_case.decode_tolerance,`
			`model_case.rouge_l_tolerance,`
			`)`
			`max_new_tokens = 32`
[Feature] Initial support for multi-LoRA serving (#1307) 2024-09-12 16:46:14 -07:00
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`with HFRunner(`
[Feature] Support reward model LxzGordon/URM-LLaMa-3.1-8B (#1525) 2024-09-27 23:32:11 -07:00			`model_path,`
			`torch_dtype=torch_dtype,`
			`model_type="generation",`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`) as hf_runner:`
fix: Fix returned prefill logits and add output str test (#1046) 2024-08-11 23:13:45 -07:00			`hf_outputs = hf_runner.forward(prompts, max_new_tokens=max_new_tokens)`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00
			`with SRTRunner(`
			`model_path,`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`tp_size=model_case.tp_size,`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`torch_dtype=torch_dtype,`
[Feature] Support reward model LxzGordon/URM-LLaMa-3.1-8B (#1525) 2024-09-27 23:32:11 -07:00			`model_type="generation",`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`) as srt_runner:`
fix: Fix returned prefill logits and add output str test (#1046) 2024-08-11 23:13:45 -07:00			`srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00
			`for i in range(len(prompts)):`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`# Compare input logprobs`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])`
			`srt_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i])`
[CI] Return output logprobs in unit test (#1361) 2024-09-09 13:05:13 -07:00			`input_len = hf_logprobs.shape[0]`
			`print(`
			`"prefill logprobs max_diff", torch.max(abs(hf_logprobs - srt_logprobs))`
			`)`
			`if input_len <= 100:`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`assert torch.all(abs(hf_logprobs - srt_logprobs) < prefill_tolerance), (`
			`f"prefill logprobs are not all close with model_path={model_path} prompts={prompts} "`
			`f"prefill_tolerance={prefill_tolerance}."`
			`f"{hf_logprobs=}, {srt_logprobs=}"`
			`)`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`# Compare output logprobs`
[CI] Return output logprobs in unit test (#1361) 2024-09-09 13:05:13 -07:00			`hf_logprobs = torch.Tensor(hf_outputs.top_output_logprobs[i])`
			`srt_logprobs = torch.Tensor(srt_outputs.top_output_logprobs[i])`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00
[CI] Return output logprobs in unit test (#1361) 2024-09-09 13:05:13 -07:00			`print(`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`"decode logprobs max_diff", torch.max(abs(hf_logprobs - srt_logprobs))`
[CI] Return output logprobs in unit test (#1361) 2024-09-09 13:05:13 -07:00			`)`
			`if input_len <= 100:`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`assert torch.all(abs(hf_logprobs - srt_logprobs) < decode_tolerance), (`
			`f"decode logprobs are not all close with model_path={model_path} prompts={prompts} "`
			`f"decode_tolerance={decode_tolerance}."`
			`f"{hf_logprobs=}, {srt_logprobs=}"`
			`)`
[CI] Return output logprobs in unit test (#1361) 2024-09-09 13:05:13 -07:00
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`# Compare output strings`
			`print(f"{hf_outputs.output_strs=}")`
			`print(f"{srt_outputs.output_strs=}")`
Support Alibaba-NLP/gte-Qwen2-7B-instruct embedding Model (#1186) Co-authored-by: Ying Sheng <sqy1415@gmail.com> 2024-08-26 01:29:12 +08:00			`rouge_l_scores = calculate_rouge_l(`
			`hf_outputs.output_strs, srt_outputs.output_strs`
			`)`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`print(f"{rouge_l_scores=}")`
Support Alibaba-NLP/gte-Qwen2-7B-instruct embedding Model (#1186) Co-authored-by: Ying Sheng <sqy1415@gmail.com> 2024-08-26 01:29:12 +08:00			`assert all(`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`score >= rouge_l_tolerance for score in rouge_l_scores`
			`), f"Not all ROUGE-L scores are greater than rouge_l_tolerance={rouge_l_tolerance}"`

			`def test_ci_models(self):`
			`for model_case in CI_MODELS:`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`for torch_dtype in TORCH_DTYPES:`
Fix failed ci tests on long prompts; Better error messages for embedding models (#1700) 2024-10-17 09:23:29 -07:00
			`# Skip long prompts for models that do not have a long context`
			`prompts = DEFAULT_PROMPTS`
			`if model_case.skip_long_prompt:`
			`prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]`

			`# Assert the logits and output strs are close`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`self.assert_close_logits_and_output_strs(`
Fix failed ci tests on long prompts; Better error messages for embedding models (#1700) 2024-10-17 09:23:29 -07:00			`prompts, model_case, torch_dtype`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00			`)`

Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`def test_others(self):`
[Fix] Fix clean_up_tokenization_spaces in tokenizer (#1510) 2024-09-24 21:37:33 -07:00			`if is_in_ci():`
			`return`

Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`for model_case in ALL_OTHER_MODELS:`
Fix the correctness test in bench_latency.py when tp > 1 and test_generation_models.py (#1631) 2024-10-11 05:03:20 -07:00			`# Only run a specified model`
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`if (`
			`"ONLY_RUN" in os.environ`
			`and os.environ["ONLY_RUN"] != model_case.model_path`
			`):`
			`continue`
Fix the correctness test in bench_latency.py when tp > 1 and test_generation_models.py (#1631) 2024-10-11 05:03:20 -07:00
Fix failed ci tests on long prompts; Better error messages for embedding models (#1700) 2024-10-17 09:23:29 -07:00			`# Skip long prompts for models that do not have a long context`
Fix the correctness test in bench_latency.py when tp > 1 and test_generation_models.py (#1631) 2024-10-11 05:03:20 -07:00			`prompts = DEFAULT_PROMPTS`
Fix failed ci tests on long prompts; Better error messages for embedding models (#1700) 2024-10-17 09:23:29 -07:00			`if model_case.skip_long_prompt:`
Fix the correctness test in bench_latency.py when tp > 1 and test_generation_models.py (#1631) 2024-10-11 05:03:20 -07:00			`prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]`

			`# Assert the logits and output strs are close`
			`self.assert_close_logits_and_output_strs(prompts, model_case, torch.float16)`
Add model accuracy test - step 1 (#866) 2024-08-03 18:20:50 -07:00
[CI] Fix the issue of unit test hanging (#1211) 2024-08-25 16:21:37 -07:00
Better unit tests for adding a new model (#1488) 2024-09-22 01:50:37 -07:00			`if __name__ == "__main__":`
[CI] Fix CI (#1217) 2024-08-25 19:56:42 -07:00			`unittest.main()`