Add model accuracy test - step 1 (#866)

.github/workflows/unit-test.yml (vendored, 2 changes)

@@ -35,6 +35,7 @@ jobs:
         pip install -e "python[all]"
         pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
         pip install --upgrade transformers
+        pip install accelerate

     - name: Test Frontend Language with SRT Backend
       run: |
@@ -50,6 +51,7 @@ jobs:
       run: |
         cd test/srt
         python3 test_eval_accuracy.py
+        python3 models/test_causal_models.py

     - name: Test Frontend Language with OpenAI Backend
       run: |
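Note: the new `pip install accelerate` line is presumably needed because the HFRunner added below loads models with `low_cpu_mem_usage=True`, which transformers delegates to the accelerate package.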
python/sglang/srt/server.py

@@ -28,7 +28,7 @@ import sys
 import threading
 import time
 from http import HTTPStatus
-from typing import Dict, Optional
+from typing import Dict, List, Optional, Union

 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -481,10 +481,10 @@ class Runtime:
             trust_remote_code=self.server_args.trust_remote_code,
         )

-    async def add_request(
+    async def async_generate(
         self,
         prompt: str,
-        sampling_params: Dict,
+        sampling_params: Optional[Dict] = None,
     ):
         json_data = {
             "text": prompt,
@@ -507,5 +507,26 @@ class Runtime:
                 yield cur
                 pos += len(cur)

+    add_request = async_generate
+
+    def generate(
+        self,
+        prompt: str,
+        sampling_params: Optional[Dict] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+    ):
+        json_data = {
+            "text": prompt,
+            "sampling_params": sampling_params,
+            "return_logprob": return_logprob,
+            "top_logprobs_num": top_logprobs_num,
+        }
+        response = requests.post(
+            self.url + "/generate",
+            json=json_data,
+        )
+        return json.dumps(response.json())
+
     def __del__(self):
         self.shutdown()
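For context, a minimal usage sketch of the two Runtime entry points touched by this hunk. The model path is illustrative and the snippet assumes a machine that can actually load the model; everything else follows the signatures shown in the diff:

    import asyncio
    import json

    from sglang.srt.server import Runtime

    runtime = Runtime(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

    # Synchronous path: generate() posts to /generate and returns a JSON string.
    out = json.loads(
        runtime.generate(
            "The capital of France is",
            sampling_params={"max_new_tokens": 16, "temperature": 0},
            return_logprob=True,
            top_logprobs_num=5,
        )
    )
    print(out["text"])

    # Streaming path: async_generate (formerly add_request) yields text chunks.
    async def stream():
        async for chunk in runtime.async_generate(
            "Today is a sunny day and I like", {"max_new_tokens": 16}
        ):
            print(chunk, end="", flush=True)

    asyncio.run(stream())
    runtime.shutdown()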
python/sglang/test/runners.py (new file, 237 lines)

@@ -0,0 +1,237 @@
"""
|
||||
Copyright 2023-2024 SGLang Team
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
import json
import multiprocessing
from dataclasses import dataclass
from typing import List, Union

import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

from sglang.srt.server import Runtime

DEFAULT_PROMPTS = [
    "The capital of France is",
    "The capital of the United Kingdom is",
    "Today is a sunny day and I like",
]

NUM_TOP_LOGPROBS = 5


# The "_" prefix avoids shadowing by the is_embedding_model keyword
# arguments of HFRunner and SRTRunner below.
def _is_embedding_model(model_path):
    # FIXME: incomplete list
    if "e5-mistral-7b-instruct" in model_path.lower():
        return True
    return False


def get_dtype_str(torch_dtype):
    if torch_dtype is torch.float16:
        return "float16"
    else:
        raise NotImplementedError()


@dataclass
class ModelOutput:
    output_strs: List[str] = None
    top_input_logprobs: List = None
    top_output_logprobs: List = None
    embed_logits: List = None


class HFRunner:
    def __init__(
        self,
        model_path,
        torch_dtype=torch.float16,
        is_embedding_model=None,
    ):
        self.in_queue = multiprocessing.Queue()
        self.out_queue = multiprocessing.Queue()

        self.model_proc = multiprocessing.Process(
            target=self.start_model_process,
            args=(
                self.in_queue,
                self.out_queue,
                model_path,
                torch_dtype,
                is_embedding_model,
            ),
        )
        self.model_proc.start()

    def start_model_process(
        self, in_queue, out_queue, model_path, torch_dtype, is_embedding_model
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        )

        # Fall back to the module-level heuristic when no flag is given.
        self.is_embedding_model = (
            _is_embedding_model(model_path)
            if is_embedding_model is None
            else is_embedding_model
        )
        if not self.is_embedding_model:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch_dtype,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
            ).cuda()
        else:
            from sentence_transformers import SentenceTransformer

            self.model = SentenceTransformer(
                model_path,
                device="cpu",
            ).to(dtype=torch_dtype)

        while True:
            prompts, max_new_tokens = in_queue.get()
            if prompts is not None:
                if not self.is_embedding_model:
                    output_strs = []
                    prefill_logprobs = []
                    for p in prompts:
                        if isinstance(p, str):
                            input_ids = self.tokenizer.encode(
                                p, return_tensors="pt"
                            ).cuda()
                        else:
                            input_ids = torch.tensor([p], device="cuda")

                        output_ids = self.model.generate(
                            input_ids, do_sample=False, max_new_tokens=max_new_tokens
                        )
                        output_strs.append(self.tokenizer.decode(output_ids[0]))

                        logits = self.model.forward(input_ids).logits[0]
                        logprobs = F.log_softmax(
                            logits, dim=-1, dtype=torch.float32
                        ).tolist()
                        # Keep only the top-k logprobs at each prefill position.
                        logprobs = [
                            sorted(token_logprobs, reverse=True)[:NUM_TOP_LOGPROBS]
                            for token_logprobs in logprobs
                        ]
                        prefill_logprobs.append(logprobs)

                    out_queue.put(
                        ModelOutput(
                            output_strs=output_strs, top_input_logprobs=prefill_logprobs
                        )
                    )

                else:
                    # isinstance() rejects subscripted generics such as
                    # List[str], so check the element types explicitly.
                    assert isinstance(prompts, list) and all(
                        isinstance(p, str) for p in prompts
                    )
                    logits = self.model.encode(prompts).tolist()

                    out_queue.put(ModelOutput(embed_logits=logits))

    def forward(
        self,
        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
        max_new_tokens=64,
    ):
        self.in_queue.put((prompts, max_new_tokens))
        return self.out_queue.get()

    def terminate(self):
        self.model_proc.terminate()
        self.in_queue = self.out_queue = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.model_proc.terminate()
        self.in_queue = self.out_queue = None


class SRTRunner:
    def __init__(
        self,
        model_path,
        tp_size=1,
        torch_dtype=torch.float16,
        is_embedding_model=None,
    ):
        self.is_embedding_model = (
            _is_embedding_model(model_path)
            if is_embedding_model is None
            else is_embedding_model
        )
        if self.is_embedding_model:
            raise NotImplementedError()

        self.runtime = Runtime(
            model_path=model_path,
            tp_size=tp_size,
            dtype=get_dtype_str(torch_dtype),
        )

    def forward(
        self,
        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
        max_new_tokens=64,
    ):
        # The return value contains the logprobs from the prefill pass.
        output_strs = []
        top_input_logprobs = []
        sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
        for prompt in prompts:
            response = self.runtime.generate(
                prompt,
                sampling_params=sampling_params,
                return_logprob=True,
                top_logprobs_num=NUM_TOP_LOGPROBS,
            )
            response = json.loads(response)
            output_strs.append(response["text"])
            # Skip the first prefill position (it has no logprob) and append
            # the top logprobs of the first decoded token, so the shape lines
            # up with the HF prefill logprobs.
            top_input_logprobs.append(
                [
                    [tup[0] for tup in x[:NUM_TOP_LOGPROBS]]
                    for x in response["meta_info"]["input_top_logprobs"][1:]
                ]
                + [
                    [
                        tup[0]
                        for tup in response["meta_info"]["output_top_logprobs"][0][
                            :NUM_TOP_LOGPROBS
                        ]
                    ]
                ]
            )

        return ModelOutput(
            output_strs=output_strs, top_input_logprobs=top_input_logprobs
        )

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.runtime.shutdown()
        del self.runtime
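A note on the HFRunner logprob extraction above: sorting each full-vocabulary list in Python is simple but slow for large vocabularies. A sketch of an equivalent torch.topk formulation (torch.topk returns values in descending order, so the result matches sorted(token_logprobs, reverse=True)[:NUM_TOP_LOGPROBS] up to ties; the shapes here are stand-ins):

    import torch
    import torch.nn.functional as F

    NUM_TOP_LOGPROBS = 5
    logits = torch.randn(7, 32000)  # stand-in for [seq_len, vocab] prefill logits
    logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
    # Values come back sorted in descending order along the last dimension.
    top_vals, top_ids = torch.topk(logprobs, NUM_TOP_LOGPROBS, dim=-1)
    print(top_vals.shape)  # torch.Size([7, 5])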
test/srt/models/test_causal_models.py (new file, 67 lines)

@@ -0,0 +1,67 @@
"""
|
||||
Copyright 2023-2024 SGLang Team
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
import unittest

import torch

from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner

MODELS = [
    ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1),
    # ("meta-llama/Meta-Llama-3.1-8B-Instruct", 2),
]
TORCH_DTYPES = [torch.float16]


class TestCausalModels(unittest.TestCase):

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
    ) -> None:
        with HFRunner(
            model_path, torch_dtype=torch_dtype, is_embedding_model=False
        ) as hf_runner:
            hf_outputs = hf_runner.forward(prompts)

        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            is_embedding_model=False,
        ) as srt_runner:
            srt_outputs = srt_runner.forward(prompts)

        for i in range(len(prompts)):
            hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])
            srt_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i])

            tolerance = 2e-2
            assert torch.all(
                abs(hf_logprobs - srt_logprobs) < tolerance
            ), "prefill logprobs are not all close"

    def test_prefill_logits(self):
        for model, tp_size in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_prefill_logits(
                    DEFAULT_PROMPTS, model, tp_size, torch_dtype
                )


if __name__ == "__main__":
    unittest.main(warnings="ignore")
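When the 2e-2 tolerance check in this test fails, the bare assert reports little. A small hypothetical helper like the following (not part of the commit) can surface the worst deviation before the assert fires:

    import torch

    def max_logprob_gap(hf_logprobs: torch.Tensor, srt_logprobs: torch.Tensor) -> float:
        """Return the largest absolute difference between two logprob tensors."""
        return (hf_logprobs - srt_logprobs).abs().max().item()

    # Example with a gap well below the test's 2e-2 tolerance:
    a = torch.tensor([[-0.10, -2.31], [-0.05, -3.02]])
    b = torch.tensor([[-0.11, -2.30], [-0.06, -3.01]])
    print(max_logprob_gap(a, b))  # ~0.01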