Support Alibaba-NLP/gte-Qwen2-7B-instruct embedding Model (#1186)

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
2024-08-26 01:29:12 +08:00
parent 66e7dcaf70
commit 30b4f771b0
15 changed files with 167 additions and 55 deletions
--- a/test/srt/models/test_embedding_models.py
+++ b/test/srt/models/test_embedding_models.py
@@ -20,7 +20,10 @@ import torch
 from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
 from sglang.test.test_utils import get_similarities

-MODELS = [("intfloat/e5-mistral-7b-instruct", 1, 0.2)]
+MODELS = [
+    ("Alibaba-NLP/gte-Qwen2-1.5B-instruct", 1, 1e-5),
+    ("intfloat/e5-mistral-7b-instruct", 1, 1e-5),
+]
 TORCH_DTYPES = [torch.float16]


@@ -32,10 +35,10 @@ class TestEmbeddingModels(unittest.TestCase):
        model_path,
        tp_size,
        torch_dtype,
-        long_context_tolerance,
+        prefill_tolerance,
    ) -> None:
        with HFRunner(
-            model_path, torch_dtype=torch_dtype, is_generation_model=False
+            model_path, torch_dtype=torch_dtype, is_generation=False
        ) as hf_runner:
            hf_outputs = hf_runner.forward(prompts)

@@ -43,11 +46,9 @@ class TestEmbeddingModels(unittest.TestCase):
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
-            is_generation_model=False,
+            is_generation=False,
        ) as srt_runner:
-            srt_outputs = srt_runner.forward(
-                prompts,
-            )
+            srt_outputs = srt_runner.forward(prompts)

        for i in range(len(prompts)):
            hf_logits = torch.Tensor(hf_outputs.embed_logits[i])
@@ -57,18 +58,15 @@ class TestEmbeddingModels(unittest.TestCase):
            print("similarity diff", abs(similarity - 1))

            if len(prompts[i]) <= 1000:
-                tolerance = 1e-5
-            else:
-                tolerance = long_context_tolerance
-            assert torch.all(
-                abs(similarity - 1) < tolerance
-            ), "embeddings are not all close"
+                assert torch.all(
+                    abs(similarity - 1) < prefill_tolerance
+                ), "embeddings are not all close"

    def test_prefill_logits(self):
-        for model, tp_size, long_context_tolerance in MODELS:
+        for model, tp_size, prefill_tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_prefill_logits(
-                    DEFAULT_PROMPTS, model, tp_size, torch_dtype, long_context_tolerance
+                    DEFAULT_PROMPTS, model, tp_size, torch_dtype, prefill_tolerance
                )


--- a/test/srt/models/test_generation_models.py
+++ b/test/srt/models/test_generation_models.py
@@ -20,12 +20,46 @@ import torch
 from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner

 MODELS = [
-    ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1, 1.1),
-    ("google/gemma-2-2b", 1, 3),
+    ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1, 1.1, 3e-2, 1),
+    ("google/gemma-2-2b", 1, 3, 3e-2, 1),
+    ("Alibaba-NLP/gte-Qwen2-1.5B-instruct", 1, None, 6e-2, 1),
 ]
 TORCH_DTYPES = [torch.float16]


+def lcs(X, Y):
+    """Return the length of the longest common subsequence of X and Y."""
+    rows = len(X)
+    cols = len(Y)
+    # table[i][j] is the LCS length of X[:i] and Y[:j]; row 0 and column 0 stay 0.
+    table = [[0] * (cols + 1) for _ in range(rows + 1)]
+    for i, x_item in enumerate(X, start=1):
+        for j, y_item in enumerate(Y, start=1):
+            if x_item == y_item:
+                # Matching elements extend the LCS of both shorter prefixes.
+                table[i][j] = table[i - 1][j - 1] + 1
+            else:
+                table[i][j] = max(table[i - 1][j], table[i][j - 1])
+
+    return table[rows][cols]
+
+
+def calculate_rouge_l(output_strs_list1, output_strs_list2):
+    """Return the character-level ROUGE-L F-measure for each zipped pair of strings."""
+    rouge_l_scores = []
+    for s1, s2 in zip(output_strs_list1, output_strs_list2):
+        if s1 == s2:
+            # Identical outputs (including two empty strings) match perfectly.
+            rouge_l_scores.append(1.0)
+            continue
+        lcs_len = lcs(s1, s2)
+        precision = lcs_len / len(s1) if len(s1) > 0 else 0
+        recall = lcs_len / len(s2) if len(s2) > 0 else 0
+        total = precision + recall
+        rouge_l_scores.append(2 * precision * recall / total if total > 0 else 0.0)
+    return rouge_l_scores
+
+
 class TestGenerationModels(unittest.TestCase):

    def assert_close_prefill_logits_and_output_strs(
@@ -35,10 +69,14 @@ class TestGenerationModels(unittest.TestCase):
        tp_size,
        torch_dtype,
        max_new_tokens,
+        prefill_tolerance,
+        rouge_threshold,
        long_context_tolerance,
    ) -> None:
+        if model_path == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
+            prompts = prompts[:-1]
        with HFRunner(
-            model_path, torch_dtype=torch_dtype, is_generation_model=True
+            model_path, torch_dtype=torch_dtype, is_generation=True
        ) as hf_runner:
            hf_outputs = hf_runner.forward(prompts, max_new_tokens=max_new_tokens)

@@ -46,7 +84,7 @@ class TestGenerationModels(unittest.TestCase):
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
-            is_generation_model=True,
+            is_generation=True,
        ) as srt_runner:
            srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)

@@ -56,17 +94,34 @@ class TestGenerationModels(unittest.TestCase):

            print("max_diff", torch.max(abs(hf_logprobs - srt_logprobs)))
            if hf_logprobs.shape[0] <= 100:
-                tolerance = 3e-2
                assert torch.all(
-                    abs(hf_logprobs - srt_logprobs) < tolerance
+                    abs(hf_logprobs - srt_logprobs) < prefill_tolerance
                ), "prefill logprobs are not all close"

        print(hf_outputs.output_strs)
        print(srt_outputs.output_strs)
-        assert hf_outputs.output_strs == srt_outputs.output_strs
+        rouge_l_scores = calculate_rouge_l(
+            hf_outputs.output_strs, srt_outputs.output_strs
+        )
+        assert all(
+            score >= rouge_threshold for score in rouge_l_scores
+        ), f"Not all ROUGE-L scores are greater than {rouge_threshold}"

    def test_prefill_logits_and_output_strs(self):
-        for model, tp_size, long_context_tolerance in MODELS:
+        import multiprocessing as mp
+
+        try:
+            mp.set_start_method("spawn")
+        except RuntimeError:
+            pass
+
+        for (
+            model,
+            tp_size,
+            long_context_tolerance,
+            prefill_tolerance,
+            rouge_threshold,
+        ) in MODELS:
            for torch_dtype in TORCH_DTYPES:
                max_new_tokens = 8
                self.assert_close_prefill_logits_and_output_strs(
@@ -75,6 +130,8 @@ class TestGenerationModels(unittest.TestCase):
                    tp_size,
                    torch_dtype,
                    max_new_tokens,
+                    prefill_tolerance=prefill_tolerance,
+                    rouge_threshold=rouge_threshold,
                    long_context_tolerance=long_context_tolerance,
                )

--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -5,6 +5,9 @@ from sglang.test.test_utils import run_unittest_files

 suites = {
    "minimal": [
+        "models/test_embedding_models.py",
+        "models/test_generation_models.py",
+        "sampling/penaltylib",
        "test_chunked_prefill.py",
        "test_embedding_openai_server.py",
        "test_eval_accuracy_mini.py",
@@ -13,11 +16,8 @@ suites = {
        "test_skip_tokenizer_init.py",
        "test_torch_compile.py",
        "test_triton_attn_backend.py",
-        "test_vision_openai_server.py",
        "test_update_weights.py",
-        "models/test_generation_models.py",
-        "models/test_embedding_models.py",
-        "sampling/penaltylib",
+        "test_vision_openai_server.py",
    ],
    "sampling/penaltylib": glob.glob(
        "sampling/penaltylib/**/test_*.py", recursive=True