Better unit tests for adding a new model (#1488)
This commit is contained in:
@@ -21,19 +21,18 @@ from typing import List, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from peft import PeftModel
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
from sglang.srt.server import Runtime
|
||||
from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER
|
||||
|
||||
# Default prompt set shared by the test runners in this module.
# The first entry is deliberately long (~800 repetitions) to exercise
# long-context handling; the rest are short completion prompts.
DEFAULT_PROMPTS = [
    # the output of gemma-2-2b from SRT is unstable on the commented prompt
    # "The capital of France is",
    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
    "The capital of the United Kingdom is",
    "Today is a sunny day and I like",
    "AI is a field of computer science focused on",
    # the output of gemma-2-2b from SRT is unstable on the commented prompt
    # "The capital of France is",
]
|
||||
|
||||
# Absolute directory containing this file.
dirpath = os.path.dirname(__file__)
|
||||
@@ -132,6 +131,8 @@ class HFRunner:
|
||||
input_ids = torch.tensor([p], device="cuda")
|
||||
|
||||
if lora_paths is not None and lora_paths[i] is not None:
|
||||
from peft import PeftModel
|
||||
|
||||
self.model = PeftModel.from_pretrained(
|
||||
self.base_model,
|
||||
lora_paths[i],
|
||||
|
||||
@@ -587,3 +587,37 @@ def run_bench_latency(model, other_args):
|
||||
kill_child_process(process.pid)
|
||||
|
||||
return output_throughput
|
||||
|
||||
|
||||
def lcs(X, Y):
    """Return the length of the longest common subsequence of X and Y.

    Classic dynamic-programming LCS, but keeping only two rows of the
    DP table: each cell of row ``i`` reads only row ``i - 1`` and the
    cells to its left, so the full (m+1) x (n+1) table is unnecessary.
    Memory drops from O(m*n) to O(n) with identical results.

    Args:
        X: First sequence (here: a string, compared character by character).
        Y: Second sequence.

    Returns:
        int: Length of the longest common subsequence (0 if either is empty).
    """
    m = len(X)
    n = len(Y)
    # prev is DP row i-1; curr is row i being filled.
    prev = [0] * (n + 1)
    for i in range(1, m + 1):
        curr = [0] * (n + 1)
        for j in range(1, n + 1):
            if X[i - 1] == Y[j - 1]:
                curr[j] = prev[j - 1] + 1
            else:
                curr[j] = max(prev[j], curr[j - 1])
        prev = curr
    # If m == 0 the loop never runs and prev is still all zeros.
    return prev[n]
|
||||
|
||||
|
||||
def calculate_rouge_l(output_strs_list1, output_strs_list2):
    """Compute the ROUGE-L F-measure for each positionally-matched string pair.

    Pairs are taken via ``zip``, so any extra entries in the longer list are
    ignored. For each pair, precision and recall are derived from the LCS
    length; empty strings yield 0 to avoid division by zero.

    Args:
        output_strs_list1: Candidate strings.
        output_strs_list2: Reference strings.

    Returns:
        list[float]: One F-measure in [0, 1] per pair.
    """
    scores = []
    for candidate, reference in zip(output_strs_list1, output_strs_list2):
        match_len = lcs(candidate, reference)
        precision = match_len / len(candidate) if candidate else 0
        recall = match_len / len(reference) if reference else 0
        denom = precision + recall
        # Harmonic mean of precision and recall; 0.0 when both are zero.
        fmeasure = (2 * precision * recall) / denom if denom > 0 else 0.0
        scores.append(fmeasure)
    return scores
|
||||
|
||||
Reference in New Issue
Block a user