[Minor, Performance] Use torch.argmax for greedy sampling (#1589)
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
import json
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
import requests
|
||||
|
||||
from sglang.srt.utils import kill_child_process
|
||||
from sglang.test.run_eval import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
@@ -39,6 +42,32 @@ class TestPyTorchSamplingBackend(unittest.TestCase):
|
||||
metrics = run_eval(args)
|
||||
assert metrics["score"] >= 0.65
|
||||
|
||||
def test_greedy(self):
|
||||
response_single = requests.post(
|
||||
self.base_url + "/generate",
|
||||
json={
|
||||
"text": "The capital of France is",
|
||||
"sampling_params": {
|
||||
"temperature": 0,
|
||||
"max_new_tokens": 32,
|
||||
},
|
||||
},
|
||||
).json()
|
||||
response_batch = requests.post(
|
||||
self.base_url + "/generate",
|
||||
json={
|
||||
"text": ["The capital of France is"] * 10,
|
||||
"sampling_params": {
|
||||
"temperature": 0,
|
||||
"max_new_tokens": 32,
|
||||
},
|
||||
},
|
||||
).json()
|
||||
text = response_single["text"]
|
||||
print(text)
|
||||
for i in range(10):
|
||||
assert response_batch[i]["text"] == text
|
||||
|
||||
|
||||
# Allow running this test file directly (e.g. `python test_file.py`)
# in addition to discovery via a test runner.
if __name__ == "__main__":
    unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user