Improve benchmark scripts & rename some scripts (#477)

2024-05-26 12:51:45 -07:00
parent 2b605ab1d7
commit 55c1643627
10 changed files with 161 additions and 62 deletions
--- a/benchmark/latency_throughput/test_latency.py
+++ b/benchmark/latency_throughput/test_latency.py
@@ -18,20 +18,22 @@ if __name__ == "__main__":
            args.port = 21000
        elif args.backend == "lightllm":
            args.port = 22000
+        elif args.backend == "xinfer":
+            args.port = 9988
        else:
            raise ValueError(f"Invalid backend: {args.backend}")

    url = f"{args.host}:{args.port}"
    a = random.randint(0, 1 << 20)
    max_new_tokens = 256
+    prompt = f"{a}, "  # "<int>, " as before; f"{a, }" would format the tuple (a,)

    tic = time.time()
    if args.backend == "srt":
        response = requests.post(
            url + "/generate",
            json={
-                "text": f"The capital of France is",
-                # "input_ids": [[2] * 256] * 196,
+                "text": prompt,
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": max_new_tokens,
@@ -42,7 +44,7 @@ if __name__ == "__main__":
        response = requests.post(
            url + "/generate",
            json={
-                "inputs": f"{a}, ",
+                "inputs": prompt,
                "parameters": {
                    "temperature": 0,
                    "max_new_tokens": max_new_tokens,
@@ -53,14 +55,36 @@ if __name__ == "__main__":
        response = requests.post(
            url + "/generate",
            json={
-                "prompt": f"{a}, ",
+                "prompt": prompt,
                "temperature": 0,
                "max_tokens": max_new_tokens,
            },
        )
+    elif args.backend == "xinfer":
+        import grpc
+        from xlm.proto import sampler_pb2, sampler_pb2_grpc
+
+        sampler_channel = grpc.insecure_channel(url.replace("http://", ""))
+        sampler = sampler_pb2_grpc.SamplerStub(sampler_channel)
+
+        tic = time.time()
+        sample_request = sampler_pb2.SampleTextRequest(
+            prompt=prompt,
+            settings=sampler_pb2.SampleSettings(
+                max_len=max_new_tokens,
+                rng_seed=0,
+                temperature=0,
+                nucleus_p=1,
+            ),
+        )
+        stream = sampler.SampleText(sample_request)
+        response = "".join([x.text for x in stream])
    latency = time.time() - tic

-    ret = response.json()
+    if isinstance(response, str):
+        ret = response
+    else:
+        ret = response.json()
    print(ret)

    speed = max_new_tokens / latency