example: add async offline inference demo (#3961)

Signed-off-by: joeshikui <joeshikui@tencent.com> Co-authored-by: joeshikui <joeshikui@tencent.com>
2025-03-13 12:41:21 +08:00
parent 6412c5e493
commit 959a3143fc
1 changed files with 65 additions and 0 deletions
--- a/examples/runtime/engine/offline_batch_inference_async.py
+++ b/examples/runtime/engine/offline_batch_inference_async.py
@@ -0,0 +1,65 @@
+"""
+Usage:
+python offline_batch_inference_async.py --model-path Qwen/Qwen2-VL-7B-Instruct
+
+Note:
+This demo shows how to use async generation,
+which is useful for implementing online-like generation with batched inference.
+"""
+
+import argparse
+import asyncio
+import dataclasses
+import time
+
+import sglang as sgl
+from sglang.srt.server_args import ServerArgs
+
+
+class InferenceEngine:
+    """Holds an ``sgl.Engine`` built from ``kwargs``; ``generate`` awaits it."""
+    def __init__(self, **kwargs):
+        self.engine = sgl.Engine(**kwargs)
+
+    async def generate(self, prompt, sampling_params):
+        return await self.engine.async_generate(prompt, sampling_params)
+
+
+async def run_server(server_args):
+    """Run a fixed batch of prompts concurrently and print each completion.
+
+    ``server_args`` must be a dataclass instance: it is flattened with
+    ``dataclasses.asdict`` and its fields are passed to ``InferenceEngine``.
+    """
+    inference = InferenceEngine(**dataclasses.asdict(server_args))
+
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ] * 100
+
+    # Create a sampling params object.
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+
+    # Schedule every request up front so they are all in flight together.
+    tasks = [
+        asyncio.create_task(inference.generate(prompt, sampling_params))
+        for prompt in prompts
+    ]
+
+    # Awaiting a task suspends until it finishes and returns its result, so no
+    # polling loop (or event-loop-blocking time.sleep) is needed afterwards.
+    for task in tasks:
+        result = await task
+        print(f"Generated text: {result['text']}")
+
+
+if __name__ == "__main__":
+    # Let ServerArgs register its options on a fresh argparse parser.
+    cli = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(cli)
+    # Parse argv, build ServerArgs from the namespace, and run the async demo.
+    asyncio.run(run_server(ServerArgs.from_cli_args(cli.parse_args())))