Cleanup readme, llava examples, usage examples and nccl init (#1194)

2024-08-24 08:02:23 -07:00
parent c9064e6fd9
commit f6af3a6561
65 changed files with 174 additions and 317 deletions
--- a/examples/runtime/async_io_api.py
+++ b/examples/runtime/async_io_api.py
@@ -0,0 +1,45 @@
+"""
+Usage:
+python3 async_io.py
+"""
+
+import asyncio
+
+from sglang import Runtime
+
+
+async def generate(
+    engine,
+    prompt,
+    sampling_params,
+):
+    tokenizer = engine.get_tokenizer()
+
+    messages = [
+        {
+            "role": "system",
+            "content": "You will be given question answer tasks.",
+        },
+        {"role": "user", "content": prompt},
+    ]
+
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    stream = engine.add_request(prompt, sampling_params)
+
+    async for output in stream:
+        print(output, end="", flush=True)
+    print()
+
+
+if __name__ == "__main__":
+    runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
+    print("--- runtime ready ---\n")
+
+    prompt = "Who is Alan Turing?"
+    sampling_params = {"max_new_tokens": 128}
+    asyncio.run(generate(runtime, prompt, sampling_params))
+
+    runtime.shutdown()