# sglang/examples/runtime/async_io_api.py
"""
Usage:
2024-09-28 14:43:35 -07:00
python3 async_io.py
"""
import asyncio
from sglang import Runtime
async def generate(
engine,
prompt,
sampling_params,
):
tokenizer = engine.get_tokenizer()
messages = [
2024-07-18 04:55:39 +10:00
{
"role": "system",
"content": "You will be given question answer tasks.",
},
2024-01-21 15:17:30 -08:00
{"role": "user", "content": prompt},
]
prompt = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
stream = engine.add_request(prompt, sampling_params)
async for output in stream:
print(output, end="", flush=True)
print()
if __name__ == "__main__":
runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
print("--- runtime ready ---\n")
2024-01-21 15:17:30 -08:00
prompt = "Who is Alan Turing?"
sampling_params = {"max_new_tokens": 128}
asyncio.run(generate(runtime, prompt, sampling_params))
2024-07-18 04:55:39 +10:00
2024-01-21 15:17:30 -08:00
runtime.shutdown()