Move sgl.Runtime under sglang/lang (#2990)
This commit is contained in:
@@ -9,7 +9,7 @@ from enum import Enum
|
||||
from pydantic import BaseModel
|
||||
|
||||
import sglang as sgl
|
||||
from sglang.srt.constrained import build_regex_from_object
|
||||
from sglang.srt.constrained.outlines_backend import build_regex_from_object
|
||||
|
||||
character_regex = (
|
||||
r"""\{\n"""
|
||||
|
||||
@@ -3,8 +3,8 @@ import triton_python_backend_utils as pb_utils
|
||||
from pydantic import BaseModel
|
||||
|
||||
import sglang as sgl
|
||||
from sglang import function, set_default_backend
|
||||
from sglang.srt.constrained import build_regex_from_object
|
||||
from sglang import function
|
||||
from sglang.srt.constrained.outlines_backend import build_regex_from_object
|
||||
|
||||
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
|
||||
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
"""
|
||||
Usage:
|
||||
|
||||
python3 async_io.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
from sglang import Runtime
|
||||
|
||||
|
||||
async def generate(
    engine,
    prompt,
    sampling_params,
):
    """Stream a chat-formatted completion for ``prompt`` to stdout.

    The user prompt is placed in a two-message conversation (a fixed system
    instruction followed by the user turn), rendered to a single string via
    the chat template of the tokenizer returned by ``engine.get_tokenizer()``,
    and submitted with ``engine.add_request``. Every item produced by the
    resulting async stream is printed immediately without a trailing newline;
    one newline is printed once the stream is exhausted.

    Args:
        engine: Object exposing ``get_tokenizer()`` and ``add_request()``.
        prompt: Text of the user's question.
        sampling_params: Forwarded unchanged to ``engine.add_request``.
    """
    system_turn = {
        "role": "system",
        "content": "You will be given question answer tasks.",
    }
    user_turn = {"role": "user", "content": prompt}

    # Render the conversation to plain text (not token ids) and append the
    # generation prompt so the reply starts at the assistant turn.
    chat_text = engine.get_tokenizer().apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

    async for chunk in engine.add_request(chat_text, sampling_params):
        print(chunk, end="", flush=True)
    print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: build a runtime for the Llama-2 chat model, stream
    # one answer to stdout, then shut the runtime down.
    runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
    # try/finally guarantees shutdown() is called even when generation raises
    # or the user interrupts the script; previously it ran only on success.
    try:
        print("--- runtime ready ---\n")

        prompt = "Who is Alan Turing?"
        sampling_params = {"max_new_tokens": 128}
        asyncio.run(generate(runtime, prompt, sampling_params))
    finally:
        runtime.shutdown()
|
||||
Reference in New Issue
Block a user