Move sgl.Runtime under sglang/lang (#2990)

2025-01-19 17:10:29 -08:00
parent e403d23757
commit 61f42b5732
17 changed files with 267 additions and 329 deletions
--- a/examples/frontend_language/usage/json_decode.py
+++ b/examples/frontend_language/usage/json_decode.py
@@ -9,7 +9,7 @@ from enum import Enum
 from pydantic import BaseModel

 import sglang as sgl
-from sglang.srt.constrained import build_regex_from_object
+from sglang.srt.constrained.outlines_backend import build_regex_from_object

 character_regex = (
    r"""\{\n"""
--- a/examples/frontend_language/usage/triton/models/character_generation/1/model.py
+++ b/examples/frontend_language/usage/triton/models/character_generation/1/model.py
@@ -3,8 +3,8 @@ import triton_python_backend_utils as pb_utils
 from pydantic import BaseModel

 import sglang as sgl
-from sglang import function, set_default_backend
-from sglang.srt.constrained import build_regex_from_object
+from sglang import function
+from sglang.srt.constrained.outlines_backend import build_regex_from_object

 sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

--- a/examples/runtime/async_io_api.py
+++ b/examples/runtime/async_io_api.py
@@ -1,46 +0,0 @@
-"""
-Usage:
-
-python3 async_io.py
-"""
-
-import asyncio
-
-from sglang import Runtime
-
-
-async def generate(
-    engine,
-    prompt,
-    sampling_params,
-):
-    tokenizer = engine.get_tokenizer()
-
-    messages = [
-        {
-            "role": "system",
-            "content": "You will be given question answer tasks.",
-        },
-        {"role": "user", "content": prompt},
-    ]
-
-    prompt = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    stream = engine.add_request(prompt, sampling_params)
-
-    async for output in stream:
-        print(output, end="", flush=True)
-    print()
-
-
-if __name__ == "__main__":
-    runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
-    print("--- runtime ready ---\n")
-
-    prompt = "Who is Alan Turing?"
-    sampling_params = {"max_new_tokens": 128}
-    asyncio.run(generate(runtime, prompt, sampling_params))
-
-    runtime.shutdown()