diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index fe15c015a..9cdc2cf84 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -7,7 +7,7 @@ import time import numpy as np -from sglang.api import set_default_backend +from sglang.lang.api import set_default_backend from sglang.test.test_utils import ( add_common_sglang_args_and_parse, dump_bench_raw_result, diff --git a/benchmark/hellaswag/bench_sglang.py b/benchmark/hellaswag/bench_sglang.py index 6345a453b..2adce99b8 100644 --- a/benchmark/hellaswag/bench_sglang.py +++ b/benchmark/hellaswag/bench_sglang.py @@ -5,7 +5,7 @@ import time import numpy as np -from sglang.api import set_default_backend +from sglang.lang.api import set_default_backend from sglang.test.test_utils import ( add_common_sglang_args_and_parse, select_sglang_backend, diff --git a/python/pyproject.toml b/python/pyproject.toml index 21145f261..d6f1ce169 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -47,10 +47,10 @@ runtime_common = [ "sentencepiece", "soundfile==0.13.1", "scipy", - "torchao==0.9.0", - "transformers==4.55.0", "timm==1.0.16", "tiktoken", + "torchao==0.9.0", + "transformers==4.55.0", "uvicorn", "uvloop", "xgrammar==0.1.22", @@ -84,6 +84,9 @@ srt_hip = [ "petit_kernel==0.0.2", ] +# CPU: torch wheel for CPU needs to be installed from https://download.pytorch.org/whl/cpu +srt_cpu = ["sglang[runtime_common]", "einops"] + # xpu is not enabled in public vllm and torch whl, # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm srt_xpu = ["sglang[runtime_common]"] @@ -92,8 +95,6 @@ srt_xpu = ["sglang[runtime_common]"] # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html srt_hpu = ["sglang[runtime_common]"] -# CPU: torch wheel for CPU needs to be installed from https://download.pytorch.org/whl/cpu -srt_cpu = ["sglang[runtime_common]", "einops"] # https://vllm-ascend.readthedocs.io/en/latest/installation.html srt_npu = ["sglang[runtime_common]"] @@ -112,12 +113,12 @@ test = [ "sentence_transformers", "pytest", ] -all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[torch_memory_saver]", "sglang[decord]"] -all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"] -all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"] -all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"] -all_cpu = ["sglang[srt_cpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"] -all_npu = ["sglang[srt_npu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"] +all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[torch_memory_saver]", "sglang[decord]"] +all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"] +all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"] +all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"] +all_cpu = ["sglang[srt_cpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"] +all_npu = ["sglang[srt_npu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"] dev = ["sglang[all]", "sglang[test]"] dev_hip = ["sglang[all_hip]", "sglang[test]"] diff --git a/python/sglang/README.md b/python/sglang/README.md index e07792403..ae0c479b9 100644 --- a/python/sglang/README.md +++ b/python/sglang/README.md @@ -5,7 +5,7 @@ - `srt`: The backend engine for running local models. (SRT = SGLang Runtime). - `test`: The test utilities. - `api.py`: The public APIs. -- `bench_offline_throughput.py`: Benchmark the throughput in the offline mode. +- `bench_offline_throughput.py`: Benchmark the performance in the offline mode. - `bench_one_batch.py`: Benchmark the latency of running a single static batch without a server. - `bench_one_batch_server.py`: Benchmark the latency of running a single batch with a server. - `bench_serving.py`: Benchmark online serving with dynamic requests. diff --git a/python/sglang/__init__.py b/python/sglang/__init__.py index 3f10e0234..6ac7692e6 100644 --- a/python/sglang/__init__.py +++ b/python/sglang/__init__.py @@ -1,7 +1,8 @@ # SGLang public APIs # Frontend Language APIs -from sglang.api import ( +from sglang.global_config import global_config +from sglang.lang.api import ( Engine, Runtime, assistant, @@ -25,13 +26,13 @@ from sglang.api import ( user_end, video, ) -from sglang.global_config import global_config from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.lang.choices import ( greedy_token_selection, token_length_normalized, unconditional_likelihood_normalized, ) +from sglang.srt.entrypoints.engine import Engine from sglang.utils import LazyImport from sglang.version import __version__ diff --git a/python/sglang/api.py b/python/sglang/lang/api.py similarity index 100% rename from python/sglang/api.py rename to python/sglang/lang/api.py diff --git a/python/sglang/lang/backend/__init__.py b/python/sglang/lang/backend/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/sglang/test/few_shot_gsm8k.py b/python/sglang/test/few_shot_gsm8k.py index 5aac87bd2..e9971fa90 100644 --- a/python/sglang/test/few_shot_gsm8k.py +++ b/python/sglang/test/few_shot_gsm8k.py @@ -12,7 +12,7 @@ import time import numpy as np -from sglang.api import set_default_backend +from sglang.lang.api import set_default_backend from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl diff --git a/python/sglang/test/few_shot_gsm8k_engine.py b/python/sglang/test/few_shot_gsm8k_engine.py index 2453a91e4..05b095713 100644 --- a/python/sglang/test/few_shot_gsm8k_engine.py +++ b/python/sglang/test/few_shot_gsm8k_engine.py @@ -8,7 +8,7 @@ import time import numpy as np import sglang as sgl -from sglang.api import set_default_backend +from sglang.lang.api import set_default_backend from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl