From 2bc3fcd4204ec9189d2fa8dd27f764769156cfbc Mon Sep 17 00:00:00 2001
From: Keyang Ru
Date: Fri, 17 Oct 2025 10:26:54 -0700
Subject: [PATCH] [doc] update router document (#11767)

---
 docs/advanced_features/router.md | 22 +++++++++++++++-------
 sgl-router/README.md             | 19 +++++++------------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/docs/advanced_features/router.md b/docs/advanced_features/router.md
index 40632fb9c..0736f7ed5 100644
--- a/docs/advanced_features/router.md
+++ b/docs/advanced_features/router.md
@@ -81,7 +81,7 @@ Comprehensive example:
 python3 -m sglang_router.launch_server \
   --host 0.0.0.0 \
   --port 8080 \
-  --model /raid/models/meta-llama/Llama-3.1-8B-Instruct \
+  --model meta-llama/Llama-3.1-8B-Instruct \
   --tp-size 1 \
   --dp-size 8 \
   --grpc-mode \
@@ -91,7 +91,7 @@ python3 -m sglang_router.launch_server \
   --router-health-success-threshold 2 \
   --router-health-check-timeout-secs 6000 \
   --router-health-check-interval-secs 60 \
-  --router-model-path /raid/models/meta-llama/Llama-3.1-8B-Instruct \
+  --router-model-path meta-llama/Llama-3.1-8B-Instruct \
   --router-policy round_robin \
   --router-log-level debug
 ```
@@ -117,7 +117,7 @@ Use SRT gRPC workers to unlock the highest throughput and access native reasonin
 ```bash
 # Workers expose gRPC endpoints
 python -m sglang.launch_server \
-  --model /raid/models/meta-llama/Llama-3.1-8B-Instruct \
+  --model meta-llama/Llama-3.1-8B-Instruct \
   --grpc-mode \
   --port 20000
 
@@ -152,7 +152,6 @@ Proxy OpenAI-compatible endpoints (OpenAI, xAI, etc.) while keeping history and
 python -m sglang_router.launch_router \
   --backend openai \
   --worker-urls https://api.openai.com \
-  --api-key "$OPENAI_API_KEY" \
   --history-backend memory
 ```
 
@@ -171,7 +170,7 @@ curl -X POST http://localhost:30000/workers \
   -d '{"url":"grpc://0.0.0.0:31000","worker_type":"regular"}'
 
 # Inspect registry
-curl http://localhost:30000/workers | jq
+curl http://localhost:30000/workers
 
 # Remove a worker
 curl -X DELETE http://localhost:30000/workers/grpc://0.0.0.0:31000
@@ -278,8 +277,18 @@ PD deployments can specify `--prefill-selector` and `--decode-selector` plus the
 | `oracle` | Oracle Autonomous Database-backed storage (pooled connections). | `--history-backend oracle` |
 
 Oracle configuration (choose DSN *or* TNS alias):
+Install the Oracle Instant Client and set `LD_LIBRARY_PATH` accordingly.
+Choose **one** connection method:
+```bash
+# Option 1: Full connection descriptor
+export ATP_DSN="(description=(address=(protocol=tcps)(port=1522)(host=adb.region.oraclecloud.com))(connect_data=(service_name=service_name)))"
+
+# Option 2: TNS alias (requires wallet)
+export ATP_TNS_ALIAS="sglroutertestatp_high"
+export ATP_WALLET_PATH="/path/to/wallet"
+```
+Provide database credentials and optional pool sizing:
 ```bash
-export ATP_DSN="tcps://host:port/service" # or use ATP_TNS_ALIAS + ATP_WALLET_PATH
 export ATP_USER="admin"
 export ATP_PASSWORD="secret"
 export ATP_POOL_MIN=4
@@ -320,7 +329,6 @@ Use CLI flags to select parsers:
 | `POST` | `/v1/completions` | OpenAI-compatible text completions. |
 | `POST` | `/v1/responses` | Create background responses (agentic loops). |
 | `GET` | `/v1/responses/{id}` | Retrieve stored responses. |
-| `GET` | `/v1/responses/{id}/input` | List captured input items. |
 | `POST` | `/v1/embeddings` | Forward embedding requests. |
 | `POST` | `/v1/rerank` | Ranking endpoint (`/rerank` synonym). |
 | `POST` | `/v1/conversations` | Create conversation metadata. |
diff --git a/sgl-router/README.md b/sgl-router/README.md
index 7880e033d..da0890edf 100644
--- a/sgl-router/README.md
+++ b/sgl-router/README.md
@@ -147,7 +147,7 @@ curl -X POST http://localhost:30000/workers \
   }'
 
 # Inspect registered workers
-curl http://localhost:30000/workers | jq
+curl http://localhost:30000/workers
 ```
 Sample response (http workers):
 ```json
@@ -194,13 +194,11 @@ Route requests to OpenAI or OpenAI-compatible endpoints:
 python3 -m sglang_router.launch_router \
   --backend openai \
-  --worker-urls https://api.openai.com \
-  --api-key "$OPENAI_API_KEY"
+  --worker-urls https://api.openai.com
 
 # Route to custom OpenAI-compatible endpoint (Gemini, xAI, etc.)
 python3 -m sglang_router.launch_router \
   --backend openai \
-  --worker-urls http://my-openai-compatible-service:8000 \
-  --api-key "tenant-api-key"
+  --worker-urls http://my-openai-compatible-service:8000
 ```
 
 **Notes**
@@ -218,7 +216,7 @@ Add flags as needed for production deployments:
 python3 -m sglang_router.launch_server \
   --host 0.0.0.0 \
   --port 8080 \
-  --model /raid/models/meta-llama/Llama-3.1-8B-Instruct \
+  --model meta-llama/Llama-3.1-8B-Instruct \
   --tp-size 1 \
   --dp-size 8 \
   --grpc-mode
@@ -240,7 +238,7 @@ Use upstream SGLang binaries to start dedicated worker processes.
 - **Prefill worker server (gRPC mode)**:
 ```bash
 python3 -m sglang.launch_server \
-  --model /raid/models/meta-llama/Llama-3.1-8B-Instruct \
+  --model meta-llama/Llama-3.1-8B-Instruct \
   --port 20000 \
   --tp-size 1 \
   --grpc-mode
@@ -312,7 +310,7 @@ The HTTP router exposes the full OpenAI-compatible surface area (`/generate`, `/
 ### OpenAI Router
 - Proxies OpenAI-compatible chat completions and responses APIs, preserving headers and SSE streams end-to-end.
 - Supports `/v1/responses` background jobs with cancellation, deletion, and listing input items—enabling agentic, multi-turn orchestration without persisting data at remote vendor endpoints.
-- Conversation APIs (`/v1/conversations` and `/items`) interact with the configured conversation storage backend for compliant chat-history management. Conversation state lives at the router tier, so the same history can drive different models or MCP loops without leaking data to upstream vendors.
+- Conversation APIs (`/v1/conversations` and `/v1/conversations/{id}/items`) interact with the configured conversation storage backend for compliant chat-history management. Conversation state lives at the router tier, so the same history can drive different models or MCP loops without leaking data to upstream vendors.
 - Chat history, agentic multi-turn `/v1/responses`, and the native MCP client (STDIO/HTTP/SSE/Streamable transports) are designed to satisfy enterprise data-privacy requirements by keeping sensitive state within the router.
 
 ### Request Endpoints
@@ -323,10 +321,7 @@ The HTTP router exposes the full OpenAI-compatible surface area (`/generate`, `/
 | `POST /v1/completions` | OpenAI-compatible text completions. |
 | `POST /v1/responses` | Create background responses, returns response IDs. |
 | `GET /v1/responses/{id}` | Retrieve stored responses. |
-| `POST /v1/responses/{id}/cancel` | Cancel in-flight background jobs. |
-| `DELETE /v1/responses/{id}` | Delete stored response. |
-| `GET /v1/responses/{id}/input` | List captured input items. |
-| Conversation endpoints (`/v1/conversations`, `/v1/conversations/{id}`, `/items`) | Manage chat history. |
+| Conversation endpoints (`/v1/conversations`, `/v1/conversations/{id}`, `/v1/conversations/{id}/items`) | Manage chat history. |
 | `POST /v1/embeddings` | Forward embedding requests. |
 | `POST /v1/rerank`, `POST /rerank` | Ranking APIs. |
 
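For reviewers who want to exercise the corrected conversation paths, here is a minimal smoke test; it is a sketch, not part of the patch. The base URL `http://localhost:30000`, the metadata body, and the `id` response field are assumptions (the docs above describe the surface as OpenAI-compatible), and `jq` is required.

```bash
# Sketch: verify the corrected /v1/conversations/{id}/items path.
# Assumes a router already running at http://localhost:30000 and
# OpenAI-Conversations-style request/response shapes; requires jq.
BASE=http://localhost:30000

# Create a conversation and capture its ID.
CONV_ID=$(curl -s -X POST "$BASE/v1/conversations" \
  -H "Content-Type: application/json" \
  -d '{"metadata": {"topic": "router-demo"}}' | jq -r '.id')

# List the items stored for that conversation.
curl -s "$BASE/v1/conversations/$CONV_ID/items"
```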
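Similarly, a sketch of the background-responses round trip that both endpoint tables document (`POST /v1/responses` returning an ID, then `GET /v1/responses/{id}`); the model name, `input` body, and `background` flag follow the OpenAI Responses API shape and are assumptions here.

```bash
# Sketch: create a background response, then retrieve it by ID.
# Request/response shapes assumed to follow the OpenAI Responses API; requires jq.
BASE=http://localhost:30000

# Create a background response job and capture its ID.
RESP_ID=$(curl -s -X POST "$BASE/v1/responses" \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Llama-3.1-8B-Instruct", "input": "Say hello.", "background": true}' \
  | jq -r '.id')

# Retrieve the stored response once the background job completes.
curl -s "$BASE/v1/responses/$RESP_ID"
```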