diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index dbc24f2cc..29def290b 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -13,7 +13,6 @@ "\n", "- `chat/completions`\n", "- `completions`\n", - "- `batches`\n", "\n", "Check out other tutorials to learn about [vision APIs](https://docs.sglang.ai/backend/openai_api_vision.html) for vision-language models and [embedding APIs](https://docs.sglang.ai/backend/openai_api_embeddings.html) for embedding models." ] @@ -278,290 +277,6 @@ "For OpenAI compatible structured outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Batches\n", - "\n", - "Batches API for chat completions and completions are also supported. You can upload your requests in `jsonl` files, create a batch job, and retrieve the results when the batch job is completed (which takes longer but costs less).\n", - "\n", - "The batches APIs are:\n", - "\n", - "- `batches`\n", - "- `batches/{batch_id}/cancel`\n", - "- `batches/{batch_id}`\n", - "\n", - "Here is an example of a batch job for chat completions, completions are similar.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import time\n", - "from openai import OpenAI\n", - "\n", - "client = OpenAI(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n", - "\n", - "requests = [\n", - " {\n", - " \"custom_id\": \"request-1\",\n", - " \"method\": \"POST\",\n", - " \"url\": \"/chat/completions\",\n", - " \"body\": {\n", - " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n", - " \"messages\": [\n", - " {\"role\": \"user\", \"content\": \"Tell me a joke about programming\"}\n", - " ],\n", - " \"max_tokens\": 50,\n", - " },\n", - " },\n", - " {\n", - " \"custom_id\": \"request-2\",\n", - " \"method\": \"POST\",\n", - " \"url\": \"/chat/completions\",\n", - " \"body\": {\n", - " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n", - " \"messages\": [{\"role\": \"user\", \"content\": \"What is Python?\"}],\n", - " \"max_tokens\": 50,\n", - " },\n", - " },\n", - "]\n", - "\n", - "input_file_path = \"batch_requests.jsonl\"\n", - "\n", - "with open(input_file_path, \"w\") as f:\n", - " for req in requests:\n", - " f.write(json.dumps(req) + \"\\n\")\n", - "\n", - "with open(input_file_path, \"rb\") as f:\n", - " file_response = client.files.create(file=f, purpose=\"batch\")\n", - "\n", - "batch_response = client.batches.create(\n", - " input_file_id=file_response.id,\n", - " endpoint=\"/v1/chat/completions\",\n", - " completion_window=\"24h\",\n", - ")\n", - "\n", - "print_highlight(f\"Batch job created with ID: {batch_response.id}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n", - " time.sleep(3)\n", - " print(f\"Batch job status: {batch_response.status}...trying again in 3 seconds...\")\n", - " batch_response = client.batches.retrieve(batch_response.id)\n", - "\n", - "if batch_response.status == \"completed\":\n", - " print(\"Batch job completed successfully!\")\n", - " print(f\"Request counts: {batch_response.request_counts}\")\n", - "\n", - " result_file_id = batch_response.output_file_id\n", - " file_response = client.files.content(result_file_id)\n", - " result_content = file_response.read().decode(\"utf-8\")\n", - "\n", - " results = [\n", - " json.loads(line) for line in result_content.split(\"\\n\") if line.strip() != \"\"\n", - " ]\n", - "\n", - " for result in results:\n", - " print_highlight(f\"Request {result['custom_id']}:\")\n", - " print_highlight(f\"Response: {result['response']}\")\n", - "\n", - " print_highlight(\"Cleaning up files...\")\n", - " # Only delete the result file ID since file_response is just content\n", - " client.files.delete(result_file_id)\n", - "else:\n", - " print_highlight(f\"Batch job failed with status: {batch_response.status}\")\n", - " if hasattr(batch_response, \"errors\"):\n", - " print_highlight(f\"Errors: {batch_response.errors}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It takes a while to complete the batch job. You can use these two APIs to retrieve the batch job status or cancel the batch job.\n", - "\n", - "1. `batches/{batch_id}`: Retrieve the batch job status.\n", - "2. `batches/{batch_id}/cancel`: Cancel the batch job.\n", - "\n", - "Here is an example to check the batch job status." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import time\n", - "from openai import OpenAI\n", - "\n", - "client = OpenAI(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n", - "\n", - "requests = []\n", - "for i in range(20):\n", - " requests.append(\n", - " {\n", - " \"custom_id\": f\"request-{i}\",\n", - " \"method\": \"POST\",\n", - " \"url\": \"/chat/completions\",\n", - " \"body\": {\n", - " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n", - " \"messages\": [\n", - " {\n", - " \"role\": \"system\",\n", - " \"content\": f\"{i}: You are a helpful AI assistant\",\n", - " },\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": \"Write a detailed story about topic. Make it very long.\",\n", - " },\n", - " ],\n", - " \"max_tokens\": 64,\n", - " },\n", - " }\n", - " )\n", - "\n", - "input_file_path = \"batch_requests.jsonl\"\n", - "with open(input_file_path, \"w\") as f:\n", - " for req in requests:\n", - " f.write(json.dumps(req) + \"\\n\")\n", - "\n", - "with open(input_file_path, \"rb\") as f:\n", - " uploaded_file = client.files.create(file=f, purpose=\"batch\")\n", - "\n", - "batch_job = client.batches.create(\n", - " input_file_id=uploaded_file.id,\n", - " endpoint=\"/v1/chat/completions\",\n", - " completion_window=\"24h\",\n", - ")\n", - "\n", - "print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n", - "print_highlight(f\"Initial status: {batch_job.status}\")\n", - "\n", - "time.sleep(10)\n", - "\n", - "max_checks = 5\n", - "for i in range(max_checks):\n", - " batch_details = client.batches.retrieve(batch_id=batch_job.id)\n", - "\n", - " print_highlight(\n", - " f\"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}\"\n", - " )\n", - " print_highlight(\n", - " f\"Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}\"\n", - " )\n", - "\n", - " time.sleep(3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is an example to cancel a batch job." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import time\n", - "from openai import OpenAI\n", - "import os\n", - "\n", - "client = OpenAI(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n", - "\n", - "requests = []\n", - "for i in range(5000):\n", - " requests.append(\n", - " {\n", - " \"custom_id\": f\"request-{i}\",\n", - " \"method\": \"POST\",\n", - " \"url\": \"/chat/completions\",\n", - " \"body\": {\n", - " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n", - " \"messages\": [\n", - " {\n", - " \"role\": \"system\",\n", - " \"content\": f\"{i}: You are a helpful AI assistant\",\n", - " },\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": \"Write a detailed story about topic. Make it very long.\",\n", - " },\n", - " ],\n", - " \"max_tokens\": 128,\n", - " },\n", - " }\n", - " )\n", - "\n", - "input_file_path = \"batch_requests.jsonl\"\n", - "with open(input_file_path, \"w\") as f:\n", - " for req in requests:\n", - " f.write(json.dumps(req) + \"\\n\")\n", - "\n", - "with open(input_file_path, \"rb\") as f:\n", - " uploaded_file = client.files.create(file=f, purpose=\"batch\")\n", - "\n", - "batch_job = client.batches.create(\n", - " input_file_id=uploaded_file.id,\n", - " endpoint=\"/v1/chat/completions\",\n", - " completion_window=\"24h\",\n", - ")\n", - "\n", - "print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n", - "print_highlight(f\"Initial status: {batch_job.status}\")\n", - "\n", - "time.sleep(10)\n", - "\n", - "try:\n", - " cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n", - " print_highlight(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n", - " assert cancelled_job.status == \"cancelling\"\n", - "\n", - " # Monitor the cancellation process\n", - " while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n", - " time.sleep(3)\n", - " cancelled_job = client.batches.retrieve(batch_job.id)\n", - " print_highlight(f\"Current status: {cancelled_job.status}\")\n", - "\n", - " # Verify final status\n", - " assert cancelled_job.status == \"cancelled\"\n", - " print_highlight(\"Batch job successfully cancelled\")\n", - "\n", - "except Exception as e:\n", - " print_highlight(f\"Error during cancellation: {e}\")\n", - " raise e\n", - "\n", - "finally:\n", - " try:\n", - " del_response = client.files.delete(uploaded_file.id)\n", - " if del_response.deleted:\n", - " print_highlight(\"Successfully cleaned up input file\")\n", - " if os.path.exists(input_file_path):\n", - " os.remove(input_file_path)\n", - " print_highlight(\"Successfully deleted local batch_requests.jsonl file\")\n", - " except Exception as e:\n", - " print_highlight(f\"Error cleaning up: {e}\")\n", - " raise e" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/examples/runtime/openai_batch_chat.py b/examples/runtime/openai_batch_chat.py deleted file mode 100644 index d251ca0ff..000000000 --- a/examples/runtime/openai_batch_chat.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Usage: - -python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 -python openai_batch_chat.py - -Note: Before running this script, -you should create the input.jsonl file with the following content: -{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world! List 3 NBA players and tell a story"}],"max_tokens": 300}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an assistant. "},{"role": "user", "content": "Hello world! List three capital and tell a story"}],"max_tokens": 500}} -""" - -import json -import time - -import openai - - -class OpenAIBatchProcessor: - def __init__(self): - client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY") - - self.client = client - - def process_batch(self, input_file_path, endpoint, completion_window): - - # Upload the input file - with open(input_file_path, "rb") as file: - uploaded_file = self.client.files.create(file=file, purpose="batch") - - # Create the batch job - batch_job = self.client.batches.create( - input_file_id=uploaded_file.id, - endpoint=endpoint, - completion_window=completion_window, - ) - - # Monitor the batch job status - while batch_job.status not in ["completed", "failed", "cancelled"]: - time.sleep(3) # Wait for 3 seconds before checking the status again - print( - f"Batch job status: {batch_job.status}...trying again in 3 seconds..." - ) - batch_job = self.client.batches.retrieve(batch_job.id) - - # Check the batch job status and errors - if batch_job.status == "failed": - print(f"Batch job failed with status: {batch_job.status}") - print(f"Batch job errors: {batch_job.errors}") - return None - - # If the batch job is completed, process the results - if batch_job.status == "completed": - - # print result of batch job - print("batch", batch_job.request_counts) - - result_file_id = batch_job.output_file_id - # Retrieve the file content from the server - file_response = self.client.files.content(result_file_id) - result_content = file_response.read() # Read the content of the file - - # Save the content to a local file - result_file_name = "batch_job_chat_results.jsonl" - with open(result_file_name, "wb") as file: - file.write(result_content) # Write the binary content to the file - # Load data from the saved JSONL file - results = [] - with open(result_file_name, "r", encoding="utf-8") as file: - for line in file: - json_object = json.loads( - line.strip() - ) # Parse each line as a JSON object - results.append(json_object) - - return results - else: - print(f"Batch job failed with status: {batch_job.status}") - return None - - -# Initialize the OpenAIBatchProcessor -processor = OpenAIBatchProcessor() - -# Process the batch job -input_file_path = "input.jsonl" -endpoint = "/v1/chat/completions" -completion_window = "24h" - -# Process the batch job -results = processor.process_batch(input_file_path, endpoint, completion_window) - -# Print the results -print(results) diff --git a/examples/runtime/openai_batch_complete.py b/examples/runtime/openai_batch_complete.py deleted file mode 100644 index 2f5be3d30..000000000 --- a/examples/runtime/openai_batch_complete.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -Usage: -python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 -python openai_batch_complete.py -Note: Before running this script, -you should create the input.jsonl file with the following content: -{"custom_id": "request-1", "method": "POST", "url": "/v1/completions", "body": {"model": "gpt-3.5-turbo-instruct", "prompt": "List 3 names of famous soccer player: ", "max_tokens": 200}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/completions", "body": {"model": "gpt-3.5-turbo-instruct", "prompt": "List 6 names of famous basketball player: ", "max_tokens": 400}} -{"custom_id": "request-3", "method": "POST", "url": "/v1/completions", "body": {"model": "gpt-3.5-turbo-instruct", "prompt": "List 6 names of famous basketball player: ", "max_tokens": 400}} -""" - -import json -import time - -import openai - - -class OpenAIBatchProcessor: - def __init__(self): - client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY") - - self.client = client - - def process_batch(self, input_file_path, endpoint, completion_window): - - # Upload the input file - with open(input_file_path, "rb") as file: - uploaded_file = self.client.files.create(file=file, purpose="batch") - - # Create the batch job - batch_job = self.client.batches.create( - input_file_id=uploaded_file.id, - endpoint=endpoint, - completion_window=completion_window, - ) - - # Monitor the batch job status - while batch_job.status not in ["completed", "failed", "cancelled"]: - time.sleep(3) # Wait for 3 seconds before checking the status again - print( - f"Batch job status: {batch_job.status}...trying again in 3 seconds..." - ) - batch_job = self.client.batches.retrieve(batch_job.id) - - # Check the batch job status and errors - if batch_job.status == "failed": - print(f"Batch job failed with status: {batch_job.status}") - print(f"Batch job errors: {batch_job.errors}") - return None - - # If the batch job is completed, process the results - if batch_job.status == "completed": - - # print result of batch job - print("batch", batch_job.request_counts) - - result_file_id = batch_job.output_file_id - # Retrieve the file content from the server - file_response = self.client.files.content(result_file_id) - result_content = file_response.read() # Read the content of the file - - # Save the content to a local file - result_file_name = "batch_job_complete_results.jsonl" - with open(result_file_name, "wb") as file: - file.write(result_content) # Write the binary content to the file - # Load data from the saved JSONL file - results = [] - with open(result_file_name, "r", encoding="utf-8") as file: - for line in file: - json_object = json.loads( - line.strip() - ) # Parse each line as a JSON object - results.append(json_object) - - return results - else: - print(f"Batch job failed with status: {batch_job.status}") - return None - - -# Initialize the OpenAIBatchProcessor -processor = OpenAIBatchProcessor() - -# Process the batch job -input_file_path = "input.jsonl" -endpoint = "/v1/completions" -completion_window = "24h" - -# Process the batch job -results = processor.process_batch(input_file_path, endpoint, completion_window) - -# Print the results -print(results)