Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Deploy DeepSeek R1 or V3 with Ray Serve LLM.
+
+Ray Serve LLM is a scalable and production-grade model serving library built
+on the Ray distributed computing framework and first-class support for the vLLM engine.
+
+Key features:
+- Automatic scaling, back-pressure, and load balancing across a Ray cluster.
+- Unified multi-node multi-model deployment.
+- Exposes an OpenAI-compatible HTTP API.
+- Multi-LoRA support with shared base models.
+
+Run `python3 ray_serve_deepseek.py` to launch an endpoint.
+
+Learn more in the official Ray Serve LLM documentation:
+https://docs.ray.io/en/latest/serve/llm/serving-llms.html
+"""
+
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+llm_config = LLMConfig(
+    model_loading_config={
+        "model_id": "deepseek",
+        # Pre-downloading the model to local storage is recommended since
+        # the model is large. Set model_source="/path/to/the/model".
+        "model_source": "deepseek-ai/DeepSeek-R1",
+    },
+    deployment_config={
+        "autoscaling_config": {
+            "min_replicas": 1,
+            "max_replicas": 1,
+        }
+    },
+    # Set to the node's accelerator type.
+    accelerator_type="H100",
+    # Customize engine arguments as required (for example, vLLM engine kwargs).
+    engine_kwargs={
+        "tensor_parallel_size": 8,
+        "pipeline_parallel_size": 2,
+        "gpu_memory_utilization": 0.92,
+        "dtype": "auto",
+        "max_num_seqs": 40,
+        "max_model_len": 16384,
+        "enable_chunked_prefill": True,
+        "enable_prefix_caching": True,
+        "trust_remote_code": True,
+    },
+)
+
+# Deploy the application.
+llm_app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(llm_app)