From 97011abc8a29ce95d90fd0c98695c4b27092d274 Mon Sep 17 00:00:00 2001 From: woodx <124784234+woodx9@users.noreply.github.com> Date: Fri, 20 Jun 2025 12:53:54 +0800 Subject: [PATCH] [Doc] add embedding rerank doc (#7364) --- docs/backend/native_api.ipynb | 60 ++++++++++++++++++++++- docs/supported_models/embedding_models.md | 1 + docs/supported_models/rerank_models.md | 49 ++++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 docs/supported_models/rerank_models.md diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index a54ae6996..1bb1fcf2a 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -16,6 +16,7 @@ "- `/flush_cache`\n", "- `/update_weights`\n", "- `/encode`(embedding model)\n", + "- `/v1/rerank`(cross encoder rerank model)\n", "- `/classify`(reward model)\n", "- `/start_expert_distribution_record`\n", "- `/stop_expert_distribution_record`\n", @@ -307,6 +308,63 @@ "terminate_process(embedding_process)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## v1/rerank (cross encoder rerank model)\n", + "Rerank a list of documents given a query using a cross-encoder model. 
Note that this API is only available for cross encoder models like [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) with `attention-backend` `triton` and `torch_native`.\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reranker_process, port = launch_server_cmd(\n", + " \"\"\"\n", + "python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \\\n", + " --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding\n", + "\"\"\"\n", + ")\n", + "\n", + "wait_for_server(f\"http://localhost:{port}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# compute rerank scores for query and documents\n", + "\n", + "url = f\"http://localhost:{port}/v1/rerank\"\n", + "data = {\n", + " \"model\": \"BAAI/bge-reranker-v2-m3\",\n", + " \"query\": \"what is panda?\",\n", + " \"documents\": [\n", + " \"hi\",\n", + " \"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.\",\n", + " ],\n", + "}\n", + "\n", + "response = requests.post(url, json=data)\n", + "response_json = response.json()\n", + "for item in response_json:\n", + " print_highlight(f\"Score: {item['score']:.2f} - Document: '{item['document']}'\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(reranker_process)" ] }, { "cell_type": "markdown", "metadata": {}, @@ -322,8 +380,6 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(embedding_process)\n", - "\n", "# Note that SGLang now treats embedding models and reward models as the same type of models.\n", "# This will be updated in the future.\n", "\n", diff --git a/docs/supported_models/embedding_models.md b/docs/supported_models/embedding_models.md index 777341a41..c1e095ef2 100644 --- 
a/docs/supported_models/embedding_models.md +++ b/docs/supported_models/embedding_models.md @@ -51,3 +51,4 @@ print("Embeddings:", [x.get("embedding") for x in response.get("data", [])]) | **GTE (QwenEmbeddingModel)** | `Alibaba-NLP/gte-Qwen2-7B-instruct` | N/A | Alibaba’s general text embedding model (7B), achieving state‑of‑the‑art multilingual performance in English and Chinese. | | **GME (MultimodalEmbedModel)** | `Alibaba-NLP/gme-Qwen2-VL-2B-Instruct` | `gme-qwen2-vl` | Multimodal embedding model (2B) based on Qwen2‑VL, encoding image + text into a unified vector space for cross‑modal retrieval. | | **CLIP (CLIPEmbeddingModel)** | `openai/clip-vit-large-patch14-336` | N/A | OpenAI’s CLIP model (ViT‑L/14) for embedding images (and text) into a joint latent space; widely used for image similarity search. | +| **BGE (BgeEmbeddingModel)** | `BAAI/bge-large-en-v1.5` | N/A | Currently only supports `attention-backend` `triton` and `torch_native`. BAAI's BGE embedding models optimized for retrieval and reranking tasks. | diff --git a/docs/supported_models/rerank_models.md b/docs/supported_models/rerank_models.md new file mode 100644 index 000000000..b6f2ffa20 --- /dev/null +++ b/docs/supported_models/rerank_models.md @@ -0,0 +1,49 @@ +# Rerank Models + +SGLang offers comprehensive support for rerank models by incorporating optimized serving frameworks with a flexible programming interface. This setup enables efficient processing of cross-encoder reranking tasks, improving the accuracy and relevance of search result ordering. SGLang’s design ensures high throughput and low latency during reranker model deployment, making it ideal for semantic-based result refinement in large-scale retrieval systems. 
+ +```{important} +Rerank models are executed with `--is-embedding` and some may require `--trust-remote-code` +``` + +## Example Launch Command + +```shell +python3 -m sglang.launch_server \ + --model-path BAAI/bge-reranker-v2-m3 \ + --host 0.0.0.0 \ + --disable-radix-cache \ + --chunked-prefill-size -1 \ + --attention-backend triton \ + --is-embedding \ + --port 30000 +``` + +## Example Client Request + +```python +import requests + +url = "http://127.0.0.1:30000/v1/rerank" + +payload = { + "model": "BAAI/bge-reranker-v2-m3", + "query": "what is panda?", + "documents": [ + "hi", + "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." + ] +} + +response = requests.post(url, json=payload) +response_json = response.json() + +for item in response_json: + print(f"Score: {item['score']:.2f} - Document: '{item['document']}'") +``` + +## Supported rerank models + +| Model Family (Rerank) | Example HuggingFace Identifier | Chat Template | Description | |------------------------------------------------|--------------------------------------|---------------|----------------------------------------------------------------------------------------------------------------------------------| +| **BGE-Reranker (BgeRerankModel)** | `BAAI/bge-reranker-v2-m3` | N/A | Currently only supports `attention-backend` `triton` and `torch_native`. High-performance cross-encoder reranker model from BAAI. Suitable for reranking search results based on semantic relevance. |