From 2497bbbaf6bdf99e019ac08c960932aabc99a3b1 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Mon, 15 Dec 2025 08:36:19 +0800 Subject: [PATCH] [Misc] Update pooling example (#5002) ### What this PR does / why we need it? Since the param `task` has been deprecated, we should use the latest unified standard parameters for pooling models; this should be clearer - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: wangli --- docs/source/tutorials/Qwen3_embedding.md | 4 ++-- examples/offline_embed.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/tutorials/Qwen3_embedding.md b/docs/source/tutorials/Qwen3_embedding.md index 475dae70..b1bba732 100644 --- a/docs/source/tutorials/Qwen3_embedding.md +++ b/docs/source/tutorials/Qwen3_embedding.md @@ -40,7 +40,7 @@ export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 ### Online Inference ```bash -vllm serve Qwen/Qwen3-Embedding-8B --task embed +vllm serve Qwen/Qwen3-Embedding-8B --runner pooling ``` Once your server is started, you can query the model with input prompts. @@ -81,7 +81,7 @@ if __name__=="__main__": input_texts = queries + documents model = LLM(model="Qwen/Qwen3-Embedding-8B", -        task="embed", +        runner="pooling", distributed_executor_backend="mp") outputs = model.embed(input_texts) diff --git a/examples/offline_embed.py b/examples/offline_embed.py index 7707e5fb..cf609070 100644 --- a/examples/offline_embed.py +++ b/examples/offline_embed.py @@ -44,7 +44,7 @@ def main(): ] input_texts = queries + documents - model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed") + model = LLM(model="Qwen/Qwen3-Embedding-0.6B", runner="pooling") outputs = model.embed(input_texts) embeddings = torch.tensor([o.outputs.embedding for o in outputs])