From 2497bbbaf6bdf99e019ac08c960932aabc99a3b1 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Mon, 15 Dec 2025 08:36:19 +0800 Subject: [PATCH] [Misc] Update pooling example (#5002) ### What this PR does / why we need it? Since the param `task` has been deprecated, we should use the latest unified standard parameters for pooling models; this should be clearer - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: wangli --- docs/source/tutorials/Qwen3_embedding.md | 4 ++-- examples/offline_embed.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/tutorials/Qwen3_embedding.md b/docs/source/tutorials/Qwen3_embedding.md index 475dae70..b1bba732 100644 --- a/docs/source/tutorials/Qwen3_embedding.md +++ b/docs/source/tutorials/Qwen3_embedding.md @@ -40,7 +40,7 @@ export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 ### Online Inference ```bash -vllm serve Qwen/Qwen3-Embedding-8B --task embed +vllm serve Qwen/Qwen3-Embedding-8B --runner pooling ``` Once your server is started, you can query the model with input prompts. @@ -81,7 +81,7 @@ if __name__=="__main__": input_texts = queries + documents model = LLM(model="Qwen/Qwen3-Embedding-8B", -        task="embed", +        runner="pooling", distributed_executor_backend="mp") outputs = model.embed(input_texts) diff --git a/examples/offline_embed.py b/examples/offline_embed.py index 7707e5fb..cf609070 100644 --- a/examples/offline_embed.py +++ b/examples/offline_embed.py @@ -44,7 +44,7 @@ def main(): ] input_texts = queries + documents - model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed") + model = LLM(model="Qwen/Qwen3-Embedding-0.6B", runner="pooling") outputs = model.embed(input_texts) embeddings = torch.tensor([o.outputs.embedding for o in outputs])