diff --git a/docs/source/tutorials/Qwen3_embedding.md b/docs/source/tutorials/Qwen3_embedding.md
index 475dae70..b1bba732 100644
--- a/docs/source/tutorials/Qwen3_embedding.md
+++ b/docs/source/tutorials/Qwen3_embedding.md
@@ -40,7 +40,7 @@ export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
 ### Online Inference
 
 ```bash
-vllm serve Qwen/Qwen3-Embedding-8B --task embed
+vllm serve Qwen/Qwen3-Embedding-8B --runner pooling
 ```
 
 Once your server is started, you can query the model with input prompts.
@@ -81,7 +81,7 @@ if __name__=="__main__":
     input_texts = queries + documents
 
     model = LLM(model="Qwen/Qwen3-Embedding-8B",
-                task="embed",
+                runner="pooling",
                 distributed_executor_backend="mp")
 
     outputs = model.embed(input_texts)
diff --git a/examples/offline_embed.py b/examples/offline_embed.py
index 7707e5fb..cf609070 100644
--- a/examples/offline_embed.py
+++ b/examples/offline_embed.py
@@ -44,7 +44,7 @@ def main():
     ]
     input_texts = queries + documents
 
-    model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
+    model = LLM(model="Qwen/Qwen3-Embedding-0.6B", runner="pooling")
     outputs = model.embed(input_texts)
 
     embeddings = torch.tensor([o.outputs.embedding for o in outputs])
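
For anyone verifying the rename locally, here is a minimal sketch of querying the served embedding model, assuming vLLM's OpenAI-compatible `/v1/embeddings` endpoint on the default port 8000 (the port and the sample input are assumptions, not part of this change):

```bash
# Smoke test for the server started with `vllm serve ... --runner pooling`.
# Assumes the default listen port 8000; adjust the URL if you serve elsewhere.
curl http://localhost:8000/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen/Qwen3-Embedding-8B", "input": ["What is the capital of China?"]}'
# A successful response returns JSON with the vector in data[0].embedding.
```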