[Doc] Add qwen3 embedding 8b guide (#1734)
1. Add the tutorials for qwen3-embedding-8b
2. Remove VLLM_USE_V1=1 from the docs; it is no longer needed since vLLM 0.9.2
- vLLM version: v0.9.2
- vLLM main:
5923ab9524
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -50,8 +50,6 @@ Run the following script to execute offline inference on a single NPU:
 import os
 from vllm import LLM, SamplingParams
 
-os.environ["VLLM_USE_V1"] = "1"
-
 prompts = [
     "Hello, my name is",
     "The future of AI is",
@@ -77,8 +75,6 @@ for output in outputs:
 import os
 from vllm import LLM, SamplingParams
 
-os.environ["VLLM_USE_V1"] = "1"
-
 prompts = [
     "Hello, my name is",
     "The future of AI is",
@@ -133,7 +129,7 @@ docker run --rm \
 -e VLLM_USE_MODELSCOPE=True \
 -e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
 -it $IMAGE \
-VLLM_USE_V1=1 vllm serve Qwen/Qwen3-8B --max_model_len 26240
+vllm serve Qwen/Qwen3-8B --max_model_len 26240
 ```
 ::::
 
@@ -158,7 +154,7 @@ docker run --rm \
 -e VLLM_USE_MODELSCOPE=True \
 -e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
 -it $IMAGE \
-VLLM_USE_V1=1 vllm serve Qwen/Qwen3-8B --max_model_len 26240 --enforce-eager
+vllm serve Qwen/Qwen3-8B --max_model_len 26240 --enforce-eager
 ```
 ::::
 :::::
 
Reference in New Issue
Block a user