[Doc][v0.18.0] Fix documentation formatting and improve code examples (#8701)
### What this PR does / why we need it?

This PR fixes various documentation issues and improves code examples throughout the project.

Signed-off-by: MrZ20 <2609716663@qq.com>
@@ -559,7 +559,7 @@ There are three `vllm bench` subcommands:
Take the `serve` as an example. Run the code as follows.

```shell
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
vllm bench serve --model /path_to_weight/DeepSeek-V3.1_w8a8mix_mtp --dataset-name random --random-input 131072 --num-prompts 20 --request-rate 0 --save-result --result-dir ./
```

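Besides `serve`, the other two `vllm bench` subcommands are `latency` and `throughput`. Below is a hedged sketch of a `throughput` run against the same weights; the flag names are assumptions (taken to mirror the `serve` example and vLLM's benchmark scripts), so verify them with `vllm bench throughput --help` on your version.

```shell
# Hedged sketch only: offline throughput benchmark against the same weights.
# Flag names are assumptions; check `vllm bench throughput --help` before use.
vllm bench throughput --model /path_to_weight/DeepSeek-V3.1_w8a8mix_mtp \
    --dataset-name random --input-len 1024 --output-len 128 --num-prompts 20
```
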
@@ -72,7 +72,7 @@ Run the following script to execute online 128k inference.
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=512

@@ -166,7 +166,7 @@ There are three `vllm bench` subcommands:
Take the `serve` as an example. Run the code as follows.

```shell
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
vllm bench serve --model vllm-ascend/Qwen3-235B-A22B-w8a8 --dataset-name random --random-input 131072 --num-prompts 1 --request-rate 1 --save-result --result-dir ./
```

@@ -96,7 +96,7 @@ Run the following steps to start the vLLM service on NPU for the Qwen3 Dense ser
--served_model_name qwen --dtype float16 \
--additional-config '{"ascend_compilation_config": {"fuse_norm_quant": false}}' \
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1,2,4,8,16,32]}' \
---quantization ascend --max_model_len 16384
+--quantization ascend --max-model-len 16384
# `--load_format` is required only for the W8A8SC quantized weight format.
#
```

@@ -134,7 +134,7 @@ Run the following steps to start the vLLM service on NPU for the Qwen3 Dense ser
--enforce-eager \
--dtype float16 \
--quantization ascend \
---max_model_len 10240
+--max-model-len 10240
```

Argument notes: `--tensor-parallel-size`: `W8A8SC` quantized weights are tightly coupled to the TP size, so you must specify the TP size you plan to use at serving time when running compression. `--model` is the path to the input `w8a8s` weights, and `--output` is the output path for the compressed `w8a8sc` weights.

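For illustration, a hedged sketch of what the compression step described in the argument notes might look like; the entry-point script name is a placeholder (not from this PR), and only `--model`, `--output`, and `--tensor-parallel-size` come from the notes above.

```shell
# Hypothetical invocation: the script name is a placeholder, the three
# arguments are the ones described in the argument notes above.
# --model:  path to the input w8a8s weights
# --output: output path for the compressed w8a8sc weights
# --tensor-parallel-size: must match the TP size you plan to serve with
python compress_w8a8sc.py \
    --model /path/to/Qwen3-w8a8s \
    --output /path/to/Qwen3-w8a8sc \
    --tensor-parallel-size 4
```
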
@@ -159,7 +159,7 @@ Run the following steps to start the vLLM service on NPU for the Qwen3 Dense ser
--additional-config '{"ascend_compilation_config": {"fuse_norm_quant": false}}' \
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1,2,4,8,16,32]}' \
--quantization ascend \
---max_model_len 16384 \
+--max-model-len 16384 \
--no-enable-prefix-caching \
--load_format="sharded_state"
```

@@ -178,7 +178,7 @@ Run the following steps to start the vLLM service on NPU for the Qwen3 Dense ser
--additional-config '{"ascend_compilation_config": {"fuse_norm_quant": false}}' \
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1,2,4,8,16]}' \
--quantization ascend \
---max_model_len 16384 \
+--max-model-len 16384 \
--no-enable-prefix-caching \
--load_format="sharded_state"
```

@@ -199,7 +199,7 @@ Run the following steps to start the vLLM service on NPU for the Qwen3 Dense ser
--additional-config '{"ascend_compilation_config": {"fuse_norm_quant": false}}' \
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [16,32]}' \
--quantization ascend \
---max_model_len 20480 \
+--max-model-len 20480 \
--no-enable-prefix-caching \
--load_format="sharded_state"
```

@@ -302,7 +302,7 @@ There are three `vllm bench` subcommands:
Take the `serve` as an example. Run the code as follows.

```shell
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
vllm bench serve --model path/DeepSeek-R1-W8A8 --dataset-name random --random-input 200 --num-prompts 200 --request-rate 1 --save-result --result-dir ./
```

@@ -943,7 +943,7 @@ There are three `vllm bench` subcommands:
Take the `serve` as an example. Run the code as follows.

```shell
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
vllm bench serve --model /root/.cache/Eco-Tech/DeepSeek-V3.2-w8a8-mtp-QuaRot --dataset-name random --random-input 200 --num-prompts 200 --request-rate 1 --save-result --result-dir ./
```

@@ -93,7 +93,7 @@ vllm serve /root/.cache/DeepSeek-OCR-2 \
--trust-remote-code \
--tensor-parallel-size 1 \
--port 1055 \
---max_model_len 8192 \
+--max-model-len 8192 \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.8 \
--allowed-local-media-path / \

@@ -784,7 +784,7 @@ There are three `vllm bench` subcommands:
Take the `serve` as an example. Run the code as follows.

```shell
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
vllm bench serve --model Eco-Tech/Kimi-K2.5-w4a8 --dataset-name random --random-input 1024 --num-prompts 200 --request-rate 1 --save-result --result-dir ./
```

@@ -72,7 +72,7 @@ Run the following script to start the vLLM server on single 910B4:

```shell
#!/bin/sh
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
export MODEL_PATH="PaddlePaddle/PaddleOCR-VL"
export TASK_QUEUE_ENABLE=1
export CPU_AFFINITY_CONF=1

@@ -97,11 +97,11 @@ Run the following script to start the vLLM server on single Atlas 300 inference

```shell
#!/bin/sh
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
export MODEL_PATH="PaddlePaddle/PaddleOCR-VL"

vllm serve ${MODEL_PATH} \
---max_model_len 16384 \
+--max-model-len 16384 \
--served-model-name PaddleOCR-VL-0.9B \
--trust-remote-code \
--no-enable-prefix-caching \

@@ -112,7 +112,7 @@ vllm serve ${MODEL_PATH} \
```

:::{note}
-The `--max_model_len` option is added to prevent errors when generating the attention operator mask on the Atlas 300 inference products.
+The `--max-model-len` option is added to prevent errors when generating the attention operator mask on the Atlas 300 inference products.
:::

::::

@@ -323,12 +323,12 @@ Run docker container to start the vLLM server on single-NPU:
:substitutions:
vllm serve Qwen/Qwen3-VL-8B-Instruct \
--dtype bfloat16 \
---max_model_len 16384 \
+--max-model-len 16384 \
--max-num-batched-tokens 16384
```

:::{note}
-Add `--max_model_len` option to avoid ValueError that the Qwen3-VL-8B-Instruct model's max seq len (256000) is larger than the maximum number of tokens that can be stored in KV cache. This will differ with different NPU series based on the on-chip memory size. Please modify the value according to a suitable value for your NPU series.
+Add `--max-model-len` option to avoid ValueError that the Qwen3-VL-8B-Instruct model's max seq len (256000) is larger than the maximum number of tokens that can be stored in KV cache. This will differ with different NPU series based on the on-chip memory size. Please modify the value according to a suitable value for your NPU series.
:::

If your service start successfully, you can see the info shown below:

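To confirm the context length the server actually applied after adjusting `--max-model-len`, one hedged check is to query the OpenAI-compatible models endpoint, which reports `max_model_len` in recent vLLM versions; the port below assumes the default 8000.

```shell
# Print the effective max_model_len reported by the running server.
# Port 8000 and the field name are assumptions; adjust for your deployment and vLLM version.
curl -s http://localhost:8000/v1/models | grep -o '"max_model_len":[0-9]*'
```
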
@@ -415,7 +415,7 @@ vllm serve Qwen/Qwen2.5-VL-32B-Instruct \
```

:::{note}
-Add `--max_model_len` option to avoid ValueError that the Qwen2.5-VL-32B-Instruct model's max_model_len (128000) is larger than the maximum number of tokens that can be stored in KV cache. This will differ with different NPU series base on the on-chip memory size. Please modify the value according to a suitable value for your NPU series.
+Add `--max-model-len` option to avoid ValueError that the Qwen2.5-VL-32B-Instruct model's max_model_len (128000) is larger than the maximum number of tokens that can be stored in KV cache. This will differ with different NPU series base on the on-chip memory size. Please modify the value according to a suitable value for your NPU series.
:::

If your service start successfully, you can see the info shown below:

@@ -74,7 +74,7 @@ The environment variable `LOCAL_MEDIA_PATH` which allows API requests to read lo
:::

```bash
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
export MODEL_PATH="Qwen/Qwen2.5-Omni-7B"
export LOCAL_MEDIA_PATH=$HOME/.cache/vllm/assets/vllm_public_assets/

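A hedged sketch of a request that reads an image from `LOCAL_MEDIA_PATH` follows; the file name is a placeholder, the port assumes the default 8000, the model name assumes the default (the model path, since no `--served-model-name` is set here), and the `file://` URL form assumes local media access is enabled as described above. The exact request schema may vary across vLLM versions.

```bash
# Hedged sketch: chat completion that references a local image file.
# The image file name is a placeholder; adjust host, port, and model name
# to match how the server was actually started.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Qwen/Qwen2.5-Omni-7B",
        "messages": [{
          "role": "user",
          "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url",
             "image_url": {"url": "file:///root/.cache/vllm/assets/vllm_public_assets/example.jpg"}}
          ]
        }]
      }'
```
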
@@ -104,7 +104,7 @@ VLLM_TARGET_DEVICE=empty pip install -v ".[audio]"
#### Multiple NPU (Qwen2.5-Omni-7B)

```bash
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
export MODEL_PATH=Qwen/Qwen2.5-Omni-7B
export LOCAL_MEDIA_PATH=$HOME/.cache/vllm/assets/vllm_public_assets/
export DP_SIZE=8

@@ -95,7 +95,7 @@ Run the following script to execute online 128k inference.
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=512

@@ -157,7 +157,7 @@ Node 0
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
# this obtained through ifconfig

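The last context line above notes that the node IP is obtained through `ifconfig`; a small hedged helper follows (the interface name `eth0` is a placeholder, pick the NIC that carries the inter-node traffic).

```shell
# Print the IPv4 address of a given interface (eth0 is a placeholder).
ifconfig eth0 | awk '/inet /{print $2}'
# Alternatively, list all local IPv4 addresses without naming an interface.
hostname -I
```
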
@@ -199,7 +199,7 @@ Node1
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
# this obtained through ifconfig

@@ -309,7 +309,7 @@ There are three `vllm bench` subcommands:
Take the `serve` as an example. Run the code as follows.

```shell
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
vllm bench serve --model vllm-ascend/Qwen3-235B-A22B-w8a8 --dataset-name random --random-input-len 200 --num-prompts 200 --request-rate 1 --save-result --result-dir ./
```

@@ -335,7 +335,7 @@ Example server scripts:
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=512

@@ -408,7 +408,7 @@ export TP_SOCKET_IFNAME=${ifname}
export HCCL_SOCKET_IFNAME=${ifname}

# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=512

@@ -470,7 +470,7 @@ export TP_SOCKET_IFNAME=${ifname}
export HCCL_SOCKET_IFNAME=${ifname}

# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=1024

@@ -534,7 +534,7 @@ export TP_SOCKET_IFNAME=${ifname}
export HCCL_SOCKET_IFNAME=${ifname}

# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=1024

@@ -93,7 +93,7 @@ The converted model files look like:
Run the following script to start the vLLM server with the quantized model:

```bash
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
export MODEL_PATH=vllm-ascend/Qwen3-8B-W4A8
vllm serve ${MODEL_PATH} --served-model-name "qwen3-8b-w4a8" --max-model-len 4096 --quantization ascend
```

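Once the server is up, a hedged smoke test against the OpenAI-compatible API can confirm it responds; the model name matches `--served-model-name` above, while host and port assume the defaults.

```bash
# Minimal completion request; adjust host/port if the server was started differently.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "qwen3-8b-w4a8",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 32
      }'
```
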
@@ -64,7 +64,7 @@ For an Atlas A2 with 64 GB of NPU card memory, tensor-parallel-size should be at

```shell
#!/bin/sh
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True

vllm serve Qwen/Qwen3-Coder-30B-A3B-Instruct --served-model-name qwen3-coder --tensor-parallel-size 4 --enable_expert_parallel
```

@@ -163,7 +163,7 @@ There are three `vllm bench` subcommands:
Take the `serve` as an example. Run the code as follows.

```shell
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
vllm bench serve --model Qwen/Qwen3-Next-80B-A3B-Instruct --dataset-name random --random-input 200 --num-prompts 200 --request-rate 1 --save-result --result-dir ./
```

@@ -94,7 +94,7 @@ Node 0
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
# this obtained through ifconfig

@@ -137,7 +137,7 @@ Node1
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
# this is obtained through ifconfig

@@ -269,7 +269,7 @@ There are three `vllm bench` subcommands:
Take the `serve` as an example. Run the code as follows.

```shell
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
vllm bench serve --model Qwen/Qwen3-VL-235B-A22B-Instruct --dataset-name random --random-input 200 --num-prompts 200 --request-rate 1 --save-result --result-dir ./
```

@@ -94,7 +94,7 @@ Run the following script to execute online 128k inference.
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=512

@@ -190,7 +190,7 @@ There are three `vllm bench` subcommands:
Take the `serve` as an example. Run the code as follows.

```shell
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
vllm bench serve --model Eco-Tech/Qwen3.5-27B-w8a8-mtp --dataset-name random --random-input 200 --num-prompts 200 --request-rate 1 --save-result --result-dir ./
```

@@ -94,7 +94,7 @@ Run the following script to execute online 128k inference On 1 Atlas 800 A3(64G*
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_OP_EXPANSION_MODE="AIV"

@@ -157,7 +157,7 @@ Node 0
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
# this obtained through ifconfig

@@ -203,7 +203,7 @@ Node1
```shell
#!/bin/sh
# Load model from ModelScope to speed up download
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
# To reduce memory fragmentation and avoid out of memory
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
# this obtained through ifconfig

@@ -595,7 +595,7 @@ There are three `vllm bench` subcommands:
Take the `serve` as an example. Run the code as follows.

```shell
-export VLLM_USE_MODELSCOPE=true
+export VLLM_USE_MODELSCOPE=True
vllm bench serve --model Eco-Tech/Qwen3.5-397B-A17B-w8a8-mtp --dataset-name random --random-input 200 --num-prompts 200 --request-rate 1 --save-result --result-dir ./
```
