From 5c048a9b711e4b94b3766c873179b398e973c867 Mon Sep 17 00:00:00 2001
From: linfeng-yuan <1102311262@qq.com>
Date: Thu, 23 Apr 2026 18:55:44 +0800
Subject: [PATCH] [Doc][releases/v0.18.0] Fix documentation errors and non-standard descriptions (#8626)

### What this PR does / why we need it?
Fix documentation errors and non-standard descriptions in the releases/v0.18.0 branch.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Documentation check.

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
---
 benchmarks/README.md                                  | 11 +++++------
 .../long_sequence_context_parallel_single_node.po     |  8 ++++----
 .../LC_MESSAGES/tutorials/models/Qwen2.5-Omni.po      |  4 ++--
 .../LC_MESSAGES/tutorials/models/Qwen3-235B-A22B.po   |  8 ++++----
 .../tutorials/models/Qwen3-Coder-30B-A3B.po           |  4 ++--
 .../long_sequence_context_parallel_single_node.md     |  4 ++--
 .../pd_disaggregation_mooncake_single_node.md         |  4 ++--
 docs/source/tutorials/models/Qwen2.5-Omni.md          |  2 +-
 docs/source/tutorials/models/Qwen3-235B-A22B.md       | 10 +++++-----
 docs/source/tutorials/models/Qwen3-32B-W4A4.md        |  8 ++++----
 docs/source/tutorials/models/Qwen3-8B-W4A8.md         |  8 ++++----
 docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md   |  2 +-
 docs/source/tutorials/models/Qwen3-VL-Reranker.md     |  2 +-
 docs/source/tutorials/models/Qwen3_reranker.md        |  2 +-
 tests/e2e/multicard/2-cards/test_qwen3_performance.py |  2 +-
 15 files changed, 39 insertions(+), 40 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index ad573fbb..55778130 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -132,12 +132,11 @@ Once the script completes, you can find the results in the benchmarks/results folder
 
 ```shell
 .
-|-- serving_qwen2_5_7B_tp1_qps_1.json
-|-- serving_qwen2_5_7B_tp1_qps_16.json
-|-- serving_qwen2_5_7B_tp1_qps_4.json
-|-- serving_qwen2_5_7B_tp1_qps_inf.json
-|-- latency_qwen2_5_7B_tp1.json
-|-- throughput_qwen2_5_7B_tp1.json
+|-- serving_qwen2_5_7Bvl_tp1_qps_1.json
+|-- serving_qwen2_5_7Bvl_tp1_qps_16.json
+|-- serving_qwen2_5_7Bvl_tp1_qps_4.json
+|-- serving_qwen2_5_7Bvl_tp1_qps_inf.json
+|-- throughput_qwen2_5_7Bvl_tp1.json
 ```
 
 These files contain detailed benchmarking results for further analysis.
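The result files listed in this hunk are plain JSON, so they can be post-processed directly. Below is a minimal sketch, assuming each `serving_*.json` file is a flat JSON object; the metric keys named here (`request_throughput`, `mean_ttft_ms`, `mean_tpot_ms`) are typical of `vllm bench serve` output but should be treated as assumptions and verified against your own files.

```python
import glob
import json

# Metric keys commonly found in `vllm bench serve` result files; treat these
# names as assumptions and check them against your own JSON output.
METRIC_KEYS = ("request_throughput", "mean_ttft_ms", "mean_tpot_ms")

for path in sorted(glob.glob("benchmarks/results/serving_*.json")):
    with open(path) as f:
        result = json.load(f)
    # Keep only the metrics that are actually present in this file.
    metrics = {key: result[key] for key in METRIC_KEYS if key in result}
    print(path, metrics)
```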
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/features/long_sequence_context_parallel_single_node.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/features/long_sequence_context_parallel_single_node.po index 2d9d59b1..7551b451 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/features/long_sequence_context_parallel_single_node.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/features/long_sequence_context_parallel_single_node.po @@ -138,23 +138,23 @@ msgstr "**注意:**" #: ../../source/tutorials/features/long_sequence_context_parallel_single_node.md:108 #, python-brace-format msgid "" -"for vllm version below `v0.12.0` use parameter: `--rope_scaling " +"for vllm version below `v0.12.0` use parameter: `--rope-scaling " "'{\"rope_type\":\"yarn\",\"factor\":4,\"original_max_position_embeddings\":32768}'" " \\`" msgstr "" -"对于 vllm 版本低于 `v0.12.0`,使用参数:`--rope_scaling " +"对于 vllm 版本低于 `v0.12.0`,使用参数:`--rope-scaling " "'{\"rope_type\":\"yarn\",\"factor\":4,\"original_max_position_embeddings\":32768}'" " \\`" #: ../../source/tutorials/features/long_sequence_context_parallel_single_node.md:109 #, python-brace-format msgid "" -"for vllm version `v0.12.0` use parameter: `--hf-overrides " +"for vllm version same as or newer than `v0.12.0` use parameter: `--hf-overrides " "'{\"rope_parameters\": " "{\"rope_type\":\"yarn\",\"rope_theta\":1000000,\"factor\":4,\"original_max_position_embeddings\":32768}}'" " \\`" msgstr "" -"对于 vllm 版本 `v0.12.0`,使用参数:`--hf-overrides '{\"rope_parameters\": " +"对于 vllm 版本 `v0.12.0`及以上,使用参数:`--hf-overrides '{\"rope_parameters\": " "{\"rope_type\":\"yarn\",\"rope_theta\":1000000,\"factor\":4,\"original_max_position_embeddings\":32768}}'" " \\`" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen2.5-Omni.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen2.5-Omni.po index cc394166..0764a06d 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen2.5-Omni.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen2.5-Omni.po @@ -210,8 +210,8 @@ msgstr "详情请参考[使用 AISBench](../../developer_guide/evaluation/using_ #: ../../source/tutorials/models/Qwen2.5-Omni.md:181 msgid "" "After execution, you can get the result, here is the result of `Qwen2.5" -"-Omni-7B` with `vllm-ascend:0.11.0rc0` for reference only." -msgstr "执行后,您可以获得结果,以下是 `Qwen2.5-Omni-7B` 在 `vllm-ascend:0.11.0rc0` 上的结果,仅供参考。" +"-Omni-7B` with `vllm-ascend:v0.11.0rc0` for reference only." +msgstr "执行后,您可以获得结果,以下是 `Qwen2.5-Omni-7B` 在 `vllm-ascend:v0.11.0rc0` 上的结果,仅供参考。" #: ../../source/tutorials/models/Qwen2.5-Omni.md:91 msgid "dataset" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen3-235B-A22B.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen3-235B-A22B.po index 02e0cef9..8e66b54d 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen3-235B-A22B.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen3-235B-A22B.po @@ -218,14 +218,14 @@ msgstr "" #: ../../source/tutorials/models/Qwen3-235B-A22B.md:130 #, python-brace-format msgid "" -"For vllm version below `v0.12.0`, use parameter: `--rope_scaling " +"For vllm version below `v0.12.0`, use parameter: `--rope-scaling " "'{\"rope_type\":\"yarn\",\"factor\":4,\"original_max_position_embeddings\":32768}'" " \\`. If you are using weights like [Qwen3-235B-A22B-" "Instruct-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507)" " which originally supports long contexts, there is no need to add this " "parameter." 
msgstr "" -"对于 `v0.12.0` 以下版本的 vLLM,使用参数:`--rope_scaling " +"对于 `v0.12.0` 以下版本的 vLLM,使用参数:`--rope-scaling " "'{\"rope_type\":\"yarn\",\"factor\":4,\"original_max_position_embeddings\":32768}'" " \\`。如果您使用的是像 [Qwen3-235B-A22B-" "Instruct-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507)" @@ -452,8 +452,8 @@ msgstr "详情请参阅 [使用 AISBench](../../developer_guide/evaluation/using #: ../../source/tutorials/models/Qwen3-235B-A22B.md:285 msgid "" "After execution, you can get the result, here is the result of `Qwen3" -"-235B-A22B-w8a8` in `vllm-ascend:0.11.0rc0` for reference only." -msgstr "执行后,您将获得结果。以下是 `vllm-ascend:0.11.0rc0` 中 `Qwen3-235B-A22B-w8a8` 的结果,仅供参考。" +"-235B-A22B-w8a8` in `vllm-ascend:v0.11.0rc0` for reference only." +msgstr "执行后,您将获得结果。以下是 `vllm-ascend:v0.11.0rc0` 中 `Qwen3-235B-A22B-w8a8` 的结果,仅供参考。" #: ../../source/tutorials/models/Qwen3-235B-A22B.md:76 msgid "dataset" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen3-Coder-30B-A3B.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen3-Coder-30B-A3B.po index becd60d6..92759346 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen3-Coder-30B-A3B.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/models/Qwen3-Coder-30B-A3B.po @@ -155,8 +155,8 @@ msgstr "详情请参考[使用 AISBench](../../developer_guide/evaluation/using_ #: ../../source/tutorials/models/Qwen3-Coder-30B-A3B.md:95 msgid "" "After execution, you can get the result, here is the result of `Qwen3" -"-Coder-30B-A3B-Instruct` in `vllm-ascend:0.11.0rc0` for reference only." -msgstr "执行后,您可以获得结果。以下是 `Qwen3-Coder-30B-A3B-Instruct` 在 `vllm-ascend:0.11.0rc0` 中的结果,仅供参考。" +"-Coder-30B-A3B-Instruct` in `vllm-ascend:v0.11.0rc0` for reference only." +msgstr "执行后,您可以获得结果。以下是 `Qwen3-Coder-30B-A3B-Instruct` 在 `vllm-ascend:v0.11.0rc0` 中的结果,仅供参考。" #: ../../source/tutorials/models/Qwen3-Coder-30B-A3B.md:29 msgid "dataset" diff --git a/docs/source/tutorials/features/long_sequence_context_parallel_single_node.md b/docs/source/tutorials/features/long_sequence_context_parallel_single_node.md index 30375a9d..476b34a6 100644 --- a/docs/source/tutorials/features/long_sequence_context_parallel_single_node.md +++ b/docs/source/tutorials/features/long_sequence_context_parallel_single_node.md @@ -105,8 +105,8 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ **Notice:** -- for vllm version below `v0.12.0` use parameter: `--rope_scaling '{"rope_type":"yarn","factor":4,"original_max_position_embeddings":32768}' \` -- for vllm version `v0.12.0` use parameter: `--hf-overrides '{"rope_parameters": {"rope_type":"yarn","rope_theta":1000000,"factor":4,"original_max_position_embeddings":32768}}' \` +- for vllm version below `v0.12.0` use parameter: `--rope-scaling '{"rope_type":"yarn","factor":4,"original_max_position_embeddings":32768}' \` +- for vllm version same as or newer than `v0.12.0` use parameter: `--hf-overrides '{"rope_parameters": {"rope_type":"yarn","rope_theta":1000000,"factor":4,"original_max_position_embeddings":32768}}' \` The parameters are explained as follows: diff --git a/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md b/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md index 37a5496d..f6a92e87 100644 --- a/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md +++ b/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md @@ -244,7 +244,7 @@ python load_balance_proxy_server_example.py \ --host 192.0.0.1 \ --port 8080 \ --prefiller-hosts 192.0.0.1 \ - 
--prefiller-port 13700 \
+    --prefiller-ports 13700 \
     --decoder-hosts 192.0.0.1 \
     --decoder-ports 13701
 ```
 
@@ -252,7 +252,7 @@ python load_balance_proxy_server_example.py \
 |Parameter | Meaning |
 | --- | --- |
 | --port | Port of proxy |
-| --prefiller-port | All ports of prefill |
+| --prefiller-ports | All ports of prefill |
 | --decoder-ports | All ports of decoder |
 
 ## Verification

diff --git a/docs/source/tutorials/models/Qwen2.5-Omni.md b/docs/source/tutorials/models/Qwen2.5-Omni.md
index 03e50ce9..f5e94fcb 100644
--- a/docs/source/tutorials/models/Qwen2.5-Omni.md
+++ b/docs/source/tutorials/models/Qwen2.5-Omni.md
@@ -178,7 +178,7 @@ Qwen2.5-Omni on vllm-ascend has been tested on AISBench.
 
 1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
-2. After execution, you can get the result, here is the result of `Qwen2.5-Omni-7B` with `vllm-ascend:0.11.0rc0` for reference only.
+2. After execution, you can get the result, here is the result of `Qwen2.5-Omni-7B` with `vllm-ascend:v0.11.0rc0` for reference only.
 
 | dataset | platform | metric | mode | vllm-api-stream-chat |
 |----- | ----- | ----- | ----- | -----|

diff --git a/docs/source/tutorials/models/Qwen3-235B-A22B.md b/docs/source/tutorials/models/Qwen3-235B-A22B.md
index bc5ea18b..4bad6d0c 100644
--- a/docs/source/tutorials/models/Qwen3-235B-A22B.md
+++ b/docs/source/tutorials/models/Qwen3-235B-A22B.md
@@ -127,7 +127,7 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \
 
 - [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B#processing-long-texts) originally only supports a 40960 context (max_position_embeddings). If you want to use it and its related quantization weights to run long sequences (such as a 128k context), it is required to use the yarn rope-scaling technique.
   - For vLLM version same as or newer than `v0.12.0`, use parameter: `--hf-overrides '{"rope_parameters": {"rope_type":"yarn","rope_theta":1000000,"factor":4,"original_max_position_embeddings":32768}}' \`.
-  - For vllm version below `v0.12.0`, use parameter: `--rope_scaling '{"rope_type":"yarn","factor":4,"original_max_position_embeddings":32768}' \`.
+  - For vllm version below `v0.12.0`, use parameter: `--rope-scaling '{"rope_type":"yarn","factor":4,"original_max_position_embeddings":32768}' \`.
 If you are using weights like [Qwen3-235B-A22B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507) which originally support long contexts, there is no need to add this parameter.
 
 The parameters are explained as follows:
@@ -150,7 +150,7 @@ The parameters are explained as follows:
 
 ### Multi-node Deployment with MP (Recommended)
 
-Assume you have Atlas 800 A3 (64G*16) nodes (or 2* A2), and want to deploy the `Qwen3-VL-235B-A22B-Instruct` model across multiple nodes.
+Assume you have Atlas 800 A3 (64G*16) nodes (or 2* A2), and want to deploy the `Qwen3-235B-A22B-Instruct` model across multiple nodes.
 
 Node 0
 
@@ -282,7 +282,7 @@ Here are two accuracy evaluation methods.
 
 1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
-2. After execution, you can get the result, here is the result of `Qwen3-235B-A22B-w8a8` in `vllm-ascend:0.11.0rc0` for reference only.
+2. After execution, you can get the result, here is the result of `Qwen3-235B-A22B-w8a8` in `vllm-ascend:v0.11.0rc0` for reference only.
 
 | dataset | version | metric | mode | vllm-api-general-chat |
 |----- | ----- | ----- | ----- | -----|
@@ -310,7 +310,7 @@ Take the `serve` as an example. Run the code as follows.
 ```shell
 export VLLM_USE_MODELSCOPE=true
-vllm bench serve --model vllm-ascend/Qwen3-235B-A22B-w8a8 --dataset-name random --random-input 200 --num-prompts 200 --request-rate 1 --save-result --result-dir ./
+vllm bench serve --model vllm-ascend/Qwen3-235B-A22B-w8a8 --dataset-name random --random-input-len 200 --num-prompts 200 --request-rate 1 --save-result --result-dir ./
 ```
 
 After a few minutes, you can get the performance evaluation result.
 
@@ -589,7 +589,7 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \
 PD proxy:
 
 ```shell
-python load_balance_proxy_server_example.py --port 12347 --prefiller-hosts prefill_node_1_ip --prefiller-port 8000 --decoder-hosts decode_node_1_ip --decoder-ports 8000
+python load_balance_proxy_server_example.py --port 12347 --prefiller-hosts prefill_node_1_ip --prefiller-ports 8000 --decoder-hosts decode_node_1_ip --decoder-ports 8000
 ```
 
 Benchmark scripts:

diff --git a/docs/source/tutorials/models/Qwen3-32B-W4A4.md b/docs/source/tutorials/models/Qwen3-32B-W4A4.md
index 28b9ecb4..a928fd7f 100644
--- a/docs/source/tutorials/models/Qwen3-32B-W4A4.md
+++ b/docs/source/tutorials/models/Qwen3-32B-W4A4.md
@@ -108,10 +108,10 @@ curl http://localhost:8000/v1/completions \
   -d '{
     "model": "qwen3-32b-w4a4",
     "prompt": "what is large language model?",
-    "max_completion_tokens": "128",
-    "top_p": "0.95",
-    "top_k": "40",
-    "temperature": "0.0"
+    "max_completion_tokens": 128,
+    "top_p": 0.95,
+    "top_k": 40,
+    "temperature": 0
   }'
 ```

diff --git a/docs/source/tutorials/models/Qwen3-8B-W4A8.md b/docs/source/tutorials/models/Qwen3-8B-W4A8.md
index 8d07b117..a8874900 100644
--- a/docs/source/tutorials/models/Qwen3-8B-W4A8.md
+++ b/docs/source/tutorials/models/Qwen3-8B-W4A8.md
@@ -106,10 +106,10 @@ curl http://localhost:8000/v1/completions \
   -d '{
     "model": "qwen3-8b-w4a8",
     "prompt": "what is large language model?",
-    "max_completion_tokens": "128",
-    "top_p": "0.95",
-    "top_k": "40",
-    "temperature": "0.0"
+    "max_completion_tokens": 128,
+    "top_p": 0.95,
+    "top_k": 40,
+    "temperature": 0
   }'
 ```

diff --git a/docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md b/docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md
index 3e1bb5d0..16e165ab 100644
--- a/docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md
+++ b/docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md
@@ -92,7 +92,7 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json"
 
 1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
-2. After execution, you can get the result, here is the result of `Qwen3-Coder-30B-A3B-Instruct` in `vllm-ascend:0.11.0rc0` for reference only.
+2. After execution, you can get the result, here is the result of `Qwen3-Coder-30B-A3B-Instruct` in `vllm-ascend:v0.11.0rc0` for reference only.
 | dataset | version | metric | mode | vllm-api-general-chat |
 |----- | ----- | ----- | ----- | -----|

diff --git a/docs/source/tutorials/models/Qwen3-VL-Reranker.md b/docs/source/tutorials/models/Qwen3-VL-Reranker.md
index cb5a5b9d..faee59f6 100644
--- a/docs/source/tutorials/models/Qwen3-VL-Reranker.md
+++ b/docs/source/tutorials/models/Qwen3-VL-Reranker.md
@@ -71,7 +71,7 @@ Start the server with the following command:
 vllm serve Qwen/Qwen3-VL-Reranker-8B \
   --runner pooling \
   --max-model-len 4096 \
-  --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' \
+  --hf-overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' \
   --chat-template ./qwen3_vl_reranker.jinja
 ```

diff --git a/docs/source/tutorials/models/Qwen3_reranker.md b/docs/source/tutorials/models/Qwen3_reranker.md
index ba30df56..fa787e06 100644
--- a/docs/source/tutorials/models/Qwen3_reranker.md
+++ b/docs/source/tutorials/models/Qwen3_reranker.md
@@ -35,7 +35,7 @@ Using the Qwen3-Reranker-8B model as an example, first run the docker container
 ### Online Inference
 
 ```bash
-vllm serve Qwen/Qwen3-Reranker-8B --host 127.0.0.1 --port 8888 --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
+vllm serve Qwen/Qwen3-Reranker-8B --host 127.0.0.1 --port 8888 --hf-overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
 ```
 
 Once your server is started, you can send requests following the examples below.

diff --git a/tests/e2e/multicard/2-cards/test_qwen3_performance.py b/tests/e2e/multicard/2-cards/test_qwen3_performance.py
index 7c3b409f..6ab03858 100644
--- a/tests/e2e/multicard/2-cards/test_qwen3_performance.py
+++ b/tests/e2e/multicard/2-cards/test_qwen3_performance.py
@@ -42,7 +42,7 @@ vllm_bench_cases = {
     "random_input_len": 128,
     "max_concurrency": 40,
     "random_output_len": 100,
-    "temperature": 0.0,
+    "temperature": 0,
 }
 
 # NOTE: Any changes for the baseline throughput should be approved by team members.
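Several hunks above encode the same version rule: vllm releases below `v0.12.0` take `--rope-scaling '...'`, while `v0.12.0` and newer take `--hf-overrides '{"rope_parameters": ...}'`. As a hedged illustration only, the hypothetical helper below (not part of this patch) shows how a launch script could pick the right flag at runtime; it assumes vllm is installed and that the `packaging` library is available.

```python
import json
from importlib.metadata import version

from packaging.version import Version


def yarn_rope_args() -> list[str]:
    """Build the yarn rope-scaling CLI arguments for `vllm serve`,
    following the version rule documented in the hunks above.
    Hypothetical helper for illustration only."""
    params = {
        "rope_type": "yarn",
        "factor": 4,
        "original_max_position_embeddings": 32768,
    }
    if Version(version("vllm")) < Version("0.12.0"):
        # Older vllm: pass the scaling dict directly via --rope-scaling.
        return ["--rope-scaling", json.dumps(params)]
    # v0.12.0 and newer: wrap the parameters in --hf-overrides, adding
    # rope_theta as the documented example does.
    params["rope_theta"] = 1000000
    return ["--hf-overrides", json.dumps({"rope_parameters": params})]


print(yarn_rope_args())
```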
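The `Qwen3-32B-W4A4.md` and `Qwen3-8B-W4A8.md` hunks fix the same mistake, which the `test_qwen3_performance.py` hunk mirrors: sampling parameters must be JSON numbers, not quoted strings. A minimal sketch of the corrected request using Python's `requests` library follows; the endpoint and model name come from the curl example in the patch, and a running server is assumed.

```python
import requests

# Sampling parameters are JSON numbers here; quoting them as strings
# (e.g. "top_p": "0.95") can fail request validation on an
# OpenAI-compatible server.
payload = {
    "model": "qwen3-8b-w4a8",
    "prompt": "what is large language model?",
    "max_completion_tokens": 128,
    "top_p": 0.95,
    "top_k": 40,
    "temperature": 0,
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```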