[Misc] Drop Prefetch MLP Env (#7357)

### What this PR does / why we need it? Remove the deprecated environment variables related to MLP prefetching. ### Does this PR introduce _any_ user-facing change? Yes, the deprecated env vars can no longer be used. - vLLM version: v0.17.0 - vLLM main: 4034c3d32e Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-03-19 14:27:27 +08:00
parent ce239db4fb
commit 8e0ebb470a
4 changed files with 10 additions and 65 deletions
--- a/docs/source/tutorials/features/suffix_speculative_decoding.md
+++ b/docs/source/tutorials/features/suffix_speculative_decoding.md
@@ -80,8 +80,6 @@ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
 export TASK_QUEUE_ENABLE=1
 # Enable the AIVector core to directly schedule ROCE communication.
 export HCCL_OP_EXPANSION_MODE="AIV"
-# Enable MLP prefetch for better performance.
-export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1
 # Enable FlashComm_v1 optimization when tensor parallel is enabled.
 export VLLM_ASCEND_ENABLE_FLASHCOMM1=1

@@ -94,7 +92,7 @@ vllm serve /data/Qwen3-32B \
  --max-num-batched-tokens 40960 \
  --speculative-config '{"method": "suffix", "num_speculative_tokens": 3}' \
  --gpu-memory-utilization 0.9 \
-  --additional-config '{"pa_shape_list":[48,64,72,80]}' \
+  --additional-config '{"pa_shape_list":[48,64,72,80], "weight_prefetch_config":{"enable":true}}' \
  --port 8011
 ```