[Refactor] Make MLP weight prefetch consistent with the MoE model's prefetching in terms of code and usage (#6442)

### What this PR does / why we need it?
Refactor MLP weight prefetching so that its code and usage are consistent with the MoE model's prefetching. The environment variables VLLM_ASCEND_ENABLE_PREFETCH_MLP, VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE, and VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE are removed; configure prefetching as follows instead:
--additional-config '{"weight_prefetch_config": { "enabled": true, "prefetch_ratio": {"mlp": { "gate_up": 1.0, "down": 1.0} }}}'
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main: dc917cceb8
---------
Signed-off-by: leo-pony <nengjunma@outlook.com>
2026-02-04 09:08:18 +08:00
parent fa56abea9f
commit 78fad4e348
18 changed files with 250 additions and 171 deletions
--- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -222,7 +222,7 @@ def test_qwen3_dense_fc1_tp2(model):


@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
 def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
    example_prompts = [
        "Hello, my name is",
@@ -236,6 +236,7 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
+            additional_config={"weight_prefetch_config": {"enabled": True}},
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)

--- a/tests/e2e/multicard/2-cards/test_qwen3_performance.py
+++ b/tests/e2e/multicard/2-cards/test_qwen3_performance.py
@@ -57,7 +57,6 @@ async def test_models(model: str) -> None:
    env_dict = {
        "TASK_QUEUE_ENABLE": "1",
        "HCCL_OP_EXPANSION_MODE": "AIV",
-        "VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1",
    }
    server_args = [
        "--async-scheduling",
@@ -74,7 +73,7 @@ async def test_models(model: str) -> None:
        "--compilation-config",
        '{"cudagraph_mode": "FULL_DECODE_ONLY"}',
        "--additional-config",
-        '{"pa_shape_list":[48,64,72,80]}',
+        '{"pa_shape_list":[48,64,72,80],"weight_prefetch_config":{"enabled":true}}',
        "--block-size",
        "128",
        "--trust-remote-code",