[main] flashcomm_v1 optim in Qwen Dense Models (#2802)

### What this PR does / why we need it? Flashcomm_v1 optim in Qwen Dense Models. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? CI passed with new added/existing test. - vLLM version: v0.10.1.1 - vLLM main: 5e537f45b4 Co-authored-by: 1024daniel <xxltju324@gmail.com>
2025-09-08 22:52:24 +08:00
parent 4df8df5b94
commit 1bbb20ea13
11 changed files with 362 additions and 20 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -37,6 +37,7 @@ from vllm.attention.layer import Attention
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
 from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig
+from vllm.distributed import tensor_model_parallel_all_gather
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                          has_kv_transfer_group)
 from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
@@ -1182,6 +1183,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
+        if get_forward_context().flashcomm_v1_enabled:
+            hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
+            pad_size = get_forward_context().pad_size
+            if pad_size > 0:
+                hidden_states = hidden_states[:-pad_size, :]
        return hidden_states

    def _build_attn_state(self, num_reqs, num_scheduled_tokens,