[Misc] Upgrade vllm hash to 12_14 (#5000)

### What this PR does / why we need it? ### Does this PR introduce _any_ user-facing change? 1. fix https://github.com/vllm-project/vllm/pull/27938 2. fix https://github.com/vllm-project/vllm/pull/27145 pooling models now support chunked prefill and prefix caching. 3. fix https://github.com/vllm-project/vllm/pull/30181 define the CPU fields in the field config where they really belong. 4. fix https://github.com/vllm-project/vllm/pull/28168 define the CPU fields in the field config where they really belong. 5. fix https://github.com/vllm-project/vllm/pull/30201 some module renames 6. fix https://github.com/vllm-project/vllm/pull/29067 fusedmoe module refactor 7. fix https://github.com/vllm-project/vllm/pull/29066 fusedmoe module refactor 8. fix https://github.com/vllm-project/vllm/pull/29624 ### How was this patch tested? - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-12-15 19:54:23 +08:00
parent 3b7eb5179f
commit 8d2998d0e4
17 changed files with 167 additions and 1183 deletions
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -41,7 +41,7 @@ from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                               flashcomm2_o_shared_enabled, is_enable_nz,
                               weak_ref_tensors)
-from vllm_ascend.worker.npu_input_batch import InputBatch
+from vllm_ascend.worker.npu_input_batch import NPUInputBatch

 if TYPE_CHECKING:
    from vllm.v1.core.sched.output import SchedulerOutput
@@ -280,7 +280,7 @@ class AscendMLAMetadataBuilder:
                                              dtype=torch.uint8,
                                              device=device)

-    def reorder_batch(self, input_batch: "InputBatch",
+    def reorder_batch(self, input_batch: "NPUInputBatch",
                      scheduler_output: "SchedulerOutput") -> bool:
        # We now want to reorder the batch so that the "decode" requests are at
        # the front and the "prefill" requests are at the using the least amount