[CI] Upgrade vLLM to 20250919 (6d8246aa) and fix broken issues (#2907)

### What this PR does / why we need it?
1. Bump the vLLM commit to 6d8246aaff
2. Adapt to the upstream multi-modal kwargs change in
https://github.com/vllm-project/vllm/pull/24548, keeping both vLLM main and
`v0.10.2` supported (see the version-gate sketch after this list)
3. Fix the metadata_builder changes introduced by
https://github.com/vllm-project/vllm/pull/23693
4. Fix the `structured_outputs_config` changes introduced by
https://github.com/vllm-project/vllm/pull/22772
5. Fix the `moe_config` changes introduced by
https://github.com/vllm-project/vllm/pull/22537
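
The compatibility fixes above all hinge on the same runtime version gate, `vllm_version_is` from `vllm_ascend.utils` (see the import change in the diff below). A minimal sketch of such a gate, assuming it simply compares the installed vLLM release string; the real helper may differ in detail:

```python
# Minimal sketch of a version gate like vllm_ascend.utils.vllm_version_is.
# Assumption: it compares the installed vLLM release string with the target.
from importlib.metadata import PackageNotFoundError, version


def vllm_version_is(target: str) -> bool:
    """Return True if the installed vLLM package is exactly `target`."""
    try:
        return version("vllm") == target
    except PackageNotFoundError:
        # vLLM not installed (e.g. running from a source tree without
        # package metadata); treat as "not this version".
        return False
```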

Co-authored-by: MengqingCao <cmq0113@163.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>


- vLLM version: v0.10.2
- vLLM main: c60e6137f0

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Author: Li Wang
Date: 2025-09-20 17:37:57 +08:00
Committed by: GitHub
Parent: 53ecd89e8f
Commit: 12bcbd02bb
14 changed files with 359 additions and 143 deletions


@@ -47,7 +47,8 @@ from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
 from vllm_ascend.ops.sequence_parallel import MetadataForPadding
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
                                get_all_reduce_merge_state,
-                               get_rm_router_logits_state, is_310p)
+                               get_rm_router_logits_state, is_310p,
+                               vllm_version_is)
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
@@ -278,16 +279,25 @@ class AscendFusedMoE(FusedMoE):
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        moe = FusedMoEConfig.make(
-            num_experts=self.global_num_experts,
-            experts_per_token=top_k,
-            hidden_dim=hidden_size,
-            num_local_experts=self.local_num_experts,
-            moe_parallel_config=self.moe_parallel_config,
-            # TODO (bnell): this needs to be fixed for quantized types.
-            in_dtype=params_dtype,
-            quant_config=quant_config)
+        if vllm_version_is("0.10.2"):
+            moe = FusedMoEConfig.make(
+                num_experts=self.global_num_experts,
+                experts_per_token=top_k,
+                hidden_dim=hidden_size,
+                num_local_experts=self.local_num_experts,
+                moe_parallel_config=self.moe_parallel_config,
+                # TODO (bnell): this needs to be fixed for quantized types.
+                in_dtype=params_dtype,
+                quant_config=quant_config)
+        else:
+            moe = FusedMoEConfig(
+                num_experts=self.global_num_experts,
+                experts_per_token=top_k,
+                hidden_dim=hidden_size,
+                num_local_experts=self.local_num_experts,
+                moe_parallel_config=self.moe_parallel_config,
+                in_dtype=params_dtype,
+            )
         self.moe_config = moe
         if quant_config is None:
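
For reference, the same gate could be factored into a small helper so every call site builds the MoE config identically on both vLLM lines. This is a hypothetical sketch, not vllm-ascend code: the `FusedMoEConfig` import path is assumed, and the fields simply mirror the hunk above.

```python
# Hypothetical helper consolidating the version gate from the hunk above;
# a sketch under assumptions, not the actual vllm-ascend implementation.
from vllm.model_executor.layers.fused_moe.config import \
    FusedMoEConfig  # import path assumed

from vllm_ascend.utils import vllm_version_is


def make_moe_config(global_num_experts, top_k, hidden_size,
                    local_num_experts, moe_parallel_config, params_dtype,
                    quant_config=None):
    if vllm_version_is("0.10.2"):
        # v0.10.2 still exposes the .make() classmethod and accepts
        # quant_config.
        return FusedMoEConfig.make(
            num_experts=global_num_experts,
            experts_per_token=top_k,
            hidden_dim=hidden_size,
            num_local_experts=local_num_experts,
            moe_parallel_config=moe_parallel_config,
            in_dtype=params_dtype,
            quant_config=quant_config)
    # After vllm-project/vllm#22537 the dataclass is constructed directly
    # and no longer takes quant_config here.
    return FusedMoEConfig(
        num_experts=global_num_experts,
        experts_per_token=top_k,
        hidden_dim=hidden_size,
        num_local_experts=local_num_experts,
        moe_parallel_config=moe_parallel_config,
        in_dtype=params_dtype,
    )
```

The PR instead inlines the if/else at the single call site, which keeps the change local to the diff.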