[CI] Remove compatibility maintenance for vllm v0.10.1 and v0.10.1.1 (#2840)
### What this PR does / why we need it?
Remove the compatibility-maintenance code for vLLM v0.10.1 and v0.10.1.1.
### Does this PR introduce _any_ user-facing change?
Yes. The `main` branch of vllm-ascend will no longer be compatible with vLLM v0.10.1 or v0.10.1.1.
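Concretely, the PR drops import-time version gates built on `vllm_ascend.utils.vllm_version_is` (the helper is removed from the imports in the diff below). A minimal, self-contained sketch of that pattern, with stand-in functions in place of the real `UnquantizedFusedMoEMethod.forward_oot` patching:

```python
# Sketch of the version-gate pattern this PR removes. All names here are
# stand-ins; the real code gates UnquantizedFusedMoEMethod.forward_oot.

def vllm_version_is(version: str) -> bool:
    """Stand-in for vllm_ascend.utils.vllm_version_is."""
    installed = "0.10.2"  # assume a post-v0.10.1.1 vLLM for illustration
    return installed == version

def forward_oot_v01011():
    return "legacy v0.10.1 / v0.10.1.1 path"

def forward_oot():
    return "current path"

# Before this PR: pick an implementation at import time based on the
# installed vLLM version.
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
    selected = forward_oot_v01011
else:
    selected = forward_oot

# After this PR: the gate is gone and only the current path remains.
selected = forward_oot

print(selected())  # -> "current path"
```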
### How was this patch tested?
CI passed with the existing tests.
- vLLM version: v0.10.1.1
- vLLM main: 6fb2788163
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
```diff
@@ -34,7 +34,7 @@ from vllm_ascend.ops.moe.experts_selector import select_experts
 from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
                                                  AlltoAllCommImpl, MC2CommImpl)
 from vllm_ascend.ops.moe.token_dispatcher import setup_token_dispatchers
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, vllm_version_is
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
 
 original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__
 
@@ -137,67 +137,6 @@ def unquantized_fused_moe_init_func(self, *args, **kwargs):
     self.transpose = True
 
 
-def forward_oot_v01011(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
-        router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None) -> torch.Tensor:
-
-    topk_weights, topk_ids, row_idx = select_experts(
-        hidden_states=x,
-        router_logits=router_logits,
-        top_k=top_k,
-        use_grouped_topk=use_grouped_topk,
-        renormalize=renormalize,
-        topk_group=topk_group,
-        num_expert_group=num_expert_group,
-        custom_routing_function=custom_routing_function,
-        scoring_func=scoring_func,
-        routed_scaling_factor=1.0,
-        e_score_correction_bias=e_score_correction_bias,
-        global_num_experts=global_num_experts)
-
-    if topk_ids.shape[1] < top_k or is_310p():
-        assert global_num_experts is not None
-        return fused_experts_moge(
-            hidden_states=x,
-            w1=layer.w13_weight,
-            w2=layer.w2_weight,
-            moe_parallel_config=self.moe.moe_parallel_config,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            top_k=top_k,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            apply_router_weight_on_input=apply_router_weight_on_input)
-
-    moe_comm_method = get_forward_context().moe_comm_method
-    return moe_comm_method.fused_experts(hidden_states=x,
-                                         w1=layer.w13_weight,
-                                         w2=layer.w2_weight,
-                                         topk_weights=topk_weights,
-                                         topk_ids=topk_ids,
-                                         row_idx=row_idx,
-                                         global_num_experts=global_num_experts,
-                                         expert_map=expert_map)
-
-
 def forward_oot(
         self,
         layer: torch.nn.Module,
@@ -315,59 +254,32 @@ class AscendFusedMoE(FusedMoE):
         num_redundant_experts=0,
         has_bias=False,
     ):
-        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
-            super().__init__(
-                num_experts,
-                top_k,
-                hidden_size,
-                intermediate_size,
-                params_dtype,
-                reduce_results,
-                renormalize,
-                use_grouped_topk,
-                num_expert_group,
-                topk_group,
-                quant_config,
-                tp_size,
-                ep_size,
-                dp_size,
-                prefix,
-                custom_routing_function,
-                scoring_func,
-                e_score_correction_bias,
-                apply_router_weight_on_input,
-                activation,
-                enable_eplb,
-                num_redundant_experts,
-                has_bias,
-            )
-        else:
-            super().__init__(
-                num_experts,
-                top_k,
-                hidden_size,
-                intermediate_size,
-                params_dtype,
-                reduce_results,
-                renormalize,
-                use_grouped_topk,
-                num_expert_group,
-                topk_group,
-                quant_config,
-                tp_size,
-                ep_size,
-                dp_size,
-                prefix,
-                custom_routing_function,
-                scoring_func,
-                routed_scaling_fator,
-                e_score_correction_bias,
-                apply_router_weight_on_input,
-                activation,
-                enable_eplb,
-                num_redundant_experts,
-                has_bias,
-            )
+        super().__init__(
+            num_experts,
+            top_k,
+            hidden_size,
+            intermediate_size,
+            params_dtype,
+            reduce_results,
+            renormalize,
+            use_grouped_topk,
+            num_expert_group,
+            topk_group,
+            quant_config,
+            tp_size,
+            ep_size,
+            dp_size,
+            prefix,
+            custom_routing_function,
+            scoring_func,
+            routed_scaling_fator,
+            e_score_correction_bias,
+            apply_router_weight_on_input,
+            activation,
+            enable_eplb,
+            num_redundant_experts,
+            has_bias,
+        )
         setup_token_dispatchers(self.moe_config.ep_size,
                                 top_k=self.top_k,
                                 num_experts=self.global_num_experts,
@@ -529,8 +441,4 @@ class AscendSharedFusedMoE(AscendFusedMoE):
 
 UnquantizedFusedMoEMethod.__init__ = unquantized_fused_moe_init_func
 UnquantizedFusedMoEMethod.process_weights_after_loading = process_weights_after_loading
 
-if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
-    UnquantizedFusedMoEMethod.forward_oot = forward_oot_v01011
-else:
-    UnquantizedFusedMoEMethod.forward_oot = forward_oot
+UnquantizedFusedMoEMethod.forward_oot = forward_oot
```