Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -233,6 +233,17 @@ class PunicaWrapperGPU(PunicaWrapperBase):
         assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
 
+        import vllm.envs as env
+        if env.VLLM_USE_LORA_FUSION:
+            import ixformer.inference.functions as ops
+
+            num_token, m = x.size(0), x.size(-1)
+            k, n = lora_b_stacked[0].size(-1), y.size(-1)
+            if len(lora_a_stacked) == 1 and ops.lora_gemv_optim_condition(num_token, m, k, n):
+                ops.add_lora_linear(y, x, lora_a_stacked, lora_b_stacked,
+                                    lora_bias_stacked=None, scale=1.0, output_slices=(1,))
+                return
+
         assert buffer is None, (
             "To minimize overhead, the buffer should be created by "
             ".add_lora_linear() instead of being passed in.")
||||
@@ -351,6 +362,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
         max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
         if pad_sorted_ids:
             max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+        if topk_ids.numel() < num_experts:
+            max_num_tokens_padded = topk_ids.numel() * block_size
         sorted_ids = torch.empty(
             (max_loras * max_num_tokens_padded,),
             dtype=torch.int32,