Upgrade to vllm 0.17.0 corex v4.1 overlay

This commit is contained in:
2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -233,6 +233,17 @@ class PunicaWrapperGPU(PunicaWrapperBase):
assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
import vllm.envs as env
if env.VLLM_USE_LORA_FUSION:
import ixformer.inference.functions as ops
num_token, m = x.size(0), x.size(-1)
k, n = lora_b_stacked[0].size(-1), y.size(-1)
if len(lora_a_stacked) == 1 and ops.lora_gemv_optim_condition(num_token, m, k, n):
ops.add_lora_linear(y, x, lora_a_stacked, lora_b_stacked,
lora_bias_stacked = None, scale = 1.0, output_slices = (1,))
return
assert buffer is None, (
"To minimize overhead, the buffer should be created by "
".add_lora_linear() instead of being passed in."
@@ -351,6 +362,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
if pad_sorted_ids:
max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
if topk_ids.numel() < num_experts:
max_num_tokens_padded = topk_ids.numel() * block_size
sorted_ids = torch.empty(
(max_loras * max_num_tokens_padded,),
dtype=torch.int32,