Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -233,6 +233,17 @@ class PunicaWrapperGPU(PunicaWrapperBase):
         assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
 
+        import vllm.envs as env
+        if env.VLLM_USE_LORA_FUSION:
+            import ixformer.inference.functions as ops
+
+            num_token, m = x.size(0), x.size(-1)
+            k, n = lora_b_stacked[0].size(-1), y.size(-1)
+            if len(lora_a_stacked) == 1 and ops.lora_gemv_optim_condition(num_token, m, k, n):
+                ops.add_lora_linear(y, x, lora_a_stacked, lora_b_stacked,
+                                    lora_bias_stacked=None, scale=1.0, output_slices=(1,))
+                return
+
         assert buffer is None, (
             "To minimize overhead, the buffer should be created by "
             ".add_lora_linear() instead of being passed in.")
||||
@@ -351,6 +362,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
         max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
         if pad_sorted_ids:
             max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+        if topk_ids.numel() < num_experts:
+            max_num_tokens_padded = topk_ids.numel() * block_size
         sorted_ids = torch.empty(
             (max_loras * max_num_tokens_padded,),
             dtype=torch.int32,