Support GPU pinning for LoRA (#8697)
This commit is contained in:
@@ -1538,14 +1538,11 @@ class Scheduler(
         # Get requests from the waiting queue to a new prefill batch
         for req in self.waiting_queue:
-            if (
-                self.enable_lora
-                and len(
-                    lora_set
-                    | set([req.lora_id for req in adder.can_run_list])
-                    | set([req.lora_id])
-                )
-                > self.max_loras_per_batch
-            ):
+            if self.enable_lora and not self.tp_worker.can_run_lora_batch(
+                lora_set
+                | set([req.lora_id for req in adder.can_run_list])
+                | set([req.lora_id])
+            ):
                 self.running_batch.batch_is_full = True
                 break
||||
Reference in New Issue
Block a user