Fix the overhead due to the penalizer in bench_latency (#1496)
This commit is contained in:
@@ -97,14 +97,12 @@ class InputMetadata:
|
||||
self.modalities = [r.modalities for r in reqs]
|
||||
|
||||
def compute_positions(self, batch: ScheduleBatch):
|
||||
position_ids_offsets = batch.position_ids_offsets
|
||||
|
||||
if self.forward_mode.is_decode():
|
||||
if True:
|
||||
self.positions = self.seq_lens - 1
|
||||
else:
|
||||
# Deprecated
|
||||
self.positions = (self.seq_lens - 1) + position_ids_offsets
|
||||
self.positions = (self.seq_lens - 1) + batch.position_ids_offsets
|
||||
else:
|
||||
if True:
|
||||
self.positions = torch.tensor(
|
||||
@@ -119,7 +117,7 @@ class InputMetadata:
|
||||
)
|
||||
else:
|
||||
# Deprecated
|
||||
position_ids_offsets_cpu = position_ids_offsets.cpu().numpy()
|
||||
position_ids_offsets_cpu = batch.position_ids_offsets.cpu().numpy()
|
||||
self.positions = torch.tensor(
|
||||
np.concatenate(
|
||||
[
|
||||
|
||||
@@ -467,7 +467,6 @@ class ModelRunner:
|
||||
logger.info("Capture cuda graph begin. This can take up to several minutes.")
|
||||
self.cuda_graph_runner = CudaGraphRunner(self)
|
||||
|
||||
@torch.inference_mode()
|
||||
def forward_decode(self, batch: ScheduleBatch):
|
||||
if self.server_args.lora_paths is not None:
|
||||
self.lora_manager.prepare_lora_batch(batch)
|
||||
@@ -481,7 +480,6 @@ class ModelRunner:
|
||||
batch.input_ids, input_metadata.positions, input_metadata
|
||||
)
|
||||
|
||||
@torch.inference_mode()
|
||||
def forward_extend(self, batch: ScheduleBatch):
|
||||
input_metadata = InputMetadata.from_schedule_batch(self, batch)
|
||||
if self.server_args.lora_paths is not None:
|
||||
@@ -500,7 +498,6 @@ class ModelRunner:
|
||||
get_embedding=True,
|
||||
)
|
||||
|
||||
@torch.inference_mode()
|
||||
def forward_extend_multi_modal(self, batch: ScheduleBatch):
|
||||
input_metadata = InputMetadata.from_schedule_batch(self, batch)
|
||||
return self.model.forward(
|
||||
|
||||
Reference in New Issue
Block a user