Enable overlap by default (#2067)

2024-11-19 22:07:58 -08:00
parent 699384cb01
commit 7d671e4ad2
17 changed files with 92 additions and 75 deletions
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -116,7 +116,7 @@ class ModelRunner:
            )

        if self.is_multimodal:
-            logger.warning(
+            logger.info(
                "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
            )
            server_args.chunked_prefill_size = None
@@ -636,13 +636,11 @@ class ModelRunner:
        self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
    ) -> torch.Tensor:
        sampling_info = forward_batch.sampling_info
-
        if sampling_info.sampling_info_done:
            # Overlap mode: the function update_regex_vocab_mask was executed
            # in process_batch_result of the last batch.
            if sampling_info.grammars:
                sampling_info.sampling_info_done.wait()
-            sampling_info.update_penalties()
        else:
            # Normal mode: Put CPU-heavy tasks here. They will be overlapped with the forward pass.
            sampling_info.update_regex_vocab_mask()