Remove sampling info events and overlap thread file (#11300)

This commit is contained in:
Liangsheng Yin
2025-10-07 21:34:25 +08:00
committed by GitHub
parent 79d3495177
commit 501dfa6b42
9 changed files with 13 additions and 393 deletions

View File

@@ -2057,15 +2057,11 @@ class ModelRunner:
def _preprocess_logits(
self, logits_output: LogitsProcessorOutput, sampling_info: SamplingBatchInfo
):
# Pre-sampling step: makes sure sampling_info.update_regex_vocab_mask() has
# run (or waits for it), then calls sampling_info.apply_logits_bias() on
# logits_output.next_token_logits.
#
# NOTE(review): this span is a diff hunk shown without +/- markers or
# indentation. The branch on sampling_info.sampling_info_done appears to be
# the pre-change code and the unconditional update_regex_vocab_mask() call
# further down the post-change code -- confirm against the actual commit
# before treating the lines below as one function body.
# Apply logit bias
if sampling_info.sampling_info_done:
# Overlap mode: the function update_regex_vocab_mask was executed
# in process_batch_result of the last batch.
if sampling_info.grammars:
# Block until sampling_info_done is signalled; only needed when grammars
# are present (presumably an event set once the mask is ready -- TODO confirm).
sampling_info.sampling_info_done.wait()
else:
# Normal mode: put CPU-heavy tasks here so that they overlap with the forward pass.
sampling_info.update_regex_vocab_mask()
# NOTE: In overlap mode, the function update_regex_vocab_mask (in sample)
# was executed after we processed the last batch's results.
# Calculate the logits bias and apply it to next_token_logits.
sampling_info.update_regex_vocab_mask()
sampling_info.apply_logits_bias(logits_output.next_token_logits)
def sample(