enginex-bi_150-vllm/config/speech_to_text.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


from pydantic.dataclasses import dataclass

from vllm.config.utils import config


@config
@dataclass
class SpeechToTextConfig:
    """Settings for preparing audio input for speech-to-text models.

    Groups the resampling rate, the longest clip accepted without chunking,
    and the parameters that govern how longer audio is split into chunks.
    """

    sample_rate: float = 16_000
    """Sample rate (Hz) to resample input audio to. Most speech models expect
    16kHz audio input. The input audio will be automatically resampled to this
    rate before processing."""

    max_audio_clip_s: int = 30
    """Maximum duration in seconds for a single audio clip without chunking.
    Audio longer than this will be split into smaller chunks if
    `allow_audio_chunking` evaluates to True, otherwise it will be rejected."""

    overlap_chunk_second: int = 1
    """Overlap duration in seconds between consecutive audio chunks when
    splitting long audio. This helps maintain context across chunk boundaries
    and improves transcription quality at split points."""

    min_energy_split_window_size: int | None = 1600
    """Window size in samples for finding low-energy (quiet) regions to split
    audio chunks. The algorithm looks for the quietest moment within this
    window to minimize cutting through speech. Default 1600 samples ≈ 100ms
    at 16kHz. If None, no chunking will be done."""

    @property
    def allow_audio_chunking(self) -> bool:
        """Report whether audio chunking is enabled.

        Chunking is enabled exactly when `min_energy_split_window_size` is
        set, i.e. is not None.
        """
        split_window = self.min_energy_split_window_size
        return split_window is not None
v1.0 2026-03-05 18:06:10 +08:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`


			`from pydantic.dataclasses import dataclass`

			`from vllm.config.utils import config`


			`@config`
			`@dataclass`
			`class SpeechToTextConfig:`
			`"""Configuration for speech-to-text models."""`

			`sample_rate: float = 16_000`
			`"""Sample rate (Hz) to resample input audio to. Most speech models expect`
			`16kHz audio input. The input audio will be automatically resampled to this`
			`rate before processing."""`

			`max_audio_clip_s: int = 30`
			`"""Maximum duration in seconds for a single audio clip without chunking.`
			`Audio longer than this will be split into smaller chunks if`
			`` `allow_audio_chunking` evaluates to True, otherwise it will be rejected.""" ``

			`overlap_chunk_second: int = 1`
			`"""Overlap duration in seconds between consecutive audio chunks when`
			`splitting long audio. This helps maintain context across chunk boundaries`
			`and improves transcription quality at split points."""`

			`min_energy_split_window_size: int \| None = 1600`
			`"""Window size in samples for finding low-energy (quiet) regions to split`
			`audio chunks. The algorithm looks for the quietest moment within this`
			`window to minimize cutting through speech. Default 1600 samples ≈ 100ms`
			`at 16kHz. If None, no chunking will be done."""`

			`@property`
			`def allow_audio_chunking(self) -> bool:`
			`return self.min_energy_split_window_size is not None`