Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions
--- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
@@ -11,6 +11,7 @@ from typing import Final, Literal, TypeAlias, TypeVar, cast

 import numpy as np
 from fastapi import Request
+from soundfile import LibsndfileError
 from transformers import PreTrainedTokenizerBase

 import vllm.envs as envs
@@ -57,6 +58,14 @@ try:
 except ImportError:
    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]

+# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`
+# (soundfile is librosa's main backend). Used to decide whether an audio loading
+# failure is a client error (invalid audio file) rather than a server error.
+# 1 = unrecognised format      (file is not a supported audio container)
+# 3 = malformed file           (corrupt or structurally invalid audio)
+# 4 = unsupported encoding     (codec not supported by this libsndfile build)
+_BAD_SF_CODES = {1, 3, 4}
+
 SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
 SpeechToTextResponseVerbose: TypeAlias = (
    TranscriptionResponseVerbose | TranslationResponseVerbose
@@ -315,9 +324,15 @@ class OpenAISpeechToText(OpenAIServing):
            )

        with io.BytesIO(audio_data) as bytes_:
-            # NOTE resample to model SR here for efficiency. This is also a
-            # pre-requisite for chunking, as it assumes Whisper SR.
-            y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
+            try:
+                # NOTE resample to model SR here for efficiency. This is also a
+                # pre-requisite for chunking, as it assumes Whisper SR.
+                y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
+            except LibsndfileError as exc:
+                # Distinguish client errors (invalid audio) from server errors
+                if exc.code in _BAD_SF_CODES:
+                    raise ValueError("Invalid or unsupported audio file.") from exc
+                raise

        duration = librosa.get_duration(y=y, sr=sr)
        do_split_audio = (