Feat: Support audio in Phi4-mm model (#8048)

2025-07-18 21:03:53 -07:00
parent d918ab7985
commit b7e951a6db
11 changed files with 3333 additions and 54 deletions
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -691,12 +691,17 @@ def decode_video_base64(video_base64):
        )  # Return an empty array and size tuple if no frames were found


-def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarray:
+def load_audio(
+    audio_file: str, sr: Optional[int] = None, mono: bool = True
+) -> np.ndarray:
    # Use soundfile here, since librosa uses it under the hood,
    # and librosa will not support audio loading in the future
    import soundfile as sf
    from scipy.signal import resample

+    if sr is None:
+        sr = 16000
+
    # Load audio data
    if isinstance(audio_file, bytes):
        audio, original_sr = sf.read(BytesIO(audio_file))