From b0265b258dc9f868cdf02b8d74a6230d7b45788b Mon Sep 17 00:00:00 2001 From: gtf35 Date: Sat, 13 Apr 2024 23:39:07 +0800 Subject: [PATCH] Replace torchaudio with soundfile in python-api-examples (#765) --- ...aker-identification-with-vad-non-streaming-asr.py | 12 +++++++++--- .../speaker-identification-with-vad.py | 12 +++++++++--- python-api-examples/speaker-identification.py | 12 +++++++++--- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/python-api-examples/speaker-identification-with-vad-non-streaming-asr.py b/python-api-examples/speaker-identification-with-vad-non-streaming-asr.py index fe735e17..0534b80f 100755 --- a/python-api-examples/speaker-identification-with-vad-non-streaming-asr.py +++ b/python-api-examples/speaker-identification-with-vad-non-streaming-asr.py @@ -65,7 +65,7 @@ from typing import Dict, List, Tuple import numpy as np import sherpa_onnx -import torchaudio +import soundfile as sf try: import sounddevice as sd @@ -357,8 +357,14 @@ def load_speaker_file(args) -> Dict[str, List[str]]: def load_audio(filename: str) -> Tuple[np.ndarray, int]: - samples, sample_rate = torchaudio.load(filename) - return samples[0].contiguous().numpy(), sample_rate + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + return samples, sample_rate def compute_speaker_embedding( diff --git a/python-api-examples/speaker-identification-with-vad.py b/python-api-examples/speaker-identification-with-vad.py index afad458d..8514ed58 100755 --- a/python-api-examples/speaker-identification-with-vad.py +++ b/python-api-examples/speaker-identification-with-vad.py @@ -60,7 +60,7 @@ from typing import Dict, List, Tuple import numpy as np import sherpa_onnx -import torchaudio +import soundfile as sf try: import sounddevice as sd @@ -160,8 +160,14 @@ def load_speaker_file(args) -> Dict[str, List[str]]: def load_audio(filename: str) -> Tuple[np.ndarray, int]: - samples, sample_rate = torchaudio.load(filename) - return samples[0].contiguous().numpy(), sample_rate + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + return samples, sample_rate def compute_speaker_embedding( diff --git a/python-api-examples/speaker-identification.py b/python-api-examples/speaker-identification.py index c09478d8..abfa4558 100755 --- a/python-api-examples/speaker-identification.py +++ b/python-api-examples/speaker-identification.py @@ -52,7 +52,7 @@ from typing import Dict, List, Tuple import numpy as np import sherpa_onnx -import torchaudio +import soundfile as sf try: import sounddevice as sd @@ -145,8 +145,14 @@ def load_speaker_file(args) -> Dict[str, List[str]]: def load_audio(filename: str) -> Tuple[np.ndarray, int]: - samples, sample_rate = torchaudio.load(filename) - return samples[0].contiguous().numpy(), sample_rate + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + return samples, sample_rate def compute_speaker_embedding(