Support reading multi-channel wave files with 8/16/32-bit encoded samples (#1258)

2024-08-15 14:54:43 +08:00
parent 62c4d4ab62
commit ca729faebf
5 changed files with 150 additions and 44 deletions
--- a/.github/scripts/test-offline-ctc.sh
+++ b/.github/scripts/test-offline-ctc.sh
@@ -38,14 +38,28 @@ done
 # test wav reader for non-standard wav files
-curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/naudio.wav
+waves=(
-curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/junk-padding.wav
+  naudio.wav
  junk-padding.wav
  int8-1-channel-zh.wav
  int8-2-channel-zh.wav
  int8-4-channel-zh.wav
  int16-1-channel-zh.wav
  int16-2-channel-zh.wav
  int32-1-channel-zh.wav
  int32-2-channel-zh.wav
  float32-1-channel-zh.wav
  float32-2-channel-zh.wav
 )
 for w in ${waves[@]}; do
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$w
-time $EXE \
+  time $EXE \
-  --tokens=$repo/tokens.txt \
+    --tokens=$repo/tokens.txt \
-  --sense-voice-model=$repo/model.int8.onnx \
+    --sense-voice-model=$repo/model.int8.onnx \
-  ./naudio.wav \
+    $w
-  ./junk-padding.wav
+  rm -v $w
 done
 rm -rf $repo
--- a/.github/workflows/linux.yaml
+++ b/.github/workflows/linux.yaml
@@ -143,6 +143,16 @@ jobs:
          name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
          path: install/*
      - name: Test offline CTC
        shell: bash
        run: |
          du -h -d1 .
          export PATH=$PWD/build/bin:$PATH
          export EXE=sherpa-onnx-offline
          .github/scripts/test-offline-ctc.sh
          du -h -d1 .
      - name: Test online punctuation
        shell: bash
        run: |
@@ -163,17 +173,6 @@ jobs:
          .github/scripts/test-offline-transducer.sh
          du -h -d1 .
      - name: Test offline CTC
        shell: bash
        run: |
          du -h -d1 .
          export PATH=$PWD/build/bin:$PATH
          export EXE=sherpa-onnx-offline
          .github/scripts/test-offline-ctc.sh
          du -h -d1 .
      - name: Test online transducer
        shell: bash
        run: |
--- a/sherpa-onnx/csrc/offline-tts-frontend.h
+++ b/sherpa-onnx/csrc/offline-tts-frontend.h
@@ -6,6 +6,7 @@
 #define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
 #include <cstdint>
 #include <string>
 #include <utility>
 #include <vector>
 #include "sherpa-onnx/csrc/macros.h"
--- a/sherpa-onnx/csrc/wave-reader.cc
+++ b/sherpa-onnx/csrc/wave-reader.cc
@@ -50,6 +50,16 @@ struct WaveHeader {
 };
 static_assert(sizeof(WaveHeader) == 44);
 /*
 sox int16-1-channel-zh.wav -b 8 int8-1-channel-zh.wav
 sox int16-1-channel-zh.wav -c 2 int16-2-channel-zh.wav
 We use Audacity to generate int32-1-channel-zh.wav and float32-1-channel-zh.wav
 because sox uses WAVE_FORMAT_EXTENSIBLE, which is not easy to support
 in sherpa-onnx.
 */
 // Read a wave file. If the file has more than one channel, only the first
 // channel is used.
 // Return its samples normalized to the range [-1, 1).
 std::vector<float> ReadWaveImpl(std::istream &is, int32_t *sampling_rate,
@@ -114,9 +124,18 @@ std::vector<float> ReadWaveImpl(std::istream &is, int32_t *sampling_rate,
  is.read(reinterpret_cast<char *>(&header.audio_format),
          sizeof(header.audio_format));
-  if (header.audio_format != 1) {  // 1 for PCM
+  if (header.audio_format != 1 && header.audio_format != 3) {
    // 1 for integer PCM
    // 3 for floating point PCM
    // see https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
    // and https://github.com/microsoft/DirectXTK/wiki/Wave-Formats
    SHERPA_ONNX_LOGE("Expected audio_format 1. Given: %d\n",
                     header.audio_format);
    if (header.audio_format == static_cast<int16_t>(0xfffe)) {
      SHERPA_ONNX_LOGE("We don't support WAVE_FORMAT_EXTENSIBLE files.");
    }
    *is_ok = false;
    return {};
  }
@@ -125,10 +144,9 @@ std::vector<float> ReadWaveImpl(std::istream &is, int32_t *sampling_rate,
          sizeof(header.num_channels));
  if (header.num_channels != 1) {  // we support only single channel for now
-    SHERPA_ONNX_LOGE("Expected single channel. Given: %d\n",
+    SHERPA_ONNX_LOGE(
-                     header.num_channels);
+        "Warning: %d channels are found. We only use the first channel.\n",
-    *is_ok = false;
+        header.num_channels);
    return {};
  }
  is.read(reinterpret_cast<char *>(&header.sample_rate),
@@ -161,8 +179,9 @@ std::vector<float> ReadWaveImpl(std::istream &is, int32_t *sampling_rate,
    return {};
  }
-  if (header.bits_per_sample != 16) {  // we support only 16 bits per sample
+  if (header.bits_per_sample != 8 && header.bits_per_sample != 16 &&
-    SHERPA_ONNX_LOGE("Expected bits_per_sample 16. Given: %d\n",
+      header.bits_per_sample != 32) {
    SHERPA_ONNX_LOGE("Expected bits_per_sample 8, 16 or 32. Given: %d\n",
                     header.bits_per_sample);
    *is_ok = false;
    return {};
@@ -199,21 +218,95 @@ std::vector<float> ReadWaveImpl(std::istream &is, int32_t *sampling_rate,
  *sampling_rate = header.sample_rate;
-  // header.subchunk2_size contains the number of bytes in the data.
+  std::vector<float> ans;
  // As we assume each sample contains two bytes, so it is divided by 2 here
  std::vector<int16_t> samples(header.subchunk2_size / 2);
-  is.read(reinterpret_cast<char *>(samples.data()), header.subchunk2_size);
+  if (header.bits_per_sample == 16 && header.audio_format == 1) {
-  if (!is) {
+    // header.subchunk2_size contains the number of bytes in the data.
    // Each sample is assumed to occupy two bytes, so the size is divided by 2 here
    std::vector<int16_t> samples(header.subchunk2_size / 2);
    SHERPA_ONNX_LOGE("%d samples, bytes: %d", (int)samples.size(),
                     header.subchunk2_size);
    is.read(reinterpret_cast<char *>(samples.data()), header.subchunk2_size);
    if (!is) {
      SHERPA_ONNX_LOGE("Failed to read %d bytes", header.subchunk2_size);
      *is_ok = false;
      return {};
    }
    ans.resize(samples.size() / header.num_channels);
    // samples are interleaved
    for (int32_t i = 0; i != static_cast<int32_t>(ans.size()); ++i) {
      ans[i] = samples[i * header.num_channels] / 32768.;
    }
  } else if (header.bits_per_sample == 8 && header.audio_format == 1) {
    // number of samples == number of bytes for 8-bit encoded samples
    //
    // 8-bit encoded samples are unsigned!
    std::vector<uint8_t> samples(header.subchunk2_size);
    is.read(reinterpret_cast<char *>(samples.data()), header.subchunk2_size);
    if (!is) {
      SHERPA_ONNX_LOGE("Failed to read %d bytes", header.subchunk2_size);
      *is_ok = false;
      return {};
    }
    ans.resize(samples.size() / header.num_channels);
    for (int32_t i = 0; i != static_cast<int32_t>(ans.size()); ++i) {
      // Note(fangjun): We want to normalize each sample into the range [-1, 1).
      // Since each original sample is in the range [0, 255], dividing
      // them by 128 converts them to the range [0, 2);
      // so after subtracting 1, we get the range [-1, 1).
      //
      ans[i] = samples[i * header.num_channels] / 128. - 1;
    }
  } else if (header.bits_per_sample == 32 && header.audio_format == 1) {
    // 32 here is for int32
    //
    // header.subchunk2_size contains the number of bytes in the data.
    // Each sample is assumed to occupy 4 bytes, so the size is divided by 4 here
    std::vector<int32_t> samples(header.subchunk2_size / 4);
    is.read(reinterpret_cast<char *>(samples.data()), header.subchunk2_size);
    if (!is) {
      SHERPA_ONNX_LOGE("Failed to read %d bytes", header.subchunk2_size);
      *is_ok = false;
      return {};
    }
    ans.resize(samples.size() / header.num_channels);
    for (int32_t i = 0; i != static_cast<int32_t>(ans.size()); ++i) {
      ans[i] = static_cast<float>(samples[i * header.num_channels]) / (1 << 31);
    }
  } else if (header.bits_per_sample == 32 && header.audio_format == 3) {
    // 32 here is for float32
    //
    // header.subchunk2_size contains the number of bytes in the data.
    // Each sample is assumed to occupy 4 bytes, so the size is divided by 4 here
    std::vector<float> samples(header.subchunk2_size / 4);
    is.read(reinterpret_cast<char *>(samples.data()), header.subchunk2_size);
    if (!is) {
      SHERPA_ONNX_LOGE("Failed to read %d bytes", header.subchunk2_size);
      *is_ok = false;
      return {};
    }
    ans.resize(samples.size() / header.num_channels);
    for (int32_t i = 0; i != static_cast<int32_t>(ans.size()); ++i) {
      ans[i] = samples[i * header.num_channels];
    }
  } else {
    SHERPA_ONNX_LOGE(
        "Unsupported %d bits per sample and audio format: %d. Supported values "
        "are: 8, 16, 32.",
        header.bits_per_sample, header.audio_format);
    *is_ok = false;
    return {};
  }
  std::vector<float> ans(samples.size());
  for (int32_t i = 0; i != static_cast<int32_t>(ans.size()); ++i) {
    ans[i] = samples[i] / 32768.;
  }
  *is_ok = true;
  return ans;
 }
--- a/sherpa-onnx/jni/offline-recognizer.cc
+++ b/sherpa-onnx/jni/offline-recognizer.cc
@@ -264,13 +264,9 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_newFromFile(JNIEnv *env,
  return (jlong)model;
 }
 SHERPA_ONNX_EXTERN_C
-JNIEXPORT void JNICALL
+JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_setConfig(
-Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_setConfig(JNIEnv *env,
+    JNIEnv *env, jobject /*obj*/, jlong ptr, jobject _config) {
                                                         jobject /*obj*/,
                                                         jlong ptr,
                                                         jobject _config) {
  auto config = sherpa_onnx::GetOfflineConfig(env, _config);
  SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str());
@@ -350,9 +346,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env,
  // [3]: lang, jstring
  // [4]: emotion, jstring
  // [5]: event, jstring
-  env->SetObjectArrayElement(obj_arr, 3, env->NewStringUTF(result.lang.c_str()));
+  env->SetObjectArrayElement(obj_arr, 3,
-  env->SetObjectArrayElement(obj_arr, 4, env->NewStringUTF(result.emotion.c_str()));
+                             env->NewStringUTF(result.lang.c_str()));
-  env->SetObjectArrayElement(obj_arr, 5, env->NewStringUTF(result.event.c_str()));
+  env->SetObjectArrayElement(obj_arr, 4,
                             env->NewStringUTF(result.emotion.c_str()));
  env->SetObjectArrayElement(obj_arr, 5,
                             env->NewStringUTF(result.event.c_str()));
  return obj_arr;
 }