Support GigaAM transducer V2 (#2136)

2025-04-20 10:15:20 +08:00
parent 01110cc22b
commit be0f382a54
14 changed files with 355 additions and 25 deletions
--- a/scripts/nemo/GigaAM/test-onnx-rnnt.py
+++ b/scripts/nemo/GigaAM/test-onnx-rnnt.py
@@ -20,7 +20,7 @@ def create_fbank():
    opts.frame_opts.window_type = "hann"

    # Even though GigaAM uses 400 for fft, here we use 512
-    # since kaldi-native-fbank only support fft for power of 2.
+    # since kaldi-native-fbank only supports FFT sizes that are a power of 2.
    opts.frame_opts.round_to_power_of_two = True

    opts.mel_opts.low_freq = 0
@@ -166,12 +166,7 @@ class OnnxModel:
        target = torch.tensor([[token]], dtype=torch.int32).numpy()
        target_len = torch.tensor([1], dtype=torch.int32).numpy()

-        (
-            decoder_out,
-            decoder_out_length,
-            state0_next,
-            state1_next,
-        ) = self.decoder.run(
+        (decoder_out, decoder_out_length, state0_next, state1_next,) = self.decoder.run(
            [
                self.decoder.get_outputs()[0].name,
                self.decoder.get_outputs()[1].name,
@@ -213,8 +208,12 @@ def main():
    id2token = dict()
    with open("./tokens.txt", encoding="utf-8") as f:
        for line in f:
-            t, idx = line.split()
-            id2token[int(idx)] = t
+            fields = line.split()
+            if len(fields) == 1:
+                id2token[int(fields[0])] = " "
+            else:
+                t, idx = fields
+                id2token[int(idx)] = t

    fbank = create_fbank()
    audio, sample_rate = sf.read("./example.wav", dtype="float32", always_2d=True)