Support Giga AM transducer V2 (#2136)

This commit is contained in:
Fangjun Kuang
2025-04-20 10:15:20 +08:00
committed by GitHub
parent 01110cc22b
commit be0f382a54
14 changed files with 355 additions and 25 deletions

View File

@@ -20,7 +20,7 @@ def create_fbank():
opts.frame_opts.window_type = "hann"
# Even though GigaAM uses 400 for fft, here we use 512
# since kaldi-native-fbank only support fft for power of 2.
# since kaldi-native-fbank only supports fft for power of 2.
opts.frame_opts.round_to_power_of_two = True
opts.mel_opts.low_freq = 0
@@ -166,12 +166,7 @@ class OnnxModel:
target = torch.tensor([[token]], dtype=torch.int32).numpy()
target_len = torch.tensor([1], dtype=torch.int32).numpy()
(
decoder_out,
decoder_out_length,
state0_next,
state1_next,
) = self.decoder.run(
(decoder_out, decoder_out_length, state0_next, state1_next,) = self.decoder.run(
[
self.decoder.get_outputs()[0].name,
self.decoder.get_outputs()[1].name,
@@ -213,8 +208,12 @@ def main():
id2token = dict()
with open("./tokens.txt", encoding="utf-8") as f:
for line in f:
t, idx = line.split()
id2token[int(idx)] = t
fields = line.split()
if len(fields) == 1:
id2token[int(fields[0])] = " "
else:
t, idx = fields
id2token[int(idx)] = t
fbank = create_fbank()
audio, sample_rate = sf.read("./example.wav", dtype="float32", always_2d=True)