Support decoding with byte-level BPE (bbpe) models. (#1633)

This commit is contained in:
Fangjun Kuang
2024-12-20 19:21:32 +08:00
committed by GitHub
parent 7192e576a9
commit b76cd9033a
11 changed files with 270 additions and 10 deletions

View File

@@ -43,7 +43,7 @@ static OfflineRecognitionResult Convert(
text.append(sym);
if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) {
// for byte bpe models,
// for bpe models with byte_fallback,
// (but don't rewrite printable characters 0x20..0x7e,
// which collide with standard BPE units)
std::ostringstream os;
@@ -54,6 +54,10 @@ static OfflineRecognitionResult Convert(
r.tokens.push_back(std::move(sym));
}
if (sym_table.IsByteBpe()) {
text = sym_table.DecodeByteBpe(text);
}
r.text = std::move(text);
float frame_shift_s = frame_shift_ms / 1000. * subsampling_factor;