Add C++ runtime and Python API for NeMo Canary models (#2352)

This commit is contained in:
Fangjun Kuang
2025-07-07 17:03:49 +08:00
committed by GitHub
parent f8d957a24b
commit 0e738c356c
24 changed files with 1091 additions and 8 deletions

View File

@@ -281,9 +281,14 @@ def export_decoder(canary_model):
def export_tokens(canary_model):
    """Write the tokenizer vocabulary to ``./tokens.txt``.

    One line is written per token id, in the form ``<token> <id>``, for ids
    ``0`` through ``canary_model.tokenizer.vocab_size - 1``. The token text
    is whatever ``canary_model.tokenizer.ids_to_text([i])`` returns.

    A leading space in a token is replaced with U+2581 so the word-boundary
    information is kept as a visible character in the token column instead
    of being confused with the space that separates token and id.

    Args:
      canary_model: Object exposing ``tokenizer.vocab_size`` and
        ``tokenizer.ids_to_text(list_of_ids)``.
    """
    # Spelled as an escape so the character cannot be silently dropped by
    # editors or text tooling; an empty marker would simply delete the
    # leading space instead of marking it.
    underline = "\u2581"
    tokenizer = canary_model.tokenizer

    with open("./tokens.txt", "w", encoding="utf-8") as f:
        for i in range(tokenizer.vocab_size):
            s = tokenizer.ids_to_text([i])
            # startswith() rather than s[0]: an empty token must not raise
            # IndexError.
            if s.startswith(" "):
                s = underline + s[1:]
            f.write(f"{s} {i}\n")

    # Report only after the file has been closed (and therefore flushed).
    print("Saved to tokens.txt")

View File

@@ -289,7 +289,13 @@ def main():
tokens.append(t)
print("len(tokens)", len(tokens))
print("tokens", tokens)
text = "".join([id2token[i] for i in tokens])
underline = ""
# underline = b"\xe2\x96\x81".decode()
text = text.replace(underline, " ").strip()
print("text:", text)