#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)


def get_vocab():
    """Build the symbol-to-id table.

    The symbol list is the pad symbol, followed by the punctuation
    characters, the ASCII letters and the IPA letters, in that order. Each
    symbol is mapped to its 0-based position in that list.

    Returns:
        A dict mapping every single-character symbol to its integer id.
        The pad symbol "$" has id 0. If a symbol occurs more than once in
        the list, it keeps the position of its first occurrence in the
        dict's iteration order but is assigned the index of its last
        occurrence.
    """
    # https://huggingface.co/hexgrad/kLegacy/blob/main/v0.19/kokoro.py#L75
    _pad = "$"
    _punctuation = ';:,.!?¡¿—…"«»“” '
    _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

    # Later duplicates overwrite earlier ones, exactly like assigning
    # dicts[symbols[i]] = i in an index loop; ids are NOT renumbered, so the
    # id of a duplicate's first occurrence is simply left unused.
    return {symbol: index for index, symbol in enumerate(symbols)}


def main():
    """Write the vocabulary returned by get_vocab() to ``tokens.txt``.

    The file is created (or overwritten) in the current working directory,
    encoded as UTF-8. Each line holds one entry in the dict's iteration
    order: the symbol, a single space, then its integer id.
    """
    token2id = get_vocab()
    with open("tokens.txt", "w", encoding="utf-8") as f:
        # One batched call instead of one write() per symbol; the bytes
        # written are the same.
        f.writelines(f"{s} {i}\n" for s, i in token2id.items())


# Generate tokens.txt only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()