Refactor hotwords,support loading hotwords from file (#296)
This commit is contained in:
97
scripts/text2token.py
Executable file
97
scripts/text2token.py
Executable file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
This script encode the texts (given line by line through `text`) to tokens and
|
||||
write the results to the file given by ``output``.
|
||||
|
||||
Usage:
|
||||
If the tokens_type is bpe:
|
||||
|
||||
python3 ./text2token.py \
|
||||
--text texts.txt \
|
||||
--tokens tokens.txt \
|
||||
--tokens-type bpe \
|
||||
--bpe-model bpe.model \
|
||||
--output hotwords.txt
|
||||
|
||||
If the tokens_type is cjkchar:
|
||||
|
||||
python3 ./text2token.py \
|
||||
--text texts.txt \
|
||||
--tokens tokens.txt \
|
||||
--tokens-type cjkchar \
|
||||
--output hotwords.txt
|
||||
|
||||
If the tokens_type is cjkchar+bpe:
|
||||
|
||||
python3 ./text2token.py \
|
||||
--text texts.txt \
|
||||
--tokens tokens.txt \
|
||||
--tokens-type cjkchar+bpe \
|
||||
--bpe-model bpe.model \
|
||||
--output hotwords.txt
|
||||
|
||||
"""
|
||||
import argparse
|
||||
|
||||
from sherpa_onnx import text2token
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--text",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the input texts",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--tokens",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The path to tokens.txt.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--tokens-type",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The type of modeling units, should be cjkchar, bpe or cjkchar+bpe",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--bpe-model",
|
||||
type=str,
|
||||
help="The path to bpe.model. Only required when tokens-type is bpe or cjkchar+bpe.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path where the encoded tokens will be written to.",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = get_args()
|
||||
|
||||
texts = []
|
||||
with open(args.text, "r", encoding="utf8") as f:
|
||||
for line in f:
|
||||
texts.append(line.strip())
|
||||
encoded_texts = text2token(
|
||||
texts,
|
||||
tokens=args.tokens,
|
||||
tokens_type=args.tokens_type,
|
||||
bpe_model=args.bpe_model,
|
||||
)
|
||||
with open(args.output, "w", encoding="utf8") as f:
|
||||
for txt in encoded_texts:
|
||||
f.write(" ".join(txt) + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user