Refactor hotwords，support loading hotwords from file (#296)

2023-09-14 19:33:17 +08:00
parent 087367d7fe
commit 47184f9db7
34 changed files with 803 additions and 300 deletions
--- a/sherpa-onnx/python/sherpa_onnx/cli.py
+++ b/sherpa-onnx/python/sherpa_onnx/cli.py
@@ -0,0 +1,55 @@
+# Copyright (c)  2023  Xiaomi Corporation
+
+import logging
+import click
+from pathlib import Path
+from sherpa_onnx import text2token
+
+
+@click.group()
+def cli():
+    """
+    The shell entry point to sherpa-onnx.
+    """
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+    )
+
+
+@cli.command(name="text2token")
+@click.argument("input", type=click.Path(exists=True, dir_okay=False))
+@click.argument("output", type=click.Path())
+@click.option(
+    "--tokens",
+    type=str,
+    required=True,
+    help="The path to tokens.txt.",
+)
+@click.option(
+    "--tokens-type",
+    type=str,
+    required=True,
+    help="The type of modeling units, should be cjkchar, bpe or cjkchar+bpe",
+)
+@click.option(
+    "--bpe-model",
+    type=str,
+    help="The path to bpe.model. Only required when tokens-type is bpe or cjkchar+bpe.",
+)
+def encode_text(
+    input: Path, output: Path, tokens: Path, tokens_type: str, bpe_model: Path
+):
+    """
+    Encode the texts given by the INPUT to tokens and write the results to the OUTPUT.
+    """
+    texts = []
+    with open(input, "r", encoding="utf8") as f:
+        for line in f:
+            texts.append(line.strip())
+    encoded_texts = text2token(
+        texts, tokens=tokens, tokens_type=tokens_type, bpe_model=bpe_model
+    )
+    with open(output, "w", encoding="utf8") as f:
+        for txt in encoded_texts:
+            f.write(" ".join(txt) + "\n")