56 lines
1.4 KiB
Python
56 lines
1.4 KiB
Python
|
|
# Copyright (c) 2023 Xiaomi Corporation
|
||
|
|
|
||
|
|
import logging
|
||
|
|
import click
|
||
|
|
from pathlib import Path
|
||
|
|
from sherpa_onnx import text2token
|
||
|
|
|
||
|
|
|
||
|
|
@click.group()
|
||
|
|
def cli():
|
||
|
|
"""
|
||
|
|
The shell entry point to sherpa-onnx.
|
||
|
|
"""
|
||
|
|
logging.basicConfig(
|
||
|
|
format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
|
||
|
|
level=logging.INFO,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
@cli.command(name="text2token")
|
||
|
|
@click.argument("input", type=click.Path(exists=True, dir_okay=False))
|
||
|
|
@click.argument("output", type=click.Path())
|
||
|
|
@click.option(
|
||
|
|
"--tokens",
|
||
|
|
type=str,
|
||
|
|
required=True,
|
||
|
|
help="The path to tokens.txt.",
|
||
|
|
)
|
||
|
|
@click.option(
|
||
|
|
"--tokens-type",
|
||
|
|
type=str,
|
||
|
|
required=True,
|
||
|
|
help="The type of modeling units, should be cjkchar, bpe or cjkchar+bpe",
|
||
|
|
)
|
||
|
|
@click.option(
|
||
|
|
"--bpe-model",
|
||
|
|
type=str,
|
||
|
|
help="The path to bpe.model. Only required when tokens-type is bpe or cjkchar+bpe.",
|
||
|
|
)
|
||
|
|
def encode_text(
|
||
|
|
input: Path, output: Path, tokens: Path, tokens_type: str, bpe_model: Path
|
||
|
|
):
|
||
|
|
"""
|
||
|
|
Encode the texts given by the INPUT to tokens and write the results to the OUTPUT.
|
||
|
|
"""
|
||
|
|
texts = []
|
||
|
|
with open(input, "r", encoding="utf8") as f:
|
||
|
|
for line in f:
|
||
|
|
texts.append(line.strip())
|
||
|
|
encoded_texts = text2token(
|
||
|
|
texts, tokens=tokens, tokens_type=tokens_type, bpe_model=bpe_model
|
||
|
|
)
|
||
|
|
with open(output, "w", encoding="utf8") as f:
|
||
|
|
for txt in encoded_texts:
|
||
|
|
f.write(" ".join(txt) + "\n")
|