Encode hotwords in C++ side (#828)

* Encode hotwords in C++ side
This commit is contained in:
Wei Kang
2024-05-20 19:41:36 +08:00
committed by GitHub
parent 8af2af8466
commit b012b78ceb
43 changed files with 714 additions and 102 deletions

View File

@@ -110,11 +110,9 @@ def get_args():
type=str,
default="",
help="""
-The file containing hotwords, one words/phrases per line, and for each
-phrase the bpe/cjkchar are separated by a space. For example:
-▁HE LL O ▁WORLD
-你 好 世 界
+The file containing hotwords, one words/phrases per line, like
+HELLO WORLD
+你好世界
""",
)
@@ -128,6 +126,28 @@ def get_args():
""",
)
parser.add_argument(
"--modeling-unit",
type=str,
default="",
help="""
The modeling unit of the model, valid values are cjkchar, bpe, cjkchar+bpe.
Used only when hotwords-file is given.
""",
)
parser.add_argument(
"--bpe-vocab",
type=str,
default="",
help="""
The path to the bpe vocabulary, the bpe vocabulary is generated by
sentencepiece, you can also export the bpe vocabulary through a bpe model
by `scripts/export_bpe_vocab.py`. Used only when hotwords-file is given
and modeling-unit is bpe or cjkchar+bpe.
""",
)
parser.add_argument(
"--encoder",
default="",
@@ -347,6 +367,8 @@ def main():
decoding_method=args.decoding_method,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
modeling_unit=args.modeling_unit,
bpe_vocab=args.bpe_vocab,
blank_penalty=args.blank_penalty,
debug=args.debug,
)

View File

@@ -198,11 +198,9 @@ def get_args():
type=str,
default="",
help="""
-The file containing hotwords, one words/phrases per line, and for each
-phrase the bpe/cjkchar are separated by a space. For example:
-▁HE LL O ▁WORLD
-你 好 世 界
+The file containing hotwords, one words/phrases per line, like
+HELLO WORLD
+你好世界
""",
)
@@ -216,6 +214,28 @@ def get_args():
""",
)
parser.add_argument(
"--modeling-unit",
type=str,
default="",
help="""
The modeling unit of the model, valid values are cjkchar, bpe, cjkchar+bpe.
Used only when hotwords-file is given.
""",
)
parser.add_argument(
"--bpe-vocab",
type=str,
default="",
help="""
The path to the bpe vocabulary, the bpe vocabulary is generated by
sentencepiece, you can also export the bpe vocabulary through a bpe model
by `scripts/export_bpe_vocab.py`. Used only when hotwords-file is given
and modeling-unit is bpe or cjkchar+bpe.
""",
)
parser.add_argument(
"--blank-penalty",
type=float,
@@ -302,6 +322,8 @@ def main():
lm_scale=args.lm_scale,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
modeling_unit=args.modeling_unit,
bpe_vocab=args.bpe_vocab,
blank_penalty=args.blank_penalty,
)
elif args.zipformer2_ctc: