Support replacing homophonic phrases (#2153)

This commit is contained in:
Fangjun Kuang
2025-04-27 15:31:11 +08:00
committed by GitHub
parent e3280027f9
commit f64c58342b
42 changed files with 834 additions and 134 deletions

View File

@@ -98,6 +98,29 @@ for m in model.onnx model.int8.onnx; do
done
done
# Homophone-replacement (hr) smoke test for the SenseVoice models.
#
# Download the hr assets into the current directory. -f makes curl exit
# non-zero on an HTTP error instead of saving the server's error page under
# the asset's file name.
curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2
tar xf dict.tar.bz2
rm dict.tar.bz2

curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/replace.fst
curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/test-hr.wav
curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lexicon.txt

# Decode test-hr.wav with both model files, with and without ITN.
for m in model.onnx model.int8.onnx; do
  for use_itn in 0 1; do
    # Log the wave that is actually decoded; these loops define no $w.
    echo "$m test-hr.wav $use_itn"

    # NOTE(review): $EXE is left unquoted in case it carries a launcher
    # prefix in addition to the binary name - confirm against its definition.
    time $EXE \
      --tokens="$repo/tokens.txt" \
      --sense-voice-model="$repo/$m" \
      --sense-voice-use-itn="$use_itn" \
      --hr-lexicon=./lexicon.txt \
      --hr-dict-dir=./dict \
      --hr-rule-fsts=./replace.fst \
      ./test-hr.wav
  done
done

# Remove the downloaded hr assets so later tests start from a clean directory.
rm -rf dict replace.fst test-hr.wav lexicon.txt
# test wav reader for non-standard wav files
waves=(

View File

@@ -95,6 +95,18 @@ rm $name
# List the model directory, then run the SenseVoice decode-files example.
ls -lh $repo
python3 ./python-api-examples/offline-sense-voice-ctc-decode-files.py

# Fetch the assets used by the "with-hr" example. The dictionary ships as an
# archive that is unpacked and then deleted; the rest are single files.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2
tar xf dict.tar.bz2
rm dict.tar.bz2

for hr_asset in replace.fst test-hr.wav lexicon.txt; do
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/$hr_asset
done

python3 ./python-api-examples/offline-sense-voice-ctc-decode-files-with-hr.py

# Clean up the downloaded hr assets.
rm -rf dict replace.fst test-hr.wav lexicon.txt
if [[ $(uname) == Linux ]]; then
# It needs ffmpeg
log "generate subtitles (Chinese)"