Support replacing homophonic phrases (#2153)

2025-04-27 15:31:11 +08:00
parent e3280027f9
commit f64c58342b
42 changed files with 834 additions and 134 deletions
--- a/.github/scripts/test-offline-ctc.sh
+++ b/.github/scripts/test-offline-ctc.sh
@@ -98,6 +98,29 @@ for m in model.onnx model.int8.onnx; do
  done
 done

+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2  # -f: fail on HTTP errors instead of saving the error page
+tar xf dict.tar.bz2  # expected to unpack ./dict, which is passed as --hr-dict-dir below
+rm dict.tar.bz2
+
+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/replace.fst
+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/test-hr.wav
+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lexicon.txt
+
+for m in model.onnx model.int8.onnx; do  # run once per model file
+  for use_itn in 0 1; do  # and once per --sense-voice-use-itn value
+    echo "$m $use_itn"  # log only the variables this loop sets (no $w here)
+    time $EXE \
+      --tokens="$repo/tokens.txt" \
+      --sense-voice-model="$repo/$m" \
+      --sense-voice-use-itn="$use_itn" \
+      --hr-lexicon=./lexicon.txt \
+      --hr-dict-dir=./dict \
+      --hr-rule-fsts=./replace.fst \
+      ./test-hr.wav
+  done
+done
+
+rm -rf dict replace.fst test-hr.wav lexicon.txt  # remove the downloaded hr-files assets

 # test wav reader for non-standard wav files
 waves=(
--- a/.github/scripts/test-python.sh
+++ b/.github/scripts/test-python.sh
@@ -95,6 +95,18 @@ rm $name
 ls -lh $repo
 python3 ./python-api-examples/offline-sense-voice-ctc-decode-files.py

+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2  # -f: fail on HTTP errors instead of saving the error page
+tar xf dict.tar.bz2  # expected to unpack ./dict (removed again at the end of this section)
+rm dict.tar.bz2
+
+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/replace.fst
+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/test-hr.wav
+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lexicon.txt
+
+python3 ./python-api-examples/offline-sense-voice-ctc-decode-files-with-hr.py  # presumably reads the files fetched above from the current directory; confirm in the script
+
+rm -rf dict replace.fst test-hr.wav lexicon.txt  # remove the downloaded hr-files assets
+
 if [[ $(uname) == Linux ]]; then
  # It needs ffmpeg
  log  "generate subtitles (Chinese)"