From c84a8338635c8611826e99d3c27afb3dbcc71a6f Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Thu, 6 Feb 2025 22:57:13 +0800
Subject: [PATCH] Add C++ and Python API for Kokoro 1.0 multilingual TTS model
 (#1795)

---
 .github/scripts/test-python.sh                |  24 +-
 .github/workflows/export-kokoro.yaml          |  16 +-
 .gitignore                                    |   1 +
 c-api-examples/kws-c-api.c                    |  13 +-
 cxx-api-examples/kws-cxx-api.cc               |  13 +-
 python-api-examples/offline-tts-play.py       |  51 +-
 python-api-examples/offline-tts.py            |  51 +-
 scripts/kokoro/v1.0/generate_voices_bin.py    |   2 +-
 sherpa-onnx/csrc/CMakeLists.txt               |   1 +
 sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc | 522 ++++++++++++++++++
 sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h  |  45 ++
 sherpa-onnx/csrc/melo-tts-lexicon.cc          |   2 +
 sherpa-onnx/csrc/melo-tts-lexicon.h           |   1 -
 sherpa-onnx/csrc/offline-tts-frontend.h       |   6 +
 sherpa-onnx/csrc/offline-tts-kokoro-impl.h    |  42 ++
 .../csrc/offline-tts-kokoro-model-config.cc   |  39 ++
 .../csrc/offline-tts-kokoro-model-config.h    |  12 +-
 sherpa-onnx/csrc/piper-phonemize-lexicon.cc   |   9 +-
 sherpa-onnx/jni/offline-tts.cc                |   2 -
 .../csrc/offline-tts-kokoro-model-config.cc   |   6 +-
 20 files changed, 819 insertions(+), 39 deletions(-)
 create mode 100644 sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
 create mode 100644 sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h

diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh
index 39e6577a..dd4da512 100755
--- a/.github/scripts/test-python.sh
+++ b/.github/scripts/test-python.sh
@@ -267,6 +267,27 @@ log "Offline TTS test"
 # test waves are saved in ./tts
 mkdir ./tts
 
+log "kokoro-multi-lang-v1_0 test"
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
+tar xf kokoro-multi-lang-v1_0.tar.bz2
+rm kokoro-multi-lang-v1_0.tar.bz2
+
+python3 ./python-api-examples/offline-tts.py \
+  --debug=1 \
+  --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
+  --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
+  --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
+  --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
+  --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
+  --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
+  --num-threads=2 \
+  --sid=18 \
+  --output-filename="./tts/kokoro-18-zh-en.wav" \
+  "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢？"
+
+rm -rf kokoro-multi-lang-v1_0
+
 log "kokoro-en-v0_19 test"
 
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
@@ -580,13 +601,10 @@ if [[ x$OS != x'windows-latest' ]]; then
   repo=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01
   log "Start testing ${repo}"
 
-  pushd $dir
   curl -LS -O https://github.com/pkufool/keyword-spotting-models/releases/download/v0.1/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
   tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
   rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
-  popd
 
-  repo=$dir/$repo
   ls -lh $repo
 
   python3 ./python-api-examples/keyword-spotter.py
diff --git a/.github/workflows/export-kokoro.yaml b/.github/workflows/export-kokoro.yaml
index adbc8bb1..53b5120c 100644
--- a/.github/workflows/export-kokoro.yaml
+++ b/.github/workflows/export-kokoro.yaml
@@ -4,7 +4,6 @@ on:
   push:
     branches:
       - export-kokoro
-      - kokoro-1.0-2
 
   workflow_dispatch:
 
@@ -76,6 +75,14 @@ jobs:
         if: matrix.version == '1.0'
         shell: bash
         run: |
+          curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
+          tar xvf dict.tar.bz2
+          rm dict.tar.bz2
+
+          curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
+          curl -SL -o number-zh.fst  https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
+          curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
+
           src=scripts/kokoro/v1.0
 
           d=kokoro-multi-lang-v1_0
@@ -87,7 +94,12 @@ jobs:
           cp -v $src/tokens.txt $d/
           cp -v $src/lexicon*.txt $d/
           cp -v $src/README.md $d/README.md
+          cp -av dict $d/
+          cp -v ./*.fst $d/
           ls -lh $d/
+          echo "---"
+          ls -lh $d/dict
+
           tar cjfv $d.tar.bz2 $d
           rm -rf $d
 
@@ -180,6 +192,8 @@ jobs:
             cp -v ../scripts/kokoro/v1.0/lexicon*.txt .
             cp -v ../scripts/kokoro/v1.0/README.md ./README.md
             cp -v ../LICENSE ./
+            cp -av ../dict ./
+            cp -v ../*.fst ./
 
             git lfs track "*.onnx"
             git add .
diff --git a/.gitignore b/.gitignore
index 2840d8ff..ea356b06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,3 +132,4 @@ kokoro-en-v0_19
 lexicon.txt
 us_gold.json
 us_silver.json
+kokoro-multi-lang-v1_0
diff --git a/c-api-examples/kws-c-api.c b/c-api-examples/kws-c-api.c
index 3ac42758..ecd70ccf 100644
--- a/c-api-examples/kws-c-api.c
+++ b/c-api-examples/kws-c-api.c
@@ -25,27 +25,28 @@ int32_t main() {
 
   memset(&config, 0, sizeof(config));
   config.model_config.transducer.encoder =
-      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
+      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
       "encoder-epoch-12-avg-2-chunk-16-left-64.onnx";
 
   config.model_config.transducer.decoder =
-      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
+      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
       "decoder-epoch-12-avg-2-chunk-16-left-64.onnx";
 
   config.model_config.transducer.joiner =
-      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
+      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
       "joiner-epoch-12-avg-2-chunk-16-left-64.onnx";
 
   config.model_config.tokens =
-      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt";
+      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
+      "tokens.txt";
 
   config.model_config.provider = "cpu";
   config.model_config.num_threads = 1;
   config.model_config.debug = 1;
 
   config.keywords_file =
-      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/"
-      "test_keywords.txt";
+      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
+      "test_wavs/test_keywords.txt";
 
   const SherpaOnnxKeywordSpotter *kws = SherpaOnnxCreateKeywordSpotter(&config);
   if (!kws) {
diff --git a/cxx-api-examples/kws-cxx-api.cc b/cxx-api-examples/kws-cxx-api.cc
index cdcb86ba..44f73438 100644
--- a/cxx-api-examples/kws-cxx-api.cc
+++ b/cxx-api-examples/kws-cxx-api.cc
@@ -24,27 +24,28 @@ int32_t main() {
 
   KeywordSpotterConfig config;
   config.model_config.transducer.encoder =
-      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
+      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
       "encoder-epoch-12-avg-2-chunk-16-left-64.onnx";
 
   config.model_config.transducer.decoder =
-      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
+      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
       "decoder-epoch-12-avg-2-chunk-16-left-64.onnx";
 
   config.model_config.transducer.joiner =
-      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
+      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
       "joiner-epoch-12-avg-2-chunk-16-left-64.onnx";
 
   config.model_config.tokens =
-      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt";
+      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
+      "tokens.txt";
 
   config.model_config.provider = "cpu";
   config.model_config.num_threads = 1;
   config.model_config.debug = 1;
 
   config.keywords_file =
-      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/"
-      "test_keywords.txt";
+      "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
+      "test_wavs/test_keywords.txt";
 
   KeywordSpotter kws = KeywordSpotter::Create(config);
   if (!kws.Get()) {
diff --git a/python-api-examples/offline-tts-play.py b/python-api-examples/offline-tts-play.py
index 5ece997b..8ec419ac 100755
--- a/python-api-examples/offline-tts-play.py
+++ b/python-api-examples/offline-tts-play.py
@@ -11,7 +11,7 @@ while the model is still generating.
 
 Usage:
 
-Example (1/6)
+Example (1/7)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
 tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \
  --output-filename=./generated.wav \
  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
 
-Example (2/6)
+Example (2/7)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
 tar xvf vits-zh-aishell3.tar.bz2
@@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \
  --output-filename=./liubei-21.wav \
  "勿以恶小而为之，勿以善小而不为。惟贤惟德，能服于人。122334"
 
-Example (3/6)
+Example (3/7)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \
  --output-filename=./test-2.wav \
  "当夜幕降临，星光点点，伴随着微风拂面，我在静谧中感受着时光的流转，思念如涟漪荡漾，梦境如画卷展开，我与自然融为一体，沉静在这片宁静的美丽之中，感受着生命的奇迹与温柔。2024年5月11号，拨打110或者18920240511。123456块钱。"
 
-Example (4/6)
+Example (4/7)
 
 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
 tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \
  --output-filename=./test-matcha.wav \
  "某某银行的副行长和一些行政领导表示，他们去过长江和长白山; 经济不断增长。2024年12月31号，拨打110或者18920240511。123456块钱。"
 
-Example (5/6)
+Example (5/7)
 
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
@@ -88,7 +88,9 @@ python3 ./python-api-examples/offline-tts-play.py \
   --num-threads=2 \
  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
 
-Example (6/6)
+Example (6/7)
+
+(This version of kokoro supports only English)
 
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
 tar xf kokoro-en-v0_19.tar.bz2
@@ -105,6 +107,27 @@ python3 ./python-api-examples/offline-tts.py \
   --output-filename="./kokoro-10.wav" \
   "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be  a statesman, a businessman, an official, or a scholar."
 
+Example (7/7)
+
+(This version of kokoro supports English, Chinese, etc.)
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
+tar xf kokoro-multi-lang-v1_0.tar.bz2
+rm kokoro-multi-lang-v1_0.tar.bz2
+
+python3 ./python-api-examples/offline-tts-play.py \
+  --debug=1 \
+  --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
+  --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
+  --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
+  --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
+  --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
+  --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
+  --num-threads=2 \
+  --sid=18 \
+  --output-filename="./kokoro-18-zh-en.wav" \
+  "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢？"
+
 You can find more models at
 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
 
@@ -247,6 +270,20 @@ def add_kokoro_args(parser):
         help="Path to the dict directory of espeak-ng.",
     )
 
+    parser.add_argument(
+        "--kokoro-dict-dir",
+        type=str,
+        default="",
+        help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro",
+    )
+
+    parser.add_argument(
+        "--kokoro-lexicon",
+        type=str,
+        default="",
+        help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro",
+    )
+
 
 def get_args():
     parser = argparse.ArgumentParser(
@@ -459,6 +496,8 @@ def main():
                 voices=args.kokoro_voices,
                 tokens=args.kokoro_tokens,
                 data_dir=args.kokoro_data_dir,
+                dict_dir=args.kokoro_dict_dir,
+                lexicon=args.kokoro_lexicon,
             ),
             provider=args.provider,
             debug=args.debug,
diff --git a/python-api-examples/offline-tts.py b/python-api-examples/offline-tts.py
index aace840f..c4e63b4f 100755
--- a/python-api-examples/offline-tts.py
+++ b/python-api-examples/offline-tts.py
@@ -12,7 +12,7 @@ generated audio.
 
 Usage:
 
-Example (1/6)
+Example (1/7)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
 tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \
  --output-filename=./generated.wav \
  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
 
-Example (2/6)
+Example (2/7)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
 tar xvf vits-icefall-zh-aishell3.tar.bz2
@@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \
  --output-filename=./liubei-21.wav \
  "勿以恶小而为之，勿以善小而不为。惟贤惟德，能服于人。122334"
 
-Example (3/6)
+Example (3/7)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \
  --output-filename=./test-2.wav \
  "当夜幕降临，星光点点，伴随着微风拂面，我在静谧中感受着时光的流转，思念如涟漪荡漾，梦境如画卷展开，我与自然融为一体，沉静在这片宁静的美丽之中，感受着生命的奇迹与温柔。2024年5月11号，拨打110或者18920240511。123456块钱。"
 
-Example (4/6)
+Example (4/7)
 
 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
 tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \
  --output-filename=./test-matcha.wav \
  "某某银行的副行长和一些行政领导表示，他们去过长江和长白山; 经济不断增长。2024年12月31号，拨打110或者18920240511。123456块钱。"
 
-Example (5/6)
+Example (5/7)
 
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
@@ -89,7 +89,9 @@ python3 ./python-api-examples/offline-tts.py \
   --num-threads=2 \
  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
 
-Example (6/6)
+Example (6/7)
+
+(This version of kokoro supports only English)
 
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
 tar xf kokoro-en-v0_19.tar.bz2
@@ -106,6 +108,27 @@ python3 ./python-api-examples/offline-tts.py \
   --output-filename="./kokoro-10.wav" \
   "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be  a statesman, a businessman, an official, or a scholar."
 
+Example (7/7)
+
+(This version of kokoro supports English, Chinese, etc.)
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
+tar xf kokoro-multi-lang-v1_0.tar.bz2
+rm kokoro-multi-lang-v1_0.tar.bz2
+
+python3 ./python-api-examples/offline-tts.py \
+  --debug=1 \
+  --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
+  --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
+  --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
+  --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
+  --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
+  --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
+  --num-threads=2 \
+  --sid=18 \
+  --output-filename="./kokoro-18-zh-en.wav" \
+  "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢？"
+
 You can find more models at
 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
 
@@ -234,6 +257,20 @@ def add_kokoro_args(parser):
         help="Path to the dict directory of espeak-ng.",
     )
 
+    parser.add_argument(
+        "--kokoro-dict-dir",
+        type=str,
+        default="",
+        help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro",
+    )
+
+    parser.add_argument(
+        "--kokoro-lexicon",
+        type=str,
+        default="",
+        help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro",
+    )
+
 
 def get_args():
     parser = argparse.ArgumentParser(
@@ -342,6 +379,8 @@ def main():
                 voices=args.kokoro_voices,
                 tokens=args.kokoro_tokens,
                 data_dir=args.kokoro_data_dir,
+                dict_dir=args.kokoro_dict_dir,
+                lexicon=args.kokoro_lexicon,
             ),
             provider=args.provider,
             debug=args.debug,
diff --git a/scripts/kokoro/v1.0/generate_voices_bin.py b/scripts/kokoro/v1.0/generate_voices_bin.py
index 84d1d20d..c89ce243 100755
--- a/scripts/kokoro/v1.0/generate_voices_bin.py
+++ b/scripts/kokoro/v1.0/generate_voices_bin.py
@@ -71,7 +71,7 @@ def main():
     with open("voices.bin", "wb") as f:
         for _, speaker in id2speaker.items():
             m = torch.load(
-                f"{speaker}.pt",
+                f"voices/{speaker}.pt",
                 weights_only=True,
                 map_location="cpu",
             ).numpy()
diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt
index d5303b75..4976f58f 100644
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -153,6 +153,7 @@ if(SHERPA_ONNX_ENABLE_TTS)
   list(APPEND sources
     hifigan-vocoder.cc
     jieba-lexicon.cc
+    kokoro-multi-lang-lexicon.cc
     lexicon.cc
     melo-tts-lexicon.cc
     offline-tts-character-frontend.cc
diff --git a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
new file mode 100644
index 00000000..1dab60c4
--- /dev/null
+++ b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
@@ -0,0 +1,522 @@
+// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
+//
+// Copyright (c)  2025  Xiaomi Corporation
+
+#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
+
+#include <codecvt>
+#include <fstream>
+#include <locale>
+#include <regex>  // NOLINT
+#include <sstream>
+#include <strstream>
+#include <unordered_map>
+#include <utility>
+
+#if __ANDROID_API__ >= 9
+#include "android/asset_manager.h"
+#include "android/asset_manager_jni.h"
+#endif
+
+#if __OHOS__
+#include "rawfile/raw_file_manager.h"
+#endif
+
+#include "cppjieba/Jieba.hpp"
+#include "espeak-ng/speak_lib.h"
+#include "phoneme_ids.hpp"
+#include "phonemize.hpp"
+#include "sherpa-onnx/csrc/file-utils.h"
+#include "sherpa-onnx/csrc/onnx-utils.h"
+#include "sherpa-onnx/csrc/symbol-table.h"
+#include "sherpa-onnx/csrc/text-utils.h"
+
+namespace sherpa_onnx {
+
+void CallPhonemizeEspeak(const std::string &text,
+                         piper::eSpeakPhonemeConfig &config,  // NOLINT
+                         std::vector<std::vector<piper::Phoneme>> *phonemes);
+
+// Decode UTF-8 bytes into a wide string of UTF-16 code units. See
+// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
+static std::wstring ToWideString(const std::string &s) {
+  using Converter = std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>;
+  return Converter().from_bytes(s);
+}
+
+// Encode a wide string of UTF-16 code units as UTF-8 bytes. See
+// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
+static std::string ToString(const std::wstring &s) {
+  using Converter = std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>;
+  return Converter().to_bytes(s);
+}
+
+class KokoroMultiLangLexicon::Impl {
+ public:
+  Impl(const std::string &tokens, const std::string &lexicon,
+       const std::string &dict_dir, const std::string &data_dir,
+       const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
+      : meta_data_(meta_data), debug_(debug) {
+    InitTokens(tokens);
+    // Must follow InitTokens(): lexicon entries are converted via token2id_.
+    InitLexicon(lexicon);
+    // Creates jieba_ from the dictionary files found in dict_dir.
+    InitJieba(dict_dir);
+
+    InitEspeak(data_dir);  // See ./piper-phonemize-lexicon.cc
+  }
+
+  template <typename Manager>
+  Impl(Manager *mgr, const std::string &tokens, const std::string &lexicon,
+       const std::string &dict_dir, const std::string &data_dir,
+       const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
+      : meta_data_(meta_data), debug_(debug) {
+    InitTokens(mgr, tokens);
+    // Must follow InitTokens(): lexicon entries are converted via token2id_.
+    InitLexicon(mgr, lexicon);
+    // dict_dir and data_dir are not read through mgr: we assume you have
+    // copied dict_dir and data_dir from assets to some path
+    InitJieba(dict_dir);
+
+    InitEspeak(data_dir);  // See ./piper-phonemize-lexicon.cc
+  }
+  // Convert text (mixed Chinese/ASCII) into a list of token-ID sequences.
+  std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
+    std::string text = ToLowerCase(_text);
+    if (debug_) {
+      SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
+    }
+    // Applied in order: ASCII ':' becomes ',' before full-width '：' -> ':'.
+    std::vector<std::pair<std::string, std::string>> replace_str_pairs = {
+        {"，", ","}, {":", ","},  {"、", ","}, {"；", ";"},   {"：", ":"},
+        {"。", "."}, {"？", "?"}, {"！", "!"}, {"\\s+", " "},
+    };
+    for (const auto &p : replace_str_pairs) {
+      std::regex re(p.first);
+      text = std::regex_replace(text, re, p.second);
+    }
+
+    if (debug_) {
+      SHERPA_ONNX_LOGE("After replacing punctuations and merging spaces:\n%s",
+                       text.c_str());
+    }
+    // Alternatives: one punctuation mark | U+4E00..U+9FFF run | ASCII run. See
+    // https://en.cppreference.com/w/cpp/regex
+    // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
+    std::string expr =
+        "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)";
+
+    auto ws = ToWideString(text);
+    std::wstring wexpr = ToWideString(expr);
+    std::wregex we(wexpr);
+
+    auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
+    auto end = std::wsregex_iterator();
+
+    std::vector<TokenIDs> ans;
+
+    for (std::wsregex_iterator i = begin; i != end; ++i) {
+      std::wsmatch match = *i;
+      std::wstring match_str = match.str();
+      auto ms = ToString(match_str);
+      uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
+
+      std::vector<std::vector<int32_t>> ids_vec;
+      // A first UTF-8 byte below 0x80 means the match starts with ASCII.
+      if (c < 0x80) {
+        if (debug_) {
+          SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
+        }
+        ids_vec = ConvertEnglishToTokenIDs(ms);
+      } else {
+        if (debug_) {
+          SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
+        }
+        ids_vec = ConvertChineseToTokenIDs(ms);
+      }
+      // Short segments (<= 4 IDs) are merged into the previous entry, if any.
+      for (const auto &ids : ids_vec) {
+        if (ids.size() > 4) {
+          ans.emplace_back(ids);
+        } else {
+          if (ans.empty()) {
+            ans.emplace_back(ids);
+          } else {
+            ans.back().tokens.back() = ids[1];
+            ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2,
+                                     ids.end());
+          }
+        }
+      }
+    }
+
+    if (debug_) {
+      for (const auto &v : ans) {
+        std::ostringstream os;
+        os << "\n";
+        std::string sep;
+        for (auto i : v.tokens) {
+          os << sep << i;
+          sep = " ";
+        }
+        os << "\n";
+        SHERPA_ONNX_LOGE("%s", os.str().c_str());
+      }
+    }
+
+    return ans;
+  }
+
+ private:
+  // Return true if `text` is exactly one of the marks listed in kMarks.
+  bool IsPunctuation(const std::string &text) const {
+    static const char *const kMarks[] = {";", ":", ",", ".", "!", "?", "—",
+                                         "…", "\"", "(", ")", "“", "”"};
+    for (const char *mark : kMarks) {
+      if (text == mark) return true;
+    }
+
+    return false;
+  }
+
+  // Return the lexicon entry for `w`. If there is none, concatenate the
+  // entries of the pieces from SplitUtf8(w); unknown pieces are logged.
+  std::vector<int32_t> ConvertWordToIds(const std::string &w) const {
+    auto whole = word2ids_.find(w);
+    if (whole != word2ids_.end()) {
+      return whole->second;
+    }
+
+    std::vector<int32_t> ans;
+    for (const auto &piece : SplitUtf8(w)) {
+      auto it = word2ids_.find(piece);
+      if (it == word2ids_.end()) {
+        SHERPA_ONNX_LOGE("Skip OOV: '%s'", piece.c_str());
+        continue;
+      }
+      ans.insert(ans.end(), it->second.begin(), it->second.end());
+    }
+    return ans;
+  }
+  // Segment `text` with jieba and map the words to chunks of token IDs.
+  std::vector<std::vector<int32_t>> ConvertChineseToTokenIDs(
+      const std::string &text) const {
+    bool is_hmm = true;
+
+    std::vector<std::string> words;
+    jieba_->Cut(text, words, is_hmm);
+    if (debug_) {
+      std::ostringstream os;
+      os << "After jieba processing:\n";
+
+      std::string sep;
+      for (const auto &w : words) {
+        os << sep << w;
+        sep = "_";
+      }
+      SHERPA_ONNX_LOGE("%s", os.str().c_str());
+    }
+
+    std::vector<std::vector<int32_t>> ans;
+    std::vector<int32_t> this_sentence;
+    int32_t max_len = meta_data_.max_token_len;
+    // Every chunk starts and ends with ID 0.
+    this_sentence.push_back(0);
+    for (const auto &w : words) {
+      auto ids = ConvertWordToIds(w);
+      if (this_sentence.size() + ids.size() > max_len - 2) {
+        this_sentence.push_back(0);
+        ans.push_back(std::move(this_sentence));
+        // Reuse the moved-from vector to start the next chunk.
+        this_sentence.push_back(0);
+      }
+
+      this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
+    }
+    // Flush the last chunk unless it holds nothing but the leading 0.
+    if (this_sentence.size() > 1) {
+      this_sentence.push_back(0);
+      ans.push_back(std::move(this_sentence));
+    }
+
+    if (debug_) {
+      for (const auto &v : ans) {
+        std::ostringstream os;
+        os << "\n";
+        std::string sep;
+        for (auto i : v) {
+          os << sep << i;
+          sep = " ";
+        }
+        os << "\n";
+        SHERPA_ONNX_LOGE("%s", os.str().c_str());
+      }
+    }
+
+    return ans;
+  }
+  // Convert ASCII text to token-ID chunks; split at .!?; or when too long.
+  std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
+      const std::string &text) const {
+    std::vector<std::string> words = SplitUtf8(text);
+    if (debug_) {
+      std::ostringstream os;
+      os << "After splitting to words: ";
+      std::string sep;
+      for (const auto &w : words) {
+        os << sep << w;
+        sep = "_";
+      }
+      SHERPA_ONNX_LOGE("%s", os.str().c_str());
+    }
+
+    std::vector<std::vector<int32_t>> ans;
+    int32_t max_len = meta_data_.max_token_len;
+    std::vector<int32_t> this_sentence;
+    // Appended after each word; at() throws if the token table lacks " ".
+    int32_t space_id = token2id_.at(" ");
+    // Every chunk starts and ends with ID 0.
+    this_sentence.push_back(0);
+
+    for (const auto &word : words) {
+      if (IsPunctuation(word)) {
+        this_sentence.push_back(token2id_.at(word));
+
+        if (this_sentence.size() > max_len - 2) {
+          // this sentence is too long, split it
+          this_sentence.push_back(0);
+          ans.push_back(std::move(this_sentence));
+
+          this_sentence.push_back(0);
+          continue;
+        }
+
+        if (word == "." || word == "!" || word == "?" || word == ";") {
+          // Note: You can add more punctuations here to split the text
+          // into sentences. We just use four here: .!?;
+          this_sentence.push_back(0);
+          ans.push_back(std::move(this_sentence));
+
+          this_sentence.push_back(0);
+        }
+      } else if (word2ids_.count(word)) {
+        const auto &ids = word2ids_.at(word);
+        if (this_sentence.size() + ids.size() + 3 > max_len - 2) {
+          this_sentence.push_back(0);
+          ans.push_back(std::move(this_sentence));
+
+          this_sentence.push_back(0);
+        }
+
+        this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
+        this_sentence.push_back(space_id);
+      } else {
+        SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());
+
+        piper::eSpeakPhonemeConfig config;
+
+        config.voice = "en-us";
+
+        std::vector<std::vector<piper::Phoneme>> phonemes;
+
+        CallPhonemizeEspeak(word, config, &phonemes);
+        // Note phonemes[i] contains a vector of unicode codepoints;
+        // we need to convert them to utf8
+
+        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
+
+        std::vector<int32_t> ids;
+        for (const auto &v : phonemes) {
+          for (const auto p : v) {
+            auto token = conv.to_bytes(p);
+            if (token2id_.count(token)) {
+              ids.push_back(token2id_.at(token));
+            } else {
+              SHERPA_ONNX_LOGE("Skip OOV token '%s' from '%s'", token.c_str(),
+                               word.c_str());
+            }
+          }
+        }
+
+        if (this_sentence.size() + ids.size() + 3 > max_len - 2) {
+          this_sentence.push_back(0);
+          ans.push_back(std::move(this_sentence));
+
+          this_sentence.push_back(0);
+        }
+
+        this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
+        this_sentence.push_back(space_id);
+      }
+    }
+
+    if (this_sentence.size() > 1) {
+      this_sentence.push_back(0);
+      ans.push_back(std::move(this_sentence));
+    }
+
+    if (debug_) {
+      for (const auto &v : ans) {
+        std::ostringstream os;
+        os << "\n";
+        std::string sep;
+        for (auto i : v) {
+          os << sep << i;
+          sep = " ";
+        }
+        os << "\n";
+        SHERPA_ONNX_LOGE("%s", os.str().c_str());
+      }
+    }
+
+    return ans;
+  }
+
+  void InitTokens(const std::string &tokens) {
+    std::ifstream is(tokens);  // NOTE: open failure is not checked here
+    InitTokens(is);
+  }
+
+  template <typename Manager>
+  void InitTokens(Manager *mgr, const std::string &tokens) {
+    auto buf = ReadFile(mgr, tokens);
+    // Parse the buffer returned by ReadFile() through a stream.
+    std::istrstream is(buf.data(), buf.size());
+    InitTokens(is);
+  }
+
+  void InitTokens(std::istream &is) {
+    token2id_ = ReadTokens(is);  // defined in ./symbol-table.cc
+  }
+
+  void InitLexicon(const std::string &lexicon) {
+    std::vector<std::string> files;
+    SplitStringToVector(lexicon, ",", false, &files);  // comma-separated list
+    for (const auto &f : files) {  // earlier files win on duplicate words
+      std::ifstream is(f);  // NOTE: open failure is not checked here
+      InitLexicon(is);
+    }
+  }
+
+  template <typename Manager>
+  void InitLexicon(Manager *mgr, const std::string &lexicon) {
+    std::vector<std::string> files;
+    SplitStringToVector(lexicon, ",", false, &files);  // comma-separated list
+    for (const auto &f : files) {  // earlier files win on duplicate words
+      auto buf = ReadFile(mgr, f);
+      // Parse the buffer returned by ReadFile() through a stream.
+      std::istrstream is(buf.data(), buf.size());
+      InitLexicon(is);
+    }
+  }
+
+  // Read "word token1 token2 ..." lines from `is` into word2ids_ (the word
+  // is lowercased first). Blank lines, duplicated words and entries for which
+  // ConvertTokensToIds() returns no IDs are skipped.
+  void InitLexicon(std::istream &is) {
+    std::string word;
+    std::vector<std::string> token_list;
+    std::string token;
+    std::string line;
+    int32_t line_num = 0;
+    int32_t num_warn = 0;
+    while (std::getline(is, line)) {
+      ++line_num;
+      std::istringstream iss(line);
+      token_list.clear();
+      // On a blank line the extraction fails and `word` would be stale.
+      if (!(iss >> word)) continue;
+      ToLowerCase(&word);
+
+      if (word2ids_.count(word)) {
+        num_warn += 1;
+        if (num_warn < 10) {
+          SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
+                           word.c_str(), line_num, line.c_str());
+        }
+        continue;
+      }
+
+      while (iss >> token) {
+        token_list.push_back(std::move(token));
+      }
+
+      std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
+      if (ids.empty()) {
+        SHERPA_ONNX_LOGE(
+            "Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
+            word.c_str(), line_num, line.c_str());
+        continue;
+      }
+      word2ids_.insert({std::move(word), std::move(ids)});
+    }
+  }
+
+  // Build jieba_ from the five dictionary files expected under dict_dir.
+  void InitJieba(const std::string &dict_dir) {
+    const std::string main_dict = dict_dir + "/jieba.dict.utf8";
+    const std::string hmm_model = dict_dir + "/hmm_model.utf8";
+    const std::string user_dict = dict_dir + "/user.dict.utf8";
+    const std::string idf_path = dict_dir + "/idf.utf8";
+    const std::string stop_words = dict_dir + "/stop_words.utf8";
+    // Check every file up front, in the same order as the constructor args.
+    for (const auto *path :
+         {&main_dict, &hmm_model, &user_dict, &idf_path, &stop_words}) {
+      AssertFileExists(*path);
+    }
+
+    jieba_ = std::make_unique<cppjieba::Jieba>(main_dict, hmm_model, user_dict,
+                                               idf_path, stop_words);
+  }
+
+ private:
+  OfflineTtsKokoroModelMetaData meta_data_;
+
+  // word to token IDs
+  std::unordered_map<std::string, std::vector<int32_t>> word2ids_;
+
+  // tokens.txt is saved in token2id_
+  std::unordered_map<std::string, int32_t> token2id_;
+
+  std::unique_ptr<cppjieba::Jieba> jieba_;
+  bool debug_ = false;
+};
+// Defined here so that std::unique_ptr<Impl> sees the complete Impl type.
+KokoroMultiLangLexicon::~KokoroMultiLangLexicon() = default;
+// Forwards to the Impl constructor that reads tokens/lexicon from files.
+KokoroMultiLangLexicon::KokoroMultiLangLexicon(
+    const std::string &tokens, const std::string &lexicon,
+    const std::string &dict_dir, const std::string &data_dir,
+    const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
+    : impl_(std::make_unique<Impl>(tokens, lexicon, dict_dir, data_dir,
+                                   meta_data, debug)) {}
+// Forwards to the Impl constructor that reads tokens/lexicon through `mgr`.
+template <typename Manager>
+KokoroMultiLangLexicon::KokoroMultiLangLexicon(
+    Manager *mgr, const std::string &tokens, const std::string &lexicon,
+    const std::string &dict_dir, const std::string &data_dir,
+    const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
+    : impl_(std::make_unique<Impl>(mgr, tokens, lexicon, dict_dir, data_dir,
+                                   meta_data, debug)) {}
+// The second argument (voice) is not used by this frontend.
+std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
+    const std::string &text, const std::string & /*unused_voice = ""*/) const {
+  return impl_->ConvertTextToTokenIds(text);
+}
+
+#if __ANDROID_API__ >= 9
+template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
+    AAssetManager *mgr, const std::string &tokens, const std::string &lexicon,
+    const std::string &dict_dir, const std::string &data_dir,
+    const OfflineTtsKokoroModelMetaData &meta_data, bool debug);
+#endif
+
+#if __OHOS__
+template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
+    NativeResourceManager *mgr, const std::string &tokens,
+    const std::string &lexicon, const std::string &dict_dir,
+    const std::string &data_dir, const OfflineTtsKokoroModelMetaData &meta_data,
+    bool debug);
+#endif
+
+}  // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h
new file mode 100644
index 00000000..db066781
--- /dev/null
+++ b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h
@@ -0,0 +1,45 @@
+// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h
+//
+// Copyright (c)  2025  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
+#define SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "sherpa-onnx/csrc/offline-tts-frontend.h"
+#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h"
+
+namespace sherpa_onnx {
+
+class KokoroMultiLangLexicon : public OfflineTtsFrontend {
+ public:
+  ~KokoroMultiLangLexicon() override;
+
+  KokoroMultiLangLexicon(const std::string &tokens, const std::string &lexicon,
+                         const std::string &dict_dir,
+                         const std::string &data_dir,
+                         const OfflineTtsKokoroModelMetaData &meta_data,
+                         bool debug);
+
+  template <typename Manager>  // explicitly instantiated in the .cc file
+  KokoroMultiLangLexicon(Manager *mgr, const std::string &tokens,
+                         const std::string &lexicon,
+                         const std::string &dict_dir,
+                         const std::string &data_dir,
+                         const OfflineTtsKokoroModelMetaData &meta_data,
+                         bool debug);
+
+  std::vector<TokenIDs> ConvertTextToTokenIds(  // `voice` is ignored
+      const std::string &text, const std::string &voice = "") const override;
+
+ private:
+  class Impl;  // defined in kokoro-multi-lang-lexicon.cc
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
diff --git a/sherpa-onnx/csrc/melo-tts-lexicon.cc b/sherpa-onnx/csrc/melo-tts-lexicon.cc
index ec729cdb..48b854f8 100644
--- a/sherpa-onnx/csrc/melo-tts-lexicon.cc
+++ b/sherpa-onnx/csrc/melo-tts-lexicon.cc
@@ -6,7 +6,9 @@
 
 #include <fstream>
 #include <regex>  // NOLINT
+#include <sstream>
 #include <strstream>
+#include <unordered_map>
 #include <utility>
 #if __ANDROID_API__ >= 9
 #include "android/asset_manager.h"
diff --git a/sherpa-onnx/csrc/melo-tts-lexicon.h b/sherpa-onnx/csrc/melo-tts-lexicon.h
index ef7dd029..96b68c7a 100644
--- a/sherpa-onnx/csrc/melo-tts-lexicon.h
+++ b/sherpa-onnx/csrc/melo-tts-lexicon.h
@@ -7,7 +7,6 @@
 
 #include <memory>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 #include "sherpa-onnx/csrc/offline-tts-frontend.h"
diff --git a/sherpa-onnx/csrc/offline-tts-frontend.h b/sherpa-onnx/csrc/offline-tts-frontend.h
index cba50e36..43c4501c 100644
--- a/sherpa-onnx/csrc/offline-tts-frontend.h
+++ b/sherpa-onnx/csrc/offline-tts-frontend.h
@@ -19,6 +19,9 @@ struct TokenIDs {
   /*implicit*/ TokenIDs(std::vector<int64_t> tokens)  // NOLINT
       : tokens{std::move(tokens)} {}
 
+  /*implicit*/ TokenIDs(const std::vector<int32_t> &tokens)  // NOLINT
+      : tokens{tokens.begin(), tokens.end()} {}  // copies, widening to int64_t
+
   TokenIDs(std::vector<int64_t> tokens,  // NOLINT
            std::vector<int64_t> tones)   // NOLINT
       : tokens{std::move(tokens)}, tones{std::move(tones)} {}
@@ -51,6 +54,9 @@ class OfflineTtsFrontend {
       const std::string &text, const std::string &voice = "") const = 0;
 };
 
+// implementation is in ./piper-phonemize-lexicon.cc
+void InitEspeak(const std::string &data_dir);
+
 }  // namespace sherpa_onnx
 
 #endif  // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h
index 4c3efbf6..510f031c 100644
--- a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h
+++ b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h
@@ -13,6 +13,7 @@
 #include "fst/extensions/far/far.h"
 #include "kaldifst/csrc/kaldi-fst-io.h"
 #include "kaldifst/csrc/text-normalizer.h"
+#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
 #include "sherpa-onnx/csrc/lexicon.h"
 #include "sherpa-onnx/csrc/macros.h"
 #include "sherpa-onnx/csrc/offline-tts-frontend.h"
@@ -314,6 +315,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
   template <typename Manager>
   void InitFrontend(Manager *mgr) {
     const auto &meta_data = model_->GetMetaData();
+
+    if (meta_data.version >= 2) {
+      // this is a multi-lingual model, we require that you pass lexicon
+      // and dict_dir
+      if (config_.model.kokoro.lexicon.empty() ||
+          config_.model.kokoro.dict_dir.empty()) {
+        SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
+        SHERPA_ONNX_LOGE(
+            "You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
+            "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
+        SHERPA_ONNX_EXIT(-1);
+      }
+
+      frontend_ = std::make_unique<KokoroMultiLangLexicon>(
+          mgr, config_.model.kokoro.tokens, config_.model.kokoro.lexicon,
+          config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir,
+          meta_data, config_.model.debug);
+
+      return;
+    }
+
     frontend_ = std::make_unique<PiperPhonemizeLexicon>(
         mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir,
         meta_data);
@@ -321,7 +343,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
 
   void InitFrontend() {
     const auto &meta_data = model_->GetMetaData();
+    if (meta_data.version >= 2) {
+      // this is a multi-lingual model, we require that you pass lexicon
+      // and dict_dir
+      if (config_.model.kokoro.lexicon.empty() ||
+          config_.model.kokoro.dict_dir.empty()) {
+        SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
+        SHERPA_ONNX_LOGE(
+            "You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
+            "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
+        SHERPA_ONNX_EXIT(-1);
+      }
 
+      frontend_ = std::make_unique<KokoroMultiLangLexicon>(
+          config_.model.kokoro.tokens, config_.model.kokoro.lexicon,
+          config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir,
+          meta_data, config_.model.debug);
+
+      return;
+    }
+
+    // this is for kokoro v0.19, which supports only English
     frontend_ = std::make_unique<PiperPhonemizeLexicon>(
         config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data);
   }
diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc
index 3eb5ad7e..59645060 100644
--- a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc
+++ b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc
@@ -8,6 +8,7 @@
 
 #include "sherpa-onnx/csrc/file-utils.h"
 #include "sherpa-onnx/csrc/macros.h"
+#include "sherpa-onnx/csrc/text-utils.h"
 
 namespace sherpa_onnx {
 
@@ -17,8 +18,16 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
                "Path to voices.bin for Kokoro models");
   po->Register("kokoro-tokens", &tokens,
                "Path to tokens.txt for Kokoro models");
+  po->Register(
+      "kokoro-lexicon", &lexicon,
+      "Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0. "
+      "You can pass multiple files, separated by ','. Example: "
+      "./lexicon-us-en.txt,./lexicon-zh.txt");
   po->Register("kokoro-data-dir", &data_dir,
                "Path to the directory containing dict for espeak-ng.");
+  po->Register("kokoro-dict-dir", &dict_dir,
+               "Path to the directory containing dict for jieba. "
+               "Used only for Kokoro >= v1.0");
   po->Register("kokoro-length-scale", &length_scale,
                "Speech speed. Larger->Slower; Smaller->faster.");
 }
@@ -44,6 +53,19 @@ bool OfflineTtsKokoroModelConfig::Validate() const {
     return false;
   }
 
+  if (!lexicon.empty()) {
+    std::vector<std::string> files;
+    SplitStringToVector(lexicon, ",", false, &files);
+    for (const auto &f : files) {
+      if (!FileExists(f)) {
+        SHERPA_ONNX_LOGE(
+            "lexicon '%s' does not exist. Please re-check --kokoro-lexicon",
+            f.c_str());
+        return false;
+      }
+    }
+  }
+
   if (data_dir.empty()) {
     SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir");
     return false;
@@ -77,6 +99,21 @@ bool OfflineTtsKokoroModelConfig::Validate() const {
     return false;
   }
 
+  if (!dict_dir.empty()) {
+    std::vector<std::string> required_files = {
+        "jieba.dict.utf8", "hmm_model.utf8",  "user.dict.utf8",
+        "idf.utf8",        "stop_words.utf8",
+    };
+
+    for (const auto &f : required_files) {
+      if (!FileExists(dict_dir + "/" + f)) {
+        SHERPA_ONNX_LOGE("'%s/%s' does not exist. Please check "
+                         "--kokoro-dict-dir", dict_dir.c_str(), f.c_str());
+        return false;
+      }
+    }
+  }
+
   return true;
 }
 
@@ -87,7 +124,9 @@ std::string OfflineTtsKokoroModelConfig::ToString() const {
   os << "model=\"" << model << "\", ";
   os << "voices=\"" << voices << "\", ";
   os << "tokens=\"" << tokens << "\", ";
+  os << "lexicon=\"" << lexicon << "\", ";
   os << "data_dir=\"" << data_dir << "\", ";
+  os << "dict_dir=\"" << dict_dir << "\", ";
   os << "length_scale=" << length_scale << ")";
 
   return os.str();
diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h
index a4a68aca..fae17927 100644
--- a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h
+++ b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h
@@ -16,8 +16,14 @@ struct OfflineTtsKokoroModelConfig {
   std::string voices;
   std::string tokens;
 
+  // Note: You can pass multiple files, separated by ",", to lexicon
+  // Example: lexicon = "./lexicon-gb-en.txt,./lexicon-zh.txt";
+  std::string lexicon;
+
   std::string data_dir;
 
+  std::string dict_dir;
+
   // speed = 1 / length_scale
   float length_scale = 1.0;
 
@@ -26,11 +32,15 @@ struct OfflineTtsKokoroModelConfig {
   OfflineTtsKokoroModelConfig(const std::string &model,
                               const std::string &voices,
                               const std::string &tokens,
-                              const std::string &data_dir, float length_scale)
+                              const std::string &lexicon,
+                              const std::string &data_dir,
+                              const std::string &dict_dir, float length_scale)
       : model(model),
         voices(voices),
         tokens(tokens),
+        lexicon(lexicon),
         data_dir(data_dir),
+        dict_dir(dict_dir),
         length_scale(length_scale) {}
 
   void Register(ParseOptions *po);
diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc
index 70ca3775..608a1ccd 100644
--- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc
+++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc
@@ -32,10 +32,9 @@
 
 namespace sherpa_onnx {
 
-static void CallPhonemizeEspeak(
-    const std::string &text,
-    piper::eSpeakPhonemeConfig &config,  // NOLINT
-    std::vector<std::vector<piper::Phoneme>> *phonemes) {
+void CallPhonemizeEspeak(const std::string &text,
+                         piper::eSpeakPhonemeConfig &config,  // NOLINT
+                         std::vector<std::vector<piper::Phoneme>> *phonemes) {
   static std::mutex espeak_mutex;
 
   std::lock_guard<std::mutex> lock(espeak_mutex);
@@ -245,7 +244,7 @@ static std::vector<int64_t> CoquiPhonemesToIds(
   return ans;
 }
 
-static void InitEspeak(const std::string &data_dir) {
+void InitEspeak(const std::string &data_dir) {
   static std::once_flag init_flag;
   std::call_once(init_flag, [data_dir]() {
     int32_t result =
diff --git a/sherpa-onnx/jni/offline-tts.cc b/sherpa-onnx/jni/offline-tts.cc
index 6af10788..e80b90a5 100644
--- a/sherpa-onnx/jni/offline-tts.cc
+++ b/sherpa-onnx/jni/offline-tts.cc
@@ -241,7 +241,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateImpl(JNIEnv *env, jobject /*obj*/,
                                                    jlong ptr, jstring text,
                                                    jint sid, jfloat speed) {
   const char *p_text = env->GetStringUTFChars(text, nullptr);
-  SHERPA_ONNX_LOGE("string is: %s", p_text);
 
   auto audio = reinterpret_cast<sherpa_onnx::OfflineTts *>(ptr)->Generate(
       p_text, sid, speed);
@@ -267,7 +266,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateWithCallbackImpl(
     JNIEnv *env, jobject /*obj*/, jlong ptr, jstring text, jint sid,
     jfloat speed, jobject callback) {
   const char *p_text = env->GetStringUTFChars(text, nullptr);
-  SHERPA_ONNX_LOGE("string is: %s", p_text);
 
   std::function<int32_t(const float *, int32_t, float)> callback_wrapper =
       [env, callback](const float *samples, int32_t n,
diff --git a/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc b/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc
index fbb24db5..d9a00ca4 100644
--- a/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc
+++ b/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc
@@ -16,13 +16,17 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
   py::class_<PyClass>(*m, "OfflineTtsKokoroModelConfig")
       .def(py::init<>())
       .def(py::init<const std::string &, const std::string &,
+                    const std::string &, const std::string &,
                     const std::string &, const std::string &, float>(),
            py::arg("model"), py::arg("voices"), py::arg("tokens"),
-           py::arg("data_dir"), py::arg("length_scale") = 1.0)
+           py::arg("lexicon") = "", py::arg("data_dir"),
+           py::arg("dict_dir") = "", py::arg("length_scale") = 1.0)
       .def_readwrite("model", &PyClass::model)
       .def_readwrite("voices", &PyClass::voices)
       .def_readwrite("tokens", &PyClass::tokens)
+      .def_readwrite("lexicon", &PyClass::lexicon)
       .def_readwrite("data_dir", &PyClass::data_dir)
+      .def_readwrite("dict_dir", &PyClass::dict_dir)
       .def_readwrite("length_scale", &PyClass::length_scale)
       .def("__str__", &PyClass::ToString)
       .def("validate", &PyClass::Validate);