Support non-streaming zipformer CTC ASR models (#2340)

This PR adds support for non-streaming Zipformer CTC ASR models across 
multiple language bindings, WebAssembly, examples, and CI workflows.

- Introduces a new OfflineZipformerCtcModelConfig in C/C++, Python, Swift, Java, Kotlin, Go, Dart, Pascal, and C# APIs
- Updates initialization, freeing, and recognition logic to include Zipformer CTC in WASM and Node.js
- Adds example scripts and CI steps for downloading, building, and running Zipformer CTC models

Model doc is available at
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html
This commit is contained in:
Fangjun Kuang
2025-07-04 15:57:07 +08:00
committed by GitHub
parent ef16455cb5
commit 3bf986d08d
71 changed files with 2121 additions and 68 deletions

View File

@@ -484,6 +484,9 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig(
recognizer_config.model_config.dolphin.model =
SHERPA_ONNX_OR(config->model_config.dolphin.model, "");
recognizer_config.model_config.zipformer_ctc.model =
SHERPA_ONNX_OR(config->model_config.zipformer_ctc.model, "");
recognizer_config.lm_config.model =
SHERPA_ONNX_OR(config->lm_config.model, "");
recognizer_config.lm_config.scale =

View File

@@ -451,6 +451,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineDolphinModelConfig {
const char *model;
} SherpaOnnxOfflineDolphinModelConfig;
// Configuration for a non-streaming Zipformer CTC ASR model.
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineZipformerCtcModelConfig {
// Path to the ONNX model file.
const char *model;
} SherpaOnnxOfflineZipformerCtcModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
SherpaOnnxOfflineTransducerModelConfig transducer;
SherpaOnnxOfflineParaformerModelConfig paraformer;
@@ -474,6 +478,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
SherpaOnnxOfflineMoonshineModelConfig moonshine;
SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr;
SherpaOnnxOfflineDolphinModelConfig dolphin;
SherpaOnnxOfflineZipformerCtcModelConfig zipformer_ctc;
} SherpaOnnxOfflineModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {

View File

@@ -252,6 +252,9 @@ OfflineRecognizer OfflineRecognizer::Create(
c.model_config.dolphin.model = config.model_config.dolphin.model.c_str();
c.model_config.zipformer_ctc.model =
config.model_config.zipformer_ctc.model.c_str();
c.lm_config.model = config.lm_config.model.c_str();
c.lm_config.scale = config.lm_config.scale;

View File

@@ -241,6 +241,10 @@ struct SHERPA_ONNX_API OfflineDolphinModelConfig {
std::string model;
};
// Configuration for a non-streaming Zipformer CTC ASR model.
struct SHERPA_ONNX_API OfflineZipformerCtcModelConfig {
// Path to the ONNX model file. OfflineRecognizer::Create() hands it to the
// C API via c_str().
std::string model;
};
struct SHERPA_ONNX_API OfflineMoonshineModelConfig {
std::string preprocessor;
std::string encoder;
@@ -267,6 +271,7 @@ struct SHERPA_ONNX_API OfflineModelConfig {
OfflineMoonshineModelConfig moonshine;
OfflineFireRedAsrModelConfig fire_red_asr;
OfflineDolphinModelConfig dolphin;
OfflineZipformerCtcModelConfig zipformer_ctc;
};
struct SHERPA_ONNX_API OfflineLMConfig {

View File

@@ -113,6 +113,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
const OfflineModelConfig &config) {
if (!config.dolphin.model.empty()) {
return std::make_unique<OfflineDolphinModel>(config);
} else if (!config.nemo_ctc.model.empty()) {
return std::make_unique<OfflineNemoEncDecCtcModel>(config);
} else if (!config.tdnn.model.empty()) {
return std::make_unique<OfflineTdnnCtcModel>(config);
} else if (!config.zipformer_ctc.model.empty()) {
return std::make_unique<OfflineZipformerCtcModel>(config);
} else if (!config.wenet_ctc.model.empty()) {
return std::make_unique<OfflineWenetCtcModel>(config);
} else if (!config.telespeech_ctc.empty()) {
return std::make_unique<OfflineTeleSpeechCtcModel>(config);
}
// TODO(fangjun): Refactor it. We don't need to use model_type here
@@ -167,6 +177,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
Manager *mgr, const OfflineModelConfig &config) {
if (!config.dolphin.model.empty()) {
return std::make_unique<OfflineDolphinModel>(mgr, config);
} else if (!config.nemo_ctc.model.empty()) {
return std::make_unique<OfflineNemoEncDecCtcModel>(mgr, config);
} else if (!config.tdnn.model.empty()) {
return std::make_unique<OfflineTdnnCtcModel>(mgr, config);
} else if (!config.zipformer_ctc.model.empty()) {
return std::make_unique<OfflineZipformerCtcModel>(mgr, config);
} else if (!config.wenet_ctc.model.empty()) {
return std::make_unique<OfflineWenetCtcModel>(mgr, config);
} else if (!config.telespeech_ctc.empty()) {
return std::make_unique<OfflineTeleSpeechCtcModel>(mgr, config);
}
// TODO(fangjun): Refactor it. We don't need to use model_type here

View File

@@ -33,6 +33,7 @@ java_files += OfflineWhisperModelConfig.java
java_files += OfflineFireRedAsrModelConfig.java
java_files += OfflineMoonshineModelConfig.java
java_files += OfflineNemoEncDecCtcModelConfig.java
java_files += OfflineZipformerCtcModelConfig.java
java_files += OfflineSenseVoiceModelConfig.java
java_files += OfflineDolphinModelConfig.java
java_files += OfflineModelConfig.java

View File

@@ -11,6 +11,7 @@ public class OfflineModelConfig {
private final OfflineNemoEncDecCtcModelConfig nemo;
private final OfflineSenseVoiceModelConfig senseVoice;
private final OfflineDolphinModelConfig dolphin;
private final OfflineZipformerCtcModelConfig zipformerCtc;
private final String teleSpeech;
private final String tokens;
private final int numThreads;
@@ -28,6 +29,7 @@ public class OfflineModelConfig {
this.fireRedAsr = builder.fireRedAsr;
this.moonshine = builder.moonshine;
this.nemo = builder.nemo;
this.zipformerCtc = builder.zipformerCtc;
this.senseVoice = builder.senseVoice;
this.dolphin = builder.dolphin;
this.teleSpeech = builder.teleSpeech;
@@ -52,7 +54,7 @@ public class OfflineModelConfig {
return transducer;
}
public OfflineWhisperModelConfig getZipformer2Ctc() {
public OfflineWhisperModelConfig getWhisper() {
return whisper;
}
@@ -68,6 +70,14 @@ public class OfflineModelConfig {
return dolphin;
}
public OfflineNemoEncDecCtcModelConfig getNemo() {
return nemo;
}
public OfflineZipformerCtcModelConfig getZipformerCtc() {
return zipformerCtc;
}
public String getTokens() {
return tokens;
}
@@ -109,6 +119,7 @@ public class OfflineModelConfig {
private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build();
private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build();
private OfflineDolphinModelConfig dolphin = OfflineDolphinModelConfig.builder().build();
private OfflineZipformerCtcModelConfig zipformerCtc = OfflineZipformerCtcModelConfig.builder().build();
private String teleSpeech = "";
private String tokens = "";
private int numThreads = 1;
@@ -142,6 +153,11 @@ public class OfflineModelConfig {
return this;
}
public Builder setZipformerCtc(OfflineZipformerCtcModelConfig zipformerCtc) {
this.zipformerCtc = zipformerCtc;
return this;
}
public Builder setTeleSpeech(String teleSpeech) {
this.teleSpeech = teleSpeech;
return this;

View File

@@ -0,0 +1,32 @@
// Copyright 2025 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
/**
 * Configuration for a non-streaming Zipformer CTC model.
 *
 * <p>Instances are immutable and are obtained through {@link Builder}.
 */
public class OfflineZipformerCtcModelConfig {
    // Looked up by name ("model", java.lang.String) from the JNI layer, so the
    // field name and type must stay as they are.
    private final String model;

    private OfflineZipformerCtcModelConfig(Builder source) {
        model = source.model;
    }

    /** Returns the model path stored in this config. */
    public String getModel() {
        return this.model;
    }

    /** Creates a builder whose model path is initially the empty string. */
    public static Builder builder() {
        return new Builder();
    }

    /** Mutable builder for {@link OfflineZipformerCtcModelConfig}. */
    public static class Builder {
        private String model = "";

        /** Records the model path and returns this builder for chaining. */
        public Builder setModel(String model) {
            this.model = model;
            return this;
        }

        /** Builds a config from the values collected so far. */
        public OfflineZipformerCtcModelConfig build() {
            return new OfflineZipformerCtcModelConfig(this);
        }
    }
}

View File

@@ -269,6 +269,21 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
ans.model_config.nemo_ctc.model = p;
env->ReleaseStringUTFChars(s, p);
// zipformer ctc
fid =
env->GetFieldID(model_config_cls, "zipformerCtc",
"Lcom/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig;");
jobject zipformer_ctc_config = env->GetObjectField(model_config, fid);
jclass zipformer_ctc_config_cls = env->GetObjectClass(zipformer_ctc_config);
fid =
env->GetFieldID(zipformer_ctc_config_cls, "model", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(zipformer_ctc_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model_config.zipformer_ctc.model = p;
env->ReleaseStringUTFChars(s, p);
// dolphin
fid = env->GetFieldID(model_config_cls, "dolphin",
"Lcom/k2fsa/sherpa/onnx/OfflineDolphinModelConfig;");

View File

@@ -29,6 +29,10 @@ data class OfflineDolphinModelConfig(
var model: String = "",
)
/**
 * Configuration for a non-streaming Zipformer CTC model.
 *
 * @property model Path to the ONNX model file; defaults to an empty string.
 */
data class OfflineZipformerCtcModelConfig(
var model: String = "",
)
data class OfflineWhisperModelConfig(
var encoder: String = "",
var decoder: String = "",
@@ -64,6 +68,7 @@ data class OfflineModelConfig(
var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(),
var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(),
var dolphin: OfflineDolphinModelConfig = OfflineDolphinModelConfig(),
var zipformerCtc: OfflineZipformerCtcModelConfig = OfflineZipformerCtcModelConfig(),
var teleSpeech: String = "",
var numThreads: Int = 1,
var debug: Boolean = false,
@@ -559,6 +564,16 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
modelType = "nemo_transducer",
)
}
31 -> {
val modelDir = "sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03"
return OfflineModelConfig(
zipformerCtc = OfflineZipformerCtcModelConfig(
model = "$modelDir/model.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
)
}
}
return null
}

View File

@@ -412,6 +412,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
model = "$modelDir/model.onnx",
),
tokens = "$modelDir/tokens.txt",
modelType = "zipformer2",
)
}
@@ -422,6 +423,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
model = "$modelDir/model.fp16.onnx",
),
tokens = "$modelDir/tokens.txt",
modelType = "zipformer2",
)
}

View File

@@ -284,6 +284,11 @@ type
function ToString: AnsiString;
end;
{ Configuration for a non-streaming Zipformer CTC model. }
TSherpaOnnxOfflineZipformerCtcModelConfig = record
{ Path to the ONNX model file. }
Model: AnsiString;
{ Returns a printable description of the form
  'TSherpaOnnxOfflineZipformerCtcModelConfig(Model := ...)'. }
function ToString: AnsiString;
end;
TSherpaOnnxOfflineWhisperModelConfig = record
Encoder: AnsiString;
Decoder: AnsiString;
@@ -346,6 +351,7 @@ type
Moonshine: TSherpaOnnxOfflineMoonshineModelConfig;
FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig;
Dolphin: TSherpaOnnxOfflineDolphinModelConfig;
ZipformerCtc: TSherpaOnnxOfflineZipformerCtcModelConfig;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
function ToString: AnsiString;
end;
@@ -726,6 +732,9 @@ type
SherpaOnnxOfflineDolphinModelConfig = record
Model: PAnsiChar;
end;
{ Mirrors the C API struct of the same name. }
SherpaOnnxOfflineZipformerCtcModelConfig = record
{ Model path as a C string; filled from the AnsiString field of
  TSherpaOnnxOfflineZipformerCtcModelConfig via a PAnsiChar cast. }
Model: PAnsiChar;
end;
SherpaOnnxOfflineWhisperModelConfig = record
Encoder: PAnsiChar;
Decoder: PAnsiChar;
@@ -773,6 +782,7 @@ type
Moonshine: SherpaOnnxOfflineMoonshineModelConfig;
FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig;
Dolphin: SherpaOnnxOfflineDolphinModelConfig;
ZipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig;
end;
SherpaOnnxOfflineRecognizerConfig = record
@@ -1536,6 +1546,12 @@ begin
[Self.Model]);
end;
{ Formats this config as
  'TSherpaOnnxOfflineZipformerCtcModelConfig(Model := <Model>)'. }
function TSherpaOnnxOfflineZipformerCtcModelConfig.ToString: AnsiString;
begin
Result := Format('TSherpaOnnxOfflineZipformerCtcModelConfig(Model := %s)',
[Self.Model]);
end;
function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString;
begin
Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' +
@@ -1610,14 +1626,15 @@ begin
'SenseVoice := %s, ' +
'Moonshine := %s, ' +
'FireRedAsr := %s, ' +
'Dolphin := %s' +
'Dolphin := %s, ' +
'ZipformerCtc := %s' +
')',
[Self.Transducer.ToString, Self.Paraformer.ToString,
Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString,
Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider,
Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString,
Self.FireRedAsr.ToString, Self.Dolphin.ToString
Self.FireRedAsr.ToString, Self.Dolphin.ToString, Self.ZipformerCtc.ToString
]);
end;
@@ -1688,6 +1705,7 @@ begin
C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder);
C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model);
C.ModelConfig.ZipformerCtc.Model := PAnsiChar(Config.ModelConfig.ZipformerCtc.Model);
C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model);
C.LMConfig.Scale := Config.LMConfig.Scale;

View File

@@ -527,6 +527,87 @@ class OfflineRecognizer(object):
self.config = recognizer_config
return self
@classmethod
def from_zipformer_ctc(
cls,
model: str,
tokens: str,
num_threads: int = 1,
sample_rate: int = 16000,
feature_dim: int = 80,
decoding_method: str = "greedy_search",
debug: bool = False,
provider: str = "cpu",
rule_fsts: str = "",
rule_fars: str = "",
hr_dict_dir: str = "",
hr_rule_fsts: str = "",
hr_lexicon: str = "",
):
"""
Create a recognizer from a non-streaming Zipformer CTC model.

Please refer to
`<https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/index.html>`_
to download pre-trained models for different languages, e.g., Chinese,
English, etc.
Args:
model:
Path to ``model.onnx``.
tokens:
Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two
columns::
symbol integer_id
num_threads:
Number of threads for neural network computation.
sample_rate:
Sample rate of the training data used to train the model.
feature_dim:
Dimension of the feature used to train the model.
decoding_method:
Valid values are greedy_search.
debug:
True to show debug messages.
provider:
onnxruntime execution providers. Valid values are: cpu, cuda, coreml.
rule_fsts:
If not empty, it specifies fsts for inverse text normalization.
If there are multiple fsts, they are separated by a comma.
rule_fars:
If not empty, it specifies fst archives for inverse text normalization.
If there are multiple archives, they are separated by a comma.
hr_dict_dir:
Passed to ``HomophoneReplacerConfig`` as ``dict_dir``.
hr_rule_fsts:
Passed to ``HomophoneReplacerConfig`` as ``rule_fsts``.
hr_lexicon:
Passed to ``HomophoneReplacerConfig`` as ``lexicon``.
Returns:
An instance of this class whose ``recognizer`` and ``config``
attributes have been set from the arguments above.
"""
# Allocate the instance directly; ``__init__`` is not invoked here.
self = cls.__new__(cls)
model_config = OfflineModelConfig(
zipformer_ctc=OfflineZipformerCtcModelConfig(model=model),
tokens=tokens,
num_threads=num_threads,
debug=debug,
provider=provider,
)
feat_config = FeatureExtractorConfig(
sampling_rate=sample_rate,
feature_dim=feature_dim,
)
recognizer_config = OfflineRecognizerConfig(
feat_config=feat_config,
model_config=model_config,
decoding_method=decoding_method,
rule_fsts=rule_fsts,
rule_fars=rule_fars,
hr=HomophoneReplacerConfig(
dict_dir=hr_dict_dir,
lexicon=hr_lexicon,
rule_fsts=hr_rule_fsts,
),
)
self.recognizer = _Recognizer(recognizer_config)
self.config = recognizer_config
return self
@classmethod
def from_nemo_ctc(
cls,