Support non-streaming zipformer CTC ASR models (#2340)
This PR adds support for non-streaming Zipformer CTC ASR models across multiple language bindings, WebAssembly, examples, and CI workflows.

- Introduces a new OfflineZipformerCtcModelConfig in the C/C++, Python, Swift, Java, Kotlin, Go, Dart, Pascal, and C# APIs
- Updates initialization, freeing, and recognition logic to include Zipformer CTC in WASM and Node.js
- Adds example scripts and CI steps for downloading, building, and running Zipformer CTC models

The model documentation is available at https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html
This commit is contained in:
@@ -484,6 +484,9 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig(
|
||||
recognizer_config.model_config.dolphin.model =
|
||||
SHERPA_ONNX_OR(config->model_config.dolphin.model, "");
|
||||
|
||||
recognizer_config.model_config.zipformer_ctc.model =
|
||||
SHERPA_ONNX_OR(config->model_config.zipformer_ctc.model, "");
|
||||
|
||||
recognizer_config.lm_config.model =
|
||||
SHERPA_ONNX_OR(config->lm_config.model, "");
|
||||
recognizer_config.lm_config.scale =
|
||||
|
||||
@@ -451,6 +451,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineDolphinModelConfig {
|
||||
const char *model;
|
||||
} SherpaOnnxOfflineDolphinModelConfig;
|
||||
|
||||
// Configuration for a non-streaming (offline) Zipformer CTC ASR model.
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineZipformerCtcModelConfig {
|
||||
// Path to the Zipformer CTC model file. GetOfflineRecognizerConfig reads this
// field through SHERPA_ONNX_OR with "" as the fallback value.
const char *model;
|
||||
} SherpaOnnxOfflineZipformerCtcModelConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
|
||||
SherpaOnnxOfflineTransducerModelConfig transducer;
|
||||
SherpaOnnxOfflineParaformerModelConfig paraformer;
|
||||
@@ -474,6 +478,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
|
||||
SherpaOnnxOfflineMoonshineModelConfig moonshine;
|
||||
SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr;
|
||||
SherpaOnnxOfflineDolphinModelConfig dolphin;
|
||||
SherpaOnnxOfflineZipformerCtcModelConfig zipformer_ctc;
|
||||
} SherpaOnnxOfflineModelConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {
|
||||
|
||||
@@ -252,6 +252,9 @@ OfflineRecognizer OfflineRecognizer::Create(
|
||||
|
||||
c.model_config.dolphin.model = config.model_config.dolphin.model.c_str();
|
||||
|
||||
c.model_config.zipformer_ctc.model =
|
||||
config.model_config.zipformer_ctc.model.c_str();
|
||||
|
||||
c.lm_config.model = config.lm_config.model.c_str();
|
||||
c.lm_config.scale = config.lm_config.scale;
|
||||
|
||||
|
||||
@@ -241,6 +241,10 @@ struct SHERPA_ONNX_API OfflineDolphinModelConfig {
|
||||
std::string model;
|
||||
};
|
||||
|
||||
// Configuration for a non-streaming (offline) Zipformer CTC ASR model.
struct SHERPA_ONNX_API OfflineZipformerCtcModelConfig {
|
||||
// Path to the model file. OfflineRecognizer::Create hands model.c_str() to
// the C config, so this string must stay alive while that C config is in use.
std::string model;
|
||||
};
|
||||
|
||||
struct SHERPA_ONNX_API OfflineMoonshineModelConfig {
|
||||
std::string preprocessor;
|
||||
std::string encoder;
|
||||
@@ -267,6 +271,7 @@ struct SHERPA_ONNX_API OfflineModelConfig {
|
||||
OfflineMoonshineModelConfig moonshine;
|
||||
OfflineFireRedAsrModelConfig fire_red_asr;
|
||||
OfflineDolphinModelConfig dolphin;
|
||||
OfflineZipformerCtcModelConfig zipformer_ctc;
|
||||
};
|
||||
|
||||
struct SHERPA_ONNX_API OfflineLMConfig {
|
||||
|
||||
@@ -113,6 +113,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
|
||||
const OfflineModelConfig &config) {
|
||||
if (!config.dolphin.model.empty()) {
|
||||
return std::make_unique<OfflineDolphinModel>(config);
|
||||
} else if (!config.nemo_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineNemoEncDecCtcModel>(config);
|
||||
} else if (!config.tdnn.model.empty()) {
|
||||
return std::make_unique<OfflineTdnnCtcModel>(config);
|
||||
} else if (!config.zipformer_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineZipformerCtcModel>(config);
|
||||
} else if (!config.wenet_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineWenetCtcModel>(config);
|
||||
} else if (!config.telespeech_ctc.empty()) {
|
||||
return std::make_unique<OfflineTeleSpeechCtcModel>(config);
|
||||
}
|
||||
|
||||
// TODO(fangjun): Refactor it. We don't need to use model_type here
|
||||
@@ -167,6 +177,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
|
||||
Manager *mgr, const OfflineModelConfig &config) {
|
||||
if (!config.dolphin.model.empty()) {
|
||||
return std::make_unique<OfflineDolphinModel>(mgr, config);
|
||||
} else if (!config.nemo_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineNemoEncDecCtcModel>(mgr, config);
|
||||
} else if (!config.tdnn.model.empty()) {
|
||||
return std::make_unique<OfflineTdnnCtcModel>(mgr, config);
|
||||
} else if (!config.zipformer_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineZipformerCtcModel>(mgr, config);
|
||||
} else if (!config.wenet_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineWenetCtcModel>(mgr, config);
|
||||
} else if (!config.telespeech_ctc.empty()) {
|
||||
return std::make_unique<OfflineTeleSpeechCtcModel>(mgr, config);
|
||||
}
|
||||
|
||||
// TODO(fangjun): Refactor it. We don't need to use model_type here
|
||||
|
||||
@@ -33,6 +33,7 @@ java_files += OfflineWhisperModelConfig.java
|
||||
java_files += OfflineFireRedAsrModelConfig.java
|
||||
java_files += OfflineMoonshineModelConfig.java
|
||||
java_files += OfflineNemoEncDecCtcModelConfig.java
|
||||
java_files += OfflineZipformerCtcModelConfig.java
|
||||
java_files += OfflineSenseVoiceModelConfig.java
|
||||
java_files += OfflineDolphinModelConfig.java
|
||||
java_files += OfflineModelConfig.java
|
||||
|
||||
@@ -11,6 +11,7 @@ public class OfflineModelConfig {
|
||||
private final OfflineNemoEncDecCtcModelConfig nemo;
|
||||
private final OfflineSenseVoiceModelConfig senseVoice;
|
||||
private final OfflineDolphinModelConfig dolphin;
|
||||
private final OfflineZipformerCtcModelConfig zipformerCtc;
|
||||
private final String teleSpeech;
|
||||
private final String tokens;
|
||||
private final int numThreads;
|
||||
@@ -28,6 +29,7 @@ public class OfflineModelConfig {
|
||||
this.fireRedAsr = builder.fireRedAsr;
|
||||
this.moonshine = builder.moonshine;
|
||||
this.nemo = builder.nemo;
|
||||
this.zipformerCtc = builder.zipformerCtc;
|
||||
this.senseVoice = builder.senseVoice;
|
||||
this.dolphin = builder.dolphin;
|
||||
this.teleSpeech = builder.teleSpeech;
|
||||
@@ -52,7 +54,7 @@ public class OfflineModelConfig {
|
||||
return transducer;
|
||||
}
|
||||
|
||||
public OfflineWhisperModelConfig getZipformer2Ctc() {
|
||||
public OfflineWhisperModelConfig getWhisper() {
|
||||
return whisper;
|
||||
}
|
||||
|
||||
@@ -68,6 +70,14 @@ public class OfflineModelConfig {
|
||||
return dolphin;
|
||||
}
|
||||
|
||||
public OfflineNemoEncDecCtcModelConfig getNemo() {
|
||||
return nemo;
|
||||
}
|
||||
|
||||
public OfflineZipformerCtcModelConfig getZipformerCtc() {
|
||||
return zipformerCtc;
|
||||
}
|
||||
|
||||
public String getTokens() {
|
||||
return tokens;
|
||||
}
|
||||
@@ -109,6 +119,7 @@ public class OfflineModelConfig {
|
||||
private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build();
|
||||
private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build();
|
||||
private OfflineDolphinModelConfig dolphin = OfflineDolphinModelConfig.builder().build();
|
||||
private OfflineZipformerCtcModelConfig zipformerCtc = OfflineZipformerCtcModelConfig.builder().build();
|
||||
private String teleSpeech = "";
|
||||
private String tokens = "";
|
||||
private int numThreads = 1;
|
||||
@@ -142,6 +153,11 @@ public class OfflineModelConfig {
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setZipformerCtc(OfflineZipformerCtcModelConfig zipformerCtc) {
|
||||
this.zipformerCtc = zipformerCtc;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setTeleSpeech(String teleSpeech) {
|
||||
this.teleSpeech = teleSpeech;
|
||||
return this;
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
// Copyright 2025 Xiaomi Corporation
|
||||
|
||||
package com.k2fsa.sherpa.onnx;
|
||||
|
||||
/**
 * Immutable configuration for a non-streaming Zipformer CTC model.
 *
 * <p>Instances are created through {@link #builder()}; the constructor is private.
 */
public class OfflineZipformerCtcModelConfig {
|
||||
// Path to the model file. The JNI layer looks this field up by the literal
// name "model" with signature Ljava/lang/String;, so renaming it requires a
// matching change in the native code.
private final String model;
|
||||
|
||||
// Copies the value collected by the builder; only Builder.build() calls this.
private OfflineZipformerCtcModelConfig(Builder builder) {
|
||||
this.model = builder.model;
|
||||
}
|
||||
|
||||
/** Returns a new {@link Builder} whose model path defaults to the empty string. */
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
/** Returns the configured model path (the empty string if none was set). */
public String getModel() {
|
||||
return model;
|
||||
}
|
||||
|
||||
/** Fluent builder for {@link OfflineZipformerCtcModelConfig}. */
public static class Builder {
|
||||
// Defaults to "" rather than null.
private String model = "";
|
||||
|
||||
/** Creates the configuration from the values set so far. */
public OfflineZipformerCtcModelConfig build() {
|
||||
return new OfflineZipformerCtcModelConfig(this);
|
||||
}
|
||||
|
||||
/**
 * Sets the model path.
 *
 * @param model path to the Zipformer CTC model file
 * @return this builder, to allow call chaining
 */
public Builder setModel(String model) {
|
||||
this.model = model;
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -269,6 +269,21 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
|
||||
ans.model_config.nemo_ctc.model = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
// zipformer ctc
|
||||
fid =
|
||||
env->GetFieldID(model_config_cls, "zipformerCtc",
|
||||
"Lcom/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig;");
|
||||
jobject zipformer_ctc_config = env->GetObjectField(model_config, fid);
|
||||
jclass zipformer_ctc_config_cls = env->GetObjectClass(zipformer_ctc_config);
|
||||
|
||||
fid =
|
||||
env->GetFieldID(zipformer_ctc_config_cls, "model", "Ljava/lang/String;");
|
||||
|
||||
s = (jstring)env->GetObjectField(zipformer_ctc_config, fid);
|
||||
p = env->GetStringUTFChars(s, nullptr);
|
||||
ans.model_config.zipformer_ctc.model = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
// dolphin
|
||||
fid = env->GetFieldID(model_config_cls, "dolphin",
|
||||
"Lcom/k2fsa/sherpa/onnx/OfflineDolphinModelConfig;");
|
||||
|
||||
@@ -29,6 +29,10 @@ data class OfflineDolphinModelConfig(
|
||||
var model: String = "",
|
||||
)
|
||||
|
||||
/**
 * Configuration for a non-streaming Zipformer CTC model.
 *
 * @property model path to the model file; defaults to the empty string.
 */
data class OfflineZipformerCtcModelConfig(
|
||||
var model: String = "",
|
||||
)
|
||||
|
||||
data class OfflineWhisperModelConfig(
|
||||
var encoder: String = "",
|
||||
var decoder: String = "",
|
||||
@@ -64,6 +68,7 @@ data class OfflineModelConfig(
|
||||
var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(),
|
||||
var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(),
|
||||
var dolphin: OfflineDolphinModelConfig = OfflineDolphinModelConfig(),
|
||||
var zipformerCtc: OfflineZipformerCtcModelConfig = OfflineZipformerCtcModelConfig(),
|
||||
var teleSpeech: String = "",
|
||||
var numThreads: Int = 1,
|
||||
var debug: Boolean = false,
|
||||
@@ -559,6 +564,16 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
|
||||
modelType = "nemo_transducer",
|
||||
)
|
||||
}
|
||||
|
||||
31 -> {
|
||||
val modelDir = "sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03"
|
||||
return OfflineModelConfig(
|
||||
zipformerCtc = OfflineZipformerCtcModelConfig(
|
||||
model = "$modelDir/model.int8.onnx",
|
||||
),
|
||||
tokens = "$modelDir/tokens.txt",
|
||||
)
|
||||
}
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
@@ -412,6 +412,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
|
||||
model = "$modelDir/model.onnx",
|
||||
),
|
||||
tokens = "$modelDir/tokens.txt",
|
||||
modelType = "zipformer2",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -422,6 +423,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
|
||||
model = "$modelDir/model.fp16.onnx",
|
||||
),
|
||||
tokens = "$modelDir/tokens.txt",
|
||||
modelType = "zipformer2",
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -284,6 +284,11 @@ type
|
||||
function ToString: AnsiString;
|
||||
end;
|
||||
|
||||
// Configuration for a non-streaming Zipformer CTC model.
TSherpaOnnxOfflineZipformerCtcModelConfig = record
|
||||
// Path to the model file; cast to PAnsiChar when the C config is filled in.
Model: AnsiString;
|
||||
// Returns a textual description of this record.
function ToString: AnsiString;
|
||||
end;
|
||||
|
||||
TSherpaOnnxOfflineWhisperModelConfig = record
|
||||
Encoder: AnsiString;
|
||||
Decoder: AnsiString;
|
||||
@@ -346,6 +351,7 @@ type
|
||||
Moonshine: TSherpaOnnxOfflineMoonshineModelConfig;
|
||||
FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig;
|
||||
Dolphin: TSherpaOnnxOfflineDolphinModelConfig;
|
||||
ZipformerCtc: TSherpaOnnxOfflineZipformerCtcModelConfig;
|
||||
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
|
||||
function ToString: AnsiString;
|
||||
end;
|
||||
@@ -726,6 +732,9 @@ type
|
||||
SherpaOnnxOfflineDolphinModelConfig = record
|
||||
Model: PAnsiChar;
|
||||
end;
|
||||
// Record with the same name and single string-pointer field as the C struct
// SherpaOnnxOfflineZipformerCtcModelConfig; filled from the AnsiString-based
// TSherpaOnnxOfflineZipformerCtcModelConfig via a PAnsiChar cast.
SherpaOnnxOfflineZipformerCtcModelConfig = record
|
||||
Model: PAnsiChar;
|
||||
end;
|
||||
SherpaOnnxOfflineWhisperModelConfig = record
|
||||
Encoder: PAnsiChar;
|
||||
Decoder: PAnsiChar;
|
||||
@@ -773,6 +782,7 @@ type
|
||||
Moonshine: SherpaOnnxOfflineMoonshineModelConfig;
|
||||
FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig;
|
||||
Dolphin: SherpaOnnxOfflineDolphinModelConfig;
|
||||
ZipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig;
|
||||
end;
|
||||
|
||||
SherpaOnnxOfflineRecognizerConfig = record
|
||||
@@ -1536,6 +1546,12 @@ begin
|
||||
[Self.Model]);
|
||||
end;
|
||||
|
||||
// Returns the record formatted as
// 'TSherpaOnnxOfflineZipformerCtcModelConfig(Model := <value of Model>)'.
function TSherpaOnnxOfflineZipformerCtcModelConfig.ToString: AnsiString;
|
||||
begin
|
||||
Result := Format('TSherpaOnnxOfflineZipformerCtcModelConfig(Model := %s)',
|
||||
[Self.Model]);
|
||||
end;
|
||||
|
||||
function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString;
|
||||
begin
|
||||
Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' +
|
||||
@@ -1610,14 +1626,15 @@ begin
|
||||
'SenseVoice := %s, ' +
|
||||
'Moonshine := %s, ' +
|
||||
'FireRedAsr := %s, ' +
|
||||
'Dolphin := %s' +
|
||||
'Dolphin := %s, ' +
|
||||
'ZipformerCtc := %s' +
|
||||
')',
|
||||
[Self.Transducer.ToString, Self.Paraformer.ToString,
|
||||
Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString,
|
||||
Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider,
|
||||
Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
|
||||
Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString,
|
||||
Self.FireRedAsr.ToString, Self.Dolphin.ToString
|
||||
Self.FireRedAsr.ToString, Self.Dolphin.ToString, Self.ZipformerCtc.ToString
|
||||
]);
|
||||
end;
|
||||
|
||||
@@ -1688,6 +1705,7 @@ begin
|
||||
C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder);
|
||||
|
||||
C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model);
|
||||
C.ModelConfig.ZipformerCtc.Model := PAnsiChar(Config.ModelConfig.ZipformerCtc.Model);
|
||||
|
||||
C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model);
|
||||
C.LMConfig.Scale := Config.LMConfig.Scale;
|
||||
|
||||
@@ -527,6 +527,87 @@ class OfflineRecognizer(object):
|
||||
self.config = recognizer_config
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def from_zipformer_ctc(
|
||||
cls,
|
||||
model: str,
|
||||
tokens: str,
|
||||
num_threads: int = 1,
|
||||
sample_rate: int = 16000,
|
||||
feature_dim: int = 80,
|
||||
decoding_method: str = "greedy_search",
|
||||
debug: bool = False,
|
||||
provider: str = "cpu",
|
||||
rule_fsts: str = "",
|
||||
rule_fars: str = "",
|
||||
hr_dict_dir: str = "",
|
||||
hr_rule_fsts: str = "",
|
||||
hr_lexicon: str = "",
|
||||
):
|
||||
"""
|
||||
Please refer to
|
||||
`<https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/index.html>`_
|
||||
to download pre-trained models for different languages, e.g., Chinese,
|
||||
English, etc.
|
||||
|
||||
Args:
|
||||
model:
|
||||
Path to ``model.onnx``.
|
||||
tokens:
|
||||
Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two
|
||||
columns::
|
||||
|
||||
symbol integer_id
|
||||
|
||||
num_threads:
|
||||
Number of threads for neural network computation.
|
||||
sample_rate:
|
||||
Sample rate of the training data used to train the model.
|
||||
feature_dim:
|
||||
Dimension of the feature used to train the model.
|
||||
decoding_method:
|
||||
Valid values are greedy_search.
|
||||
debug:
|
||||
True to show debug messages.
|
||||
provider:
|
||||
onnxruntime execution providers. Valid values are: cpu, cuda, coreml.
|
||||
rule_fsts:
|
||||
If not empty, it specifies fsts for inverse text normalization.
|
||||
If there are multiple fsts, they are separated by a comma.
|
||||
rule_fars:
|
||||
If not empty, it specifies fst archives for inverse text normalization.
|
||||
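If there are multiple archives, they are separated by a comma.
|
||||
hr_dict_dir:
|
||||
Passed to ``HomophoneReplacerConfig`` as ``dict_dir``.
|
||||
hr_rule_fsts:
|
||||
Passed to ``HomophoneReplacerConfig`` as ``rule_fsts``.
|
||||
hr_lexicon:
|
||||
Passed to ``HomophoneReplacerConfig`` as ``lexicon``.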
|
||||
"""
|
||||
self = cls.__new__(cls)
|
||||
model_config = OfflineModelConfig(
|
||||
zipformer_ctc=OfflineZipformerCtcModelConfig(model=model),
|
||||
tokens=tokens,
|
||||
num_threads=num_threads,
|
||||
debug=debug,
|
||||
provider=provider,
|
||||
)
|
||||
|
||||
feat_config = FeatureExtractorConfig(
|
||||
sampling_rate=sample_rate,
|
||||
feature_dim=feature_dim,
|
||||
)
|
||||
|
||||
recognizer_config = OfflineRecognizerConfig(
|
||||
feat_config=feat_config,
|
||||
model_config=model_config,
|
||||
decoding_method=decoding_method,
|
||||
rule_fsts=rule_fsts,
|
||||
rule_fars=rule_fars,
|
||||
hr=HomophoneReplacerConfig(
|
||||
dict_dir=hr_dict_dir,
|
||||
lexicon=hr_lexicon,
|
||||
rule_fsts=hr_rule_fsts,
|
||||
),
|
||||
)
|
||||
self.recognizer = _Recognizer(recognizer_config)
|
||||
self.config = recognizer_config
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def from_nemo_ctc(
|
||||
cls,
|
||||
|
||||
Reference in New Issue
Block a user