From 88c65781985c38ec0ebcdeaa11e9329dd6dbf629 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 8 May 2026 11:13:06 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: dysata/Wav2Vec2-Ru-Child Source: Original Platform --- .gitattributes | 35 +++++++++++ README.md | 130 +++++++++++++++++++++++++++++++++++++++ added_tokens.json | 4 ++ config.json | 116 ++++++++++++++++++++++++++++++++++ model.safetensors | 3 + preprocessor_config.json | 10 +++ special_tokens_map.json | 6 ++ tokenizer_config.json | 49 +++++++++++++++ vocab.json | 39 ++++++++++++ 9 files changed, 392 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 added_tokens.json create mode 100644 config.json create mode 100644 model.safetensors create mode 100644 preprocessor_config.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer_config.json create mode 100644 vocab.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text 
+*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..9acadee --- /dev/null +++ b/README.md @@ -0,0 +1,130 @@ +--- +language: +- ru +license: mit +library_name: transformers +pipeline_tag: automatic-speech-recognition +tags: +- wav2vec2 +- speech +- russian +- children +- ctc +- forced-alignment +- pronunciation +- phonetics +datasets: +- dysata/rwords +model-index: +- name: Wav2Vec2-Ru-Child + results: [] +--- + +# Wav2Vec2-Ru-Child + +Модель автоматического распознавания речи (ASR) для русского языка, дообученная на записях детского чтения. 
+ +## Model Details + +### Architecture + +- **Base model:** wav2vec2-large +- **Architecture:** `Wav2Vec2ForCTC` +- **Hidden size:** 1024 +- **Layers:** 24 transformer layers +- **Attention heads:** 16 +- **Parameters:** ~317M +- **Vocabulary:** 37 токенов (32 буквы русского алфавита без «ё», разделитель слов `|` и 4 служебных) +- **CTC loss:** mean reduction + +### Intended Use + +Модель предназначена для: +- Распознавания русской детской речи +- Forced alignment (выравнивание текста по аудио на уровне букв) +- Анализа произношения — выявление ошибок в детском чтении +- Классификации качества произношения отдельных звуков (например, звука "Р") + +## How to Use + +### Speech Recognition + +```python +import torch +import librosa +from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC + +processor = Wav2Vec2Processor.from_pretrained("dysata/Wav2Vec2-Ru-Child") +model = Wav2Vec2ForCTC.from_pretrained("dysata/Wav2Vec2-Ru-Child") + +audio, sr = librosa.load("audio.wav", sr=16000) + +processed = processor([audio], sampling_rate=16000, + return_tensors="pt", padding="longest") + +with torch.no_grad(): + logits = model(processed.input_values, + attention_mask=processed.attention_mask).logits + +predicted_ids = torch.argmax(logits, dim=-1) +transcription = processor.decode(predicted_ids[0]) +print(transcription) +``` + +### Forced Alignment + +Модель может использоваться для побуквенного выравнивания эталонного текста по аудио через CTC forced alignment (trellis + backtrack + merge_repeats). Это позволяет определить временные границы каждой буквы в записи. + +### Hidden States для классификации + +```python +with torch.no_grad(): + outputs = model(processed.input_values, + attention_mask=processed.attention_mask, + output_hidden_states=True, return_dict=True) + last_hidden_state = outputs.hidden_states[-1] # [batch, frames, 1024] +``` + +Вектора последнего скрытого слоя (1024-мерные) могут быть использованы как признаки для классификации качества произношения отдельных звуков. 
+ +## Training + +Модель дообучена на записях детского чтения на русском языке. Аудиозаписи преобразованы в формат WAV 16 кГц и вручную оттранскрибированы. + +## Technical Specifications + +| Parameter | Value | +|---|---| +| Sample rate | 16 kHz | +| Feature extractor | 7-layer CNN | +| Transformer layers | 24 | +| Hidden size | 1024 | +| Vocab size | 37 | +| Precision | float32 | +| Format | Safetensors | + +## Vocabulary + +Алфавит модели: `<pad>`, `<s>`, `</s>`, `<unk>`, `|` (разделитель слов), а-я (32 буквы русского алфавита, без «ё»). + +## Limitations + +- Модель обучена на детской речи и может показывать худшие результаты на взрослой речи +- Только русский язык +- Оптимальное качество на записях в формате WAV 16 кГц + +## Citation + +```bibtex +@misc{wav2vec2-ru-child, + author = {Павел Рудич}, + title = {Wav2Vec2-Ru-Child: Russian Children's Speech Recognition Model}, + year = {2025}, + publisher = {Hugging Face}, + url = {https://huggingface.co/dysata/Wav2Vec2-Ru-Child} +} +``` + +## Funding + +Фонд содействия инновациям (fasie). 
diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..c8ca52d --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,4 @@ +{ + "[PAD]": 38, + "[UNK]": 37 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000..05c49f4 --- /dev/null +++ b/config.json @@ -0,0 +1,116 @@ +{ + "_name_or_path": "./model6m", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": true, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + 
"num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_weighted_layer_sum": false, + "vocab_size": 37, + "xvector_output_dim": 512 +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..5c987b6 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f5b304a55228829a041a5aac128708c195d9c5280e6ab80ab6b7d217a6e6cba +size 1261959180 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..77f5a72 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,10 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "Wav2Vec2Processor", + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..0e26582 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<pad>", + "unk_token": "<unk>" +} diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..9c3d9c0 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,49 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "<pad>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "1": { + "content": "<s>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "2": { + "content": "</s>", + "lstrip": 
true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "3": { + "content": "<unk>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + } + }, + "bos_token": "<s>", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "</s>", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "processor_class": "Wav2Vec2Processor", + "replace_word_delimiter_char": " ", + "target_lang": null, + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "unk_token": "<unk>", + "word_delimiter_token": "|" +} diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..c12c9dc --- /dev/null +++ b/vocab.json @@ -0,0 +1,39 @@ +{ + "</s>": 2, + "<pad>": 0, + "<s>": 1, + "<unk>": 3, + "|": 4, + "а": 5, + "б": 6, + "в": 7, + "г": 8, + "д": 9, + "е": 10, + "ж": 11, + "з": 12, + "и": 13, + "й": 14, + "к": 15, + "л": 16, + "м": 17, + "н": 18, + "о": 19, + "п": 20, + "р": 21, + "с": 22, + "т": 23, + "у": 24, + "ф": 25, + "х": 26, + "ц": 27, + "ч": 28, + "ш": 29, + "щ": 30, + "ъ": 31, + "ы": 32, + "ь": 33, + "э": 34, + "ю": 35, + "я": 36 +}