commit 2215700b54e0b82330fe62cb4bc3107a9e69a131 Author: ModelHub XC Date: Mon May 18 18:21:50 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: ocordeiro/w2v-bert-2.0-portuguese-colab-CV16.0 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..01a3eb9 --- /dev/null 
+++ b/README.md @@ -0,0 +1,53 @@ +--- +base_model: ocordeiro/w2v-bert-2.0-portuguese-colab-CV16.0 +tags: +- generated_from_trainer +datasets: +- common_voice_16_0 +model-index: +- name: w2v-bert-2.0-portuguese-colab-CV16.0 + results: [] +--- + + + +# w2v-bert-2.0-portuguese-colab-CV16.0 + +This model is a fine-tuned version of [ocordeiro/w2v-bert-2.0-portuguese-colab-CV16.0](https://huggingface.co/ocordeiro/w2v-bert-2.0-portuguese-colab-CV16.0) on the common_voice_16_0 dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-05 +- train_batch_size: 16 +- eval_batch_size: 8 +- seed: 42 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 32 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: linear +- lr_scheduler_warmup_steps: 500 +- num_epochs: 10 +- mixed_precision_training: Native AMP + +### Framework versions + +- Transformers 4.37.1 +- Pytorch 2.1.0+cu121 +- Datasets 2.16.1 +- Tokenizers 0.15.1 diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..a3d9871 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,4 @@ +{ + "</s>": 50, + "<s>": 49 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000..3530d70 --- /dev/null +++ b/config.json @@ -0,0 +1,82 @@ +{ + "_name_or_path": "ocordeiro/w2v-bert-2.0-portuguese-colab-CV16.0", + "activation_dropout": 0.0, + "adapter_act": "relu", + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": true, + "apply_spec_augment": false, + "architectures": [ + "Wav2Vec2BertForCTC" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "classifier_proj_size": 768, + "codevector_dim": 768, + "conformer_conv_dropout": 0.1, + "contrastive_logits_temperature": 0.1, + 
"conv_depthwise_kernel_size": 31, + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "eos_token_id": 2, + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "feature_projection_input_dim": 160, + "final_dropout": 0.1, + "hidden_act": "swish", + "hidden_dropout": 0.0, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "left_max_position_embeddings": 64, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.0, + "max_source_positions": 5000, + "model_type": "wav2vec2-bert", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 48, + "position_embeddings_type": "relative_key", + "proj_codevector_dim": 768, + "right_max_position_embeddings": 8, + "rotary_embedding_base": 10000, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.37.1", + "use_intermediate_ffn_before_adapter": false, + "use_weighted_layer_sum": false, + "vocab_size": 51, + "xvector_output_dim": 512 +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..471e281 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ba2ba06ed9f2c95fe7ebe81ee66e01551d453bc574157caad50e5b07c03f583 +size 2423023660 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..b6a54f3 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,11 @@ +{ + "feature_extractor_type": "SeamlessM4TFeatureExtractor", + "feature_size": 80, + 
"num_mel_bins": 80, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "Wav2Vec2BertProcessor", + "return_attention_mask": true, + "sampling_rate": 16000, + "stride": 2 +} diff --git a/runs/Jan26_14-29-45_dcf3d28718fb/events.out.tfevents.1706279978.dcf3d28718fb.1548.0 b/runs/Jan26_14-29-45_dcf3d28718fb/events.out.tfevents.1706279978.dcf3d28718fb.1548.0 new file mode 100644 index 0000000..017b9b2 --- /dev/null +++ b/runs/Jan26_14-29-45_dcf3d28718fb/events.out.tfevents.1706279978.dcf3d28718fb.1548.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5bb970a058e46c00e540b822be308bfe7b7e2714e1cb61b897fba58f088c286 +size 10423 diff --git a/runs/Jan26_21-20-16_0fa4d1b278ac/events.out.tfevents.1706304511.0fa4d1b278ac.3170.0 b/runs/Jan26_21-20-16_0fa4d1b278ac/events.out.tfevents.1706304511.0fa4d1b278ac.3170.0 new file mode 100644 index 0000000..dbf3625 --- /dev/null +++ b/runs/Jan26_21-20-16_0fa4d1b278ac/events.out.tfevents.1706304511.0fa4d1b278ac.3170.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d33aac2664324ec356cd88e884b31ee0e49871e33c7413eea702591f43b7d832 +size 5699 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1c2a036 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "[PAD]", + "unk_token": "[UNK]" +} diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..daebfba --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,48 @@ +{ + "added_tokens_decoder": { + "47": { + "content": "[UNK]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "48": { + "content": "[PAD]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "49": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "50": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "eos_token": "</s>", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "processor_class": "Wav2Vec2BertProcessor", + "replace_word_delimiter_char": " ", + "target_lang": null, + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "unk_token": "[UNK]", + "word_delimiter_token": "|" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..5a76851 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f52dd3ec78cad4352cec7a95698c2e8b6f5055e0fba1697bb6bbdcaad29314f8 +size 4728 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..e778e9f --- /dev/null +++ b/vocab.json @@ -0,0 +1,51 @@ +{ + "&": 1, + "[PAD]": 48, + "[UNK]": 47, + "a": 2, + "b": 3, + "c": 4, + "d": 5, + "e": 6, + "f": 7, + "g": 8, + "h": 9, + "i": 10, + "j": 11, + "k": 12, + "l": 13, + "m": 14, + "n": 15, + "o": 16, + "p": 17, + "q": 18, + "r": 19, + "s": 20, + "t": 21, + "u": 22, + "v": 23, + "w": 24, + "x": 25, + "y": 26, + "z": 27, + "|": 0, + "´": 28, + "à": 29, + "á": 30, + "â": 31, + "ã": 32, + "ç": 33, + "è": 34, + "é": 35, + "ê": 36, + "í": 37, + "ñ": 38, + "ó": 39, + "ô": 40, + "õ": 41, + "ú": 42, + "ü": 43, + "š": 44, + "ž": 45, + "‐": 46 +}