commit c806b214c141ba6e663cec5a17e2bb71c814b6eb Author: ModelHub XC Date: Fri May 8 11:39:12 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Taeham/wav2vec2-ksponspeech Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ac481c8 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,27 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0348ea9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +checkpoint-*/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..0708123 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +license: apache-2.0 +tags: +- generated_from_trainer +model-index: +- name: wav2vec2-ksponspeech + results: [] + +--- + + +# wav2vec2-ksponspeech + +This model is a 
fine-tuned version of [Wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on the Ksponspeech dataset. +It achieves the following results on the evaluation set: + +- **WER(Word Error Rate)** for Third party test data : 0.373 + +**For improving WER:** +- Numeric / Character Unification +- Decoding the word with the correct notation (from word based on pronunciation) +- Uniform use of special characters (. / ?) +- Converting non-existent words to existing words + +## Model description + +Korean Wav2vec with Ksponspeech dataset. + +This model was trained on two datasets : + +- Train1 : https://huggingface.co/datasets/Taeham/wav2vec2-ksponspeech-train (1 ~ 20000th data in Ksponspeech) +- Train2 : https://huggingface.co/datasets/Taeham/wav2vec2-ksponspeech-train2 (20100 ~ 40100th data in Ksponspeech) +- Validation : https://huggingface.co/datasets/Taeham/wav2vec2-ksponspeech-test (20000 ~ 20100th data in Ksponspeech) +- Third party test : https://huggingface.co/datasets/Taeham/wav2vec2-ksponspeech-test (60000 ~ 20100th data in Ksponspeech) + +### Hardware Specification +- GPU : GEFORCE RTX 3080ti 12GB +- CPU : Intel i9-12900k +- RAM : 32GB + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0003 +- train_batch_size: 4 +- eval_batch_size: 4 +- seed: 42 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: linear +- lr_scheduler_warmup_steps: 500 +- num_epochs: 30 +- mixed_precision_training: Native AMP + +### Framework versions + +- Transformers 4.19.4 +- Pytorch 1.11.0 +- Datasets 2.2.2 +- Tokenizers 0.12.1 diff --git a/config.json b/config.json new file mode 100644 index 0000000..09b54e3 --- /dev/null +++ b/config.json @@ -0,0 +1,116 @@ +{ + "_name_or_path": "Taeham/wav2vec2-ksponspeech-dataset", + "activation_dropout": 0.0, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ +
"Wav2Vec2ForCTC" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 446, + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.19.4", + "use_weighted_layer_sum": false, + "vocab_size": 447, + 
"xvector_output_dim": 512 +} diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..73caa15 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..17fbaeb --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9754b5f78124378cde68d989d93268f0057dacec33108b00a2bc43af94f7be70 +size 1263731377 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9abf719 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1 @@ +{"bos_token": "", "eos_token": "", "unk_token": "[UNK]", "pad_token": "[PAD]"} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..c31a8e0 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1 @@ +{"unk_token": "[UNK]", "bos_token": "", "eos_token": "", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "replace_word_delimiter_char": " ", "tokenizer_class": "Wav2Vec2CTCTokenizer"} \ No newline at end of file diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..4425ee1 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:269fcaf8aa2fd58f145ddede481f8e6c69a36857fb3ecce2e59ae50ebdd6d628 +size 3183 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..0264fac --- /dev/null +++ b/vocab.json @@ -0,0 +1 @@ +{"c": 0, "므": 1, "제": 2, "쭈": 3, "사": 4, "쌔": 5, "외": 6, "떠": 7, "투": 8, "궤": 9, "뉘": 10, "6": 11, "시": 12, "쐐": 13, "g": 14, "얘": 15, "5": 16, "W": 17, "캐": 18, "초": 19, "조": 20, "K": 21, "쬐": 22, "씨": 23, "세": 24, 
"j": 25, "ㅈ": 26, "t": 27, "쥬": 28, "퉈": 29, "쁴": 30, "l": 31, "뙈": 32, "구": 33, "꺄": 34, "R": 35, "재": 36, "데": 37, "퀘": 38, "니": 39, "네": 40, "ㅊ": 41, "ㄿ": 42, "뀌": 43, "z": 44, "히": 45, "듀": 46, "쬬": 47, "ㄻ": 48, "끼": 49, "뻐": 50, "라": 51, "웨": 52, "뛔": 53, "쫘": 54, "수": 55, "빠": 56, "츄": 57, "셰": 58, "D": 59, "V": 60, "바": 61, "희": 62, "워": 63, "쟤": 64, "퇴": 65, "C": 66, "T": 67, "게": 68, "쑈": 69, "늬": 70, "뤠": 71, "ㄱ": 72, "싸": 73, "롸": 74, "훼": 75, "ㅆ": 76, "저": 77, "주": 78, "썌": 79, "개": 80, "콰": 81, "듸": 82, "G": 83, "꼬": 84, "새": 85, "허": 86, "f": 87, "쇄": 88, "그": 89, "L": 90, "쪄": 91, "토": 92, "고": 93, "다": 94, "죄": 95, "U": 96, "묘": 97, "뚸": 98, "트": 99, "ㄳ": 100, "죠": 101, "꾀": 102, "펴": 103, "더": 104, "뵈": 105, "럐": 106, "풰": 107, "쓔": 108, "쒀": 109, "애": 110, "o": 111, "피": 112, "쪼": 113, "4": 114, "녀": 115, "너": 116, "ㄴ": 117, "봬": 118, "돠": 119, "퀴": 120, "벼": 121, "줴": 122, "타": 123, "가": 124, "려": 125, "3": 126, "쉐": 127, "모": 128, "꾸": 129, "H": 130, "뱌": 131, "뿌": 132, "뭬": 133, "1": 134, "르": 135, "2": 136, "지": 137, "브": 138, "하": 139, "처": 140, "유": 141, "ㄼ": 142, "끄": 143, "w": 144, "찌": 145, "표": 146, "쒸": 147, "삐": 148, "써": 149, "쩨": 150, "ㄾ": 151, "쓰": 152, "*": 153, "푸": 154, "뉴": 155, "A": 156, "텨": 157, "뛰": 158, "테": 159, "규": 160, "때": 162, "쨰": 163, "돼": 164, "뫼": 165, "땨": 166, "쮜": 167, "샤": 168, "!": 169, "솨": 170, "치": 171, "뽀": 172, "ㄶ": 173, "류": 174, "쾌": 175, "궈": 176, "y": 177, "쭤": 178, "씌": 179, "귀": 180, "키": 181, "k": 182, "츠": 183, "Z": 184, "홰": 185, "휴": 186, "v": 187, "졔": 188, "퐈": 189, "패": 190, "우": 191, "쳐": 192, "부": 193, "ㅎ": 194, "러": 195, "9": 196, "ㅋ": 197, "스": 198, "배": 199, "까": 200, "J": 201, "붜": 202, "케": 203, "B": 204, "빼": 205, "8": 206, "뾰": 207, "뜨": 208, "P": 209, "호": 210, "M": 211, "페": 212, "e": 213, "꿔": 214, "채": 215, "눠": 216, "쇼": 217, "깨": 218, "뙤": 219, "쑤": 220, "쌰": 221, "햐": 222, "최": 223, "뷔": 224, "쨔": 225, "ㅁ": 226, "무": 227, "i": 228, "췌": 229, "쎄": 230, "티": 231, "괘": 232, "의": 
233, "쵸": 234, "갸": 235, "s": 236, "왜": 237, "챠": 238, "나": 239, "위": 240, "ㅌ": 241, "거": 242, "취": 243, "쥐": 244, "꺠": 245, "푀": 246, "툐": 247, "뤄": 248, "느": 249, "즈": 250, "쎠": 251, "혀": 252, "으": 253, "흐": 254, "?": 255, "메": 256, "터": 257, "해": 258, "뜌": 259, "ㄷ": 260, "쉬": 261, "d": 262, "뇌": 263, "쒜": 264, "ㄵ": 265, "껴": 266, "계": 267, "뤼": 268, "쇠": 269, "뎌": 270, "괴": 271, "쿠": 272, "도": 273, "어": 274, "루": 275, "료": 276, "차": 277, "틔": 278, "u": 279, "뻬": 280, "추": 281, "비": 282, "꽤": 283, "X": 284, "O": 285, "카": 286, "냐": 287, "h": 288, "소": 289, "긔": 290, "코": 291, "튜": 292, "F": 293, "E": 294, "댜": 295, "S": 296, "휘": 297, "또": 298, "뀨": 299, "레": 300, "줘": 301, "걔": 302, "ㄺ": 303, "춰": 304, "ㅍ": 305, "I": 306, "버": 307, "0": 308, "됴": 309, "프": 310, "떄": 311, "미": 312, "뢰": 313, "ㄹ": 314, "꼐": 315, "뭐": 316, "꺼": 317, "쏘": 318, "째": 319, "매": 320, "먀": 321, "과": 322, "누": 323, "7": 324, "내": 325, "ㅀ": 326, "튀": 327, "리": 328, "머": 329, "디": 330, "퉤": 331, "퍄": 332, "뮈": 333, "커": 334, "보": 335, "여": 336, "와": 337, "겨": 338, "뺴": 339, "꾜": 340, "혜": 341, "쟈": 342, "쀠": 343, "노": 344, "오": 345, "쪠": 346, "짜": 347, "셔": 348, "례": 349, "자": 350, "뺘": 351, "폐": 352, "예": 353, "쮸": 354, "쐬": 355, "ㅅ": 356, "b": 357, "뽜": 358, "요": 359, "띠": 360, "마": 361, "두": 362, "후": 363, "ㄲ": 364, "꿰": 365, "이": 366, "x": 367, "크": 368, "께": 369, "떼": 370, "베": 371, "녜": 372, "좌": 373, "기": 374, "%": 375, "띄": 376, "N": 377, "슈": 378, "숴": 379, "드": 380, "뗘": 381, "퍼": 382, "져": 383, "대": 384, "뷰": 385, "봐": 386, "퓨": 387, "뚜": 388, "섀": 389, "랴": 390, "교": 391, "꽈": 392, "뫠": 393, "쯔": 394, "켸": 395, "효": 396, "켜": 397, "쿼": 398, "Y": 399, "체": 400, "회": 401, "냬": 402, "ㅂ": 403, "되": 404, "화": 405, "뼈": 406, "포": 407, "태": 408, "q": 409, "야": 410, "m": 411, "ㅄ": 412, "r": 413, "p": 414, "쩌": 415, "헤": 416, "파": 417, "a": 418, "뒤": 419, ".": 420, "촤": 421, "뮤": 422, "따": 423, "큐": 424, "훠": 425, "쿄": 426, "햬": 427, "뵤": 428, "로": 429, "Q": 430, "서": 431, "며": 432, "래": 
433, "뇨": 434, "둬": 435, "n": 436, "에": 437, "쎼": 438, "아": 439, "ㅇ": 440, "캬": 441, "놔": 442, "쏴": 443, "쁘": 444, "|": 161, "[UNK]": 445, "[PAD]": 446} \ No newline at end of file