初始化项目,由ModelHub XC社区提供模型
Model: ctl/wav2vec2-large-xlsr-cantonese Source: Original Platform
This commit is contained in:
17
.gitattributes
vendored
Normal file
17
.gitattributes
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
137
README.md
Normal file
137
README.md
Normal file
@@ -0,0 +1,137 @@
|
||||
---
|
||||
language:
|
||||
- yue
|
||||
language_bcp47:
|
||||
- zh-HK
|
||||
datasets:
|
||||
- common_voice
|
||||
metrics:
|
||||
- cer
|
||||
|
||||
tags:
|
||||
- audio
|
||||
- automatic-speech-recognition
|
||||
- speech
|
||||
- xlsr-fine-tuning-week
|
||||
license: apache-2.0
|
||||
model-index:
|
||||
- name: wav2vec2-large-xlsr-cantonese
|
||||
results:
|
||||
- task:
|
||||
name: Speech Recognition
|
||||
type: automatic-speech-recognition
|
||||
dataset:
|
||||
name: Common Voice zh-HK
|
||||
type: common_voice
|
||||
args: zh-HK
|
||||
metrics:
|
||||
- name: Test CER
|
||||
type: cer
|
||||
value: 15.36
|
||||
---
|
||||
|
||||
# Wav2Vec2-Large-XLSR-53-Cantonese
|
||||
|
||||
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Cantonese using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
|
||||
When using this model, make sure that your speech input is sampled at 16kHz.
|
||||
|
||||
## Usage
|
||||
|
||||
The model can be used directly (without a language model) as follows:
|
||||
|
||||
```python
|
||||
import torch
|
||||
import torchaudio
|
||||
from datasets import load_dataset
|
||||
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
||||
|
||||
test_dataset = load_dataset("common_voice", "zh-HK", split="test[:2%]")
|
||||
|
||||
processor = Wav2Vec2Processor.from_pretrained("ctl/wav2vec2-large-xlsr-cantonese")
|
||||
model = Wav2Vec2ForCTC.from_pretrained("ctl/wav2vec2-large-xlsr-cantonese")
|
||||
|
||||
resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# We need to read the audio files as arrays
|
||||
def speech_file_to_array_fn(batch):
|
||||
speech_array, sampling_rate = torchaudio.load(batch["path"])
|
||||
batch["speech"] = resampler(speech_array).squeeze().numpy()
|
||||
return batch
|
||||
|
||||
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
||||
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
|
||||
|
||||
with torch.no_grad():
|
||||
logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
|
||||
|
||||
predicted_ids = torch.argmax(logits, dim=-1)
|
||||
|
||||
print("Prediction:", processor.batch_decode(predicted_ids))
|
||||
print("Reference:", test_dataset["sentence"][:2])
|
||||
```
|
||||
|
||||
|
||||
## Evaluation
|
||||
|
||||
The model can be evaluated as follows on the Chinese (Hong Kong) test data of Common Voice.
|
||||
|
||||
|
||||
```python
|
||||
!pip install jiwer
|
||||
import torch
|
||||
import torchaudio
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
||||
import re
|
||||
import argparse
|
||||
|
||||
lang_id = "zh-HK"
|
||||
model_id = "ctl/wav2vec2-large-xlsr-cantonese"
|
||||
|
||||
chars_to_ignore_regex = '[\,\?\.\!\-\;\:"\“\%\‘\”\<5C>\.\⋯\!\-\:\–\。\》\,\)\,\?\;\~\~\…\︰\,\(\」\‧\《\﹔\、\—\/\,\「\﹖\·\']'
|
||||
|
||||
test_dataset = load_dataset("common_voice", f"{lang_id}", split="test")
|
||||
cer = load_metric("cer")
|
||||
|
||||
processor = Wav2Vec2Processor.from_pretrained(f"{model_id}")
|
||||
model = Wav2Vec2ForCTC.from_pretrained(f"{model_id}")
|
||||
model.to("cuda")
|
||||
|
||||
resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# We need to read the audio files as arrays
|
||||
def speech_file_to_array_fn(batch):
|
||||
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
|
||||
speech_array, sampling_rate = torchaudio.load(batch["path"])
|
||||
batch["speech"] = resampler(speech_array).squeeze().numpy()
|
||||
return batch
|
||||
|
||||
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
||||
|
||||
# Running the evaluation.
|
||||
# We run the model on each batch and decode the predicted ids into strings
|
||||
def evaluate(batch):
|
||||
inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
|
||||
with torch.no_grad():
|
||||
logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
|
||||
|
||||
pred_ids = torch.argmax(logits, dim=-1)
|
||||
batch["pred_strings"] = processor.batch_decode(pred_ids)
|
||||
return batch
|
||||
|
||||
result = test_dataset.map(evaluate, batched=True, batch_size=16)
|
||||
|
||||
print("CER: {:.2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"])))
|
||||
```
|
||||
|
||||
|
||||
**Test Result**: 15.51 %
|
||||
|
||||
|
||||
## Training
|
||||
|
||||
The Common Voice `train` and `validation` splits were used for training.
|
||||
|
||||
The script used for training will be posted [here](https://github.com/chutaklee/CantoASR)
|
||||
116
cer.py
Normal file
116
cer.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Datasets Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Character Error Ratio (CER) metric. """
|
||||
|
||||
import jiwer
|
||||
import jiwer.transforms as tr
|
||||
from typing import List
|
||||
|
||||
import datasets
|
||||
|
||||
class SentencesToListOfCharacters(tr.AbstractTransform):
    """jiwer transform that breaks sentences down into individual characters."""

    def process_string(self, s: str):
        """Return the characters of a single sentence as a list."""
        return [*s]

    def process_list(self, inp: List[str]):
        """Return the characters of all sentences, concatenated in order."""
        return [
            char
            for sentence in inp
            for char in self.process_string(sentence)
        ]
|
||||
|
||||
|
||||
cer_transform = tr.Compose(
|
||||
[
|
||||
tr.RemoveMultipleSpaces(),
|
||||
tr.Strip(),
|
||||
SentencesToListOfCharacters(),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
_CITATION = """\
|
||||
@inproceedings{inproceedings,
|
||||
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
|
||||
year = {2004},
|
||||
month = {01},
|
||||
pages = {},
|
||||
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
|
||||
}
|
||||
"""
|
||||
|
||||
_DESCRIPTION = """\
|
||||
Character error rate (CER) is a common metric of the performance of an automatic speech recognition system.
|
||||
|
||||
CER is similar to Word Error Rate (WER), but operate on character insted of word. Please refer to docs of WER for further information.
|
||||
|
||||
Character error rate can be computed as:
|
||||
|
||||
CER = (S + D + I) / N = (S + D + I) / (S + D + C)
|
||||
|
||||
where
|
||||
|
||||
S is the number of substitutions,
|
||||
D is the number of deletions,
|
||||
I is the number of insertions,
|
||||
C is the number of correct characters,
|
||||
N is the number of characters in the reference (N=S+D+C).
|
||||
|
||||
CER's output is always a number between 0 and 1. This value indicates the percentage of characters that were incorrectly predicted. The lower the value, the better the
|
||||
performance of the ASR system with a CER of 0 being a perfect score.
|
||||
"""
|
||||
|
||||
_KWARGS_DESCRIPTION = """
|
||||
Computes CER score of transcribed segments against references.
|
||||
Args:
|
||||
references: list of references for each speech input.
|
||||
predictions: list of transcribtions to score.
|
||||
Returns:
|
||||
(float): the character error rate
|
||||
|
||||
Examples:
|
||||
|
||||
>>> predictions = ["this is the prediction", "there is an other sample"]
|
||||
>>> references = ["this is the reference", "there is another one"]
|
||||
>>> cer = datasets.load_metric("cer")
|
||||
>>> cer_score = cer.compute(predictions=predictions, references=references)
|
||||
>>> print(cer_score)
|
||||
0.34146341463414637
|
||||
"""
|
||||
|
||||
|
||||
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CER(datasets.Metric):
    # Character error rate metric; the scoring itself is delegated to jiwer
    # (see _compute).

    def _info(self):
        """Build the MetricInfo: descriptive texts, input schema and links."""
        # Both inputs are plain strings, one per utterance.
        input_schema = datasets.Features(
            {
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
            }
        )
        background_links = [
            "https://en.wikipedia.org/wiki/Word_error_rate",
            "https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates",
        ]
        return datasets.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=input_schema,
            codebase_urls=["https://github.com/jitsi/jiwer/"],
            reference_urls=background_links,
        )

    def _compute(self, predictions, references):
        """Score predictions against references.

        jiwer's ``wer`` is called with ``cer_transform`` applied to both
        sides, so the units it aligns are characters rather than words.
        """
        return jiwer.wer(
            references,
            predictions,
            truth_transform=cer_transform,
            hypothesis_transform=cer_transform,
        )
|
||||
263
cer_memory_efficient.py
Normal file
263
cer_memory_efficient.py
Normal file
@@ -0,0 +1,263 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Datasets Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Character Error Ratio (CER) metric (memory-efficient implementation). """
|
||||
|
||||
import datasets
|
||||
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""This module provides functions to calculate error rate in different level.
|
||||
e.g. wer for word-level, cer for char-level.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
# credit: https://github.com/PaddlePaddle/DeepSpeech/blob/d7e753546a813f7493c8834ca1a4b3f37a7ff139/deepspeech/utils/error_rate.py
|
||||
|
||||
def _levenshtein_distance(ref, hyp):
|
||||
"""Levenshtein distance is a string metric for measuring the difference
|
||||
between two sequences. Informally, the levenshtein disctance is defined as
|
||||
the minimum number of single-character edits (substitutions, insertions or
|
||||
deletions) required to change one word into the other. We can naturally
|
||||
extend the edits to word level when calculate levenshtein disctance for
|
||||
two sentences.
|
||||
"""
|
||||
m = len(ref)
|
||||
n = len(hyp)
|
||||
|
||||
# special case
|
||||
if ref == hyp:
|
||||
return 0
|
||||
if m == 0:
|
||||
return n
|
||||
if n == 0:
|
||||
return m
|
||||
|
||||
if m < n:
|
||||
ref, hyp = hyp, ref
|
||||
m, n = n, m
|
||||
|
||||
# use O(min(m, n)) space
|
||||
distance = np.zeros((2, n + 1), dtype=np.int32)
|
||||
|
||||
# initialize distance matrix
|
||||
for j in range(n + 1):
|
||||
distance[0][j] = j
|
||||
|
||||
# calculate levenshtein distance
|
||||
for i in range(1, m + 1):
|
||||
prev_row_idx = (i - 1) % 2
|
||||
cur_row_idx = i % 2
|
||||
distance[cur_row_idx][0] = i
|
||||
for j in range(1, n + 1):
|
||||
if ref[i - 1] == hyp[j - 1]:
|
||||
distance[cur_row_idx][j] = distance[prev_row_idx][j - 1]
|
||||
else:
|
||||
s_num = distance[prev_row_idx][j - 1] + 1
|
||||
i_num = distance[cur_row_idx][j - 1] + 1
|
||||
d_num = distance[prev_row_idx][j] + 1
|
||||
distance[cur_row_idx][j] = min(s_num, i_num, d_num)
|
||||
|
||||
return distance[m % 2][n]
|
||||
|
||||
|
||||
def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
    """Compute the levenshtein distance between reference sequence and
    hypothesis sequence in word-level.

    :param reference: The reference sentence.
    :type reference: str
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: str
    :param ignore_case: If true, both sentences are lower-cased before
                        they are compared.
    :type ignore_case: bool
    :param delimiter: Delimiter of input sentences.
    :type delimiter: char
    :return: Levenshtein distance (as a float) and word number of reference
             sentence.
    :rtype: tuple
    """
    if ignore_case:
        reference = reference.lower()
        hypothesis = hypothesis.lower()

    # split() yields empty strings for leading, trailing or repeated
    # delimiters; those are not words and are dropped.
    ref_words = [word for word in reference.split(delimiter) if word]
    hyp_words = [word for word in hypothesis.split(delimiter) if word]

    edit_distance = _levenshtein_distance(ref_words, hyp_words)
    return float(edit_distance), len(ref_words)
|
||||
|
||||
|
||||
def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
    """Compute the levenshtein distance between reference sequence and
    hypothesis sequence in char-level.

    Leading and trailing spaces are always dropped. Runs of spaces inside a
    sentence are collapsed to a single space, or removed entirely when
    ``remove_space`` is true.

    :param reference: The reference sentence.
    :type reference: str
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: str
    :param ignore_case: If true, both sentences are lower-cased before
                        they are compared.
    :type ignore_case: bool
    :param remove_space: Whether remove internal space characters
    :type remove_space: bool
    :return: Levenshtein distance (as a float) and length of the normalized
             reference sentence.
    :rtype: tuple
    """
    if ignore_case:
        reference = reference.lower()
        hypothesis = hypothesis.lower()

    join_char = '' if remove_space else ' '

    # Splitting on ' ' and discarding the empty pieces normalizes the spacing
    # before the pieces are glued back together with join_char.
    reference = join_char.join(part for part in reference.split(' ') if part)
    hypothesis = join_char.join(part for part in hypothesis.split(' ') if part)

    edit_distance = _levenshtein_distance(reference, hypothesis)
    return float(edit_distance), len(reference)
|
||||
|
||||
|
||||
def cer(reference, hypothesis, ignore_case=False, remove_space=True):
    """Calculate character error rate (CER). CER compares reference text and
    hypothesis text in char-level. CER is defined as:

    .. math::
        CER = (Sc + Dc + Ic) / Nc

    where

    .. code-block:: text

        Sc is the number of characters substituted,
        Dc is the number of characters deleted,
        Ic is the number of characters inserted
        Nc is the number of characters in the reference

    The Levenshtein distance is used to calculate CER. Both sentences are
    normalized by ``char_errors`` before they are compared; with the default
    ``remove_space=True`` that is called with here, internal spaces are
    removed.

    :param reference: The reference sentence.
    :type reference: str
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: str
    :param ignore_case: Passed through to ``char_errors``.
    :type ignore_case: bool
    :param remove_space: Whether remove internal space characters
    :type remove_space: bool
    :return: ``(edit_distance, ref_len, cer)``: the edit distance, the length
             of the normalized reference, and their ratio. The raw counts are
             returned alongside the rate so callers can aggregate them over
             many sentences.
    :rtype: tuple
    :raises ValueError: If the reference length is zero.
    """
    edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case,
                                         remove_space)

    if ref_len == 0:
        raise ValueError("Length of reference should be greater than 0.")

    # Named error_rate rather than cer so the local does not shadow this
    # function's own name.
    error_rate = float(edit_distance) / ref_len
    return edit_distance, ref_len, error_rate
|
||||
|
||||
|
||||
|
||||
_CITATION = """\
|
||||
@inproceedings{inproceedings,
|
||||
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
|
||||
year = {2004},
|
||||
month = {01},
|
||||
pages = {},
|
||||
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
|
||||
}
|
||||
"""
|
||||
|
||||
_DESCRIPTION = """\
|
||||
Word error rate (WER) is a common metric of the performance of an automatic speech recognition system.
|
||||
|
||||
The general difficulty of measuring performance lies in the fact that the recognized word sequence can have a different length from the reference word sequence (supposedly the correct one). The WER is derived from the Levenshtein distance, working at the word level instead of the phoneme level. The WER is a valuable tool for comparing different systems as well as for evaluating improvements within one system. This kind of measurement, however, provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort.
|
||||
|
||||
This problem is solved by first aligning the recognized word sequence with the reference (spoken) word sequence using dynamic string alignment. Examination of this issue is seen through a theory called the power law that states the correlation between perplexity and word error rate.
|
||||
|
||||
Word error rate can then be computed as:
|
||||
|
||||
WER = (S + D + I) / N = (S + D + I) / (S + D + C)
|
||||
|
||||
where
|
||||
|
||||
S is the number of substitutions,
|
||||
D is the number of deletions,
|
||||
I is the number of insertions,
|
||||
C is the number of correct words,
|
||||
N is the number of words in the reference (N=S+D+C).
|
||||
|
||||
WER's output is always a number between 0 and 1. This value indicates the percentage of words that were incorrectly predicted. The lower the value, the better the
|
||||
performance of the ASR system with a WER of 0 being a perfect score.
|
||||
"""
|
||||
|
||||
_KWARGS_DESCRIPTION = """
|
||||
Computes WER score of transcribed segments against references.
|
||||
Args:
|
||||
references: list of references for each speech input.
|
||||
predictions: list of transcribtions to score.
|
||||
Returns:
|
||||
(float): the word error rate
|
||||
|
||||
Examples:
|
||||
|
||||
>>> predictions = ["this is the prediction", "there is an other sample"]
|
||||
>>> references = ["this is the reference", "there is another one"]
|
||||
>>> wer = datasets.load_metric("wer")
|
||||
>>> wer_score = wer.compute(predictions=predictions, references=references)
|
||||
>>> print(wer_score)
|
||||
0.5
|
||||
"""
|
||||
|
||||
|
||||
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CER(datasets.Metric):
    # Character error rate metric that scores one sentence pair at a time and
    # only accumulates two running totals.

    def _info(self):
        """Build the MetricInfo: descriptive texts, input schema and links."""
        return datasets.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            # NOTE(review): this still points at jiwer although the scoring
            # code in this module does not call jiwer -- confirm the URL.
            codebase_urls=["https://github.com/jitsi/jiwer/"],
            reference_urls=[
                "https://en.wikipedia.org/wiki/Word_error_rate",
            ],
        )

    def _compute(self, predictions, references):
        """Return the corpus-level CER.

        The result is the summed edit distance divided by the summed
        reference length, not the mean of per-sentence rates.

        :raises ValueError: If a reference is empty (raised by ``cer``) or if
                            there are no prediction/reference pairs at all.
        """
        total_edit_distance, total_ref_len = 0, 0
        for pred, ref in zip(predictions, references):
            edit_distance, ref_len, _ = cer(ref, pred)
            total_edit_distance += edit_distance
            total_ref_len += ref_len

        # cer() rejects empty references, so the total is zero only when
        # there was nothing to score; fail with a clear message instead of a
        # bare ZeroDivisionError.
        if total_ref_len == 0:
            raise ValueError("At least one prediction/reference pair is required to compute CER.")
        return total_edit_distance / total_ref_len
|
||||
76
config.json
Normal file
76
config.json
Normal file
@@ -0,0 +1,76 @@
|
||||
{
|
||||
"_name_or_path": "../wav2vec2-large-xlsr-cantonese",
|
||||
"activation_dropout": 0.0,
|
||||
"apply_spec_augment": true,
|
||||
"architectures": [
|
||||
"Wav2Vec2ForCTC"
|
||||
],
|
||||
"attention_dropout": 0.1,
|
||||
"bos_token_id": 1,
|
||||
"conv_bias": true,
|
||||
"conv_dim": [
|
||||
512,
|
||||
512,
|
||||
512,
|
||||
512,
|
||||
512,
|
||||
512,
|
||||
512
|
||||
],
|
||||
"conv_kernel": [
|
||||
10,
|
||||
3,
|
||||
3,
|
||||
3,
|
||||
3,
|
||||
2,
|
||||
2
|
||||
],
|
||||
"conv_stride": [
|
||||
5,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
2
|
||||
],
|
||||
"ctc_loss_reduction": "mean",
|
||||
"ctc_zero_infinity": false,
|
||||
"do_stable_layer_norm": true,
|
||||
"eos_token_id": 2,
|
||||
"feat_extract_activation": "gelu",
|
||||
"feat_extract_dropout": 0.0,
|
||||
"feat_extract_norm": "layer",
|
||||
"feat_proj_dropout": 0.0,
|
||||
"final_dropout": 0.0,
|
||||
"gradient_checkpointing": true,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout": 0.1,
|
||||
"hidden_size": 1024,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 4096,
|
||||
"layer_norm_eps": 1e-05,
|
||||
"layerdrop": 0.1,
|
||||
"mask_channel_length": 10,
|
||||
"mask_channel_min_space": 1,
|
||||
"mask_channel_other": 0.0,
|
||||
"mask_channel_prob": 0.0,
|
||||
"mask_channel_selection": "static",
|
||||
"mask_feature_length": 10,
|
||||
"mask_feature_prob": 0.0,
|
||||
"mask_time_length": 10,
|
||||
"mask_time_min_space": 1,
|
||||
"mask_time_other": 0.0,
|
||||
"mask_time_prob": 0.05,
|
||||
"mask_time_selection": "static",
|
||||
"model_type": "wav2vec2",
|
||||
"num_attention_heads": 16,
|
||||
"num_conv_pos_embedding_groups": 16,
|
||||
"num_conv_pos_embeddings": 128,
|
||||
"num_feat_extract_layers": 7,
|
||||
"num_hidden_layers": 24,
|
||||
"pad_token_id": 3580,
|
||||
"transformers_version": "4.5.0.dev0",
|
||||
"vocab_size": 3581
|
||||
}
|
||||
3
flax_model.msgpack
Normal file
3
flax_model.msgpack
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:be24b09e034b87ca13dbe80e6ed94a927a4e964a3b5e0909c88e0553a2f71636
|
||||
size 1276452378
|
||||
3
optimizer.pt
Normal file
3
optimizer.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:43e18eaa9f3172c4eeb16164003cf9e81328d3bd3d6495d30e620dd05f4dcf84
|
||||
size 2519432706
|
||||
8
preprocessor_config.json
Normal file
8
preprocessor_config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"do_normalize": true,
|
||||
"feature_size": 1,
|
||||
"padding_side": "right",
|
||||
"padding_value": 0.0,
|
||||
"return_attention_mask": true,
|
||||
"sampling_rate": 16000
|
||||
}
|
||||
3
pytorch_model.bin
Normal file
3
pytorch_model.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:7b19c066e5d9a6534bd548f19fb40a7b109231f8cc5a699b427d11d76b5f1fb4
|
||||
size 1276610874
|
||||
3
scheduler.pt
Normal file
3
scheduler.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:38f20ce0b19191999e2beb095d59a62a34c37b17460e0941ed45d106b60fb522
|
||||
size 623
|
||||
1
special_tokens_map.json
Normal file
1
special_tokens_map.json
Normal file
@@ -0,0 +1 @@
|
||||
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
|
||||
119
test_cer.py
Normal file
119
test_cer.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Datasets Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import unittest
|
||||
import cer
|
||||
|
||||
cer = cer.CER()
|
||||
|
||||
class TestCER(unittest.TestCase):
    """Checks the CER metric on casing, whitespace, single edits, batches and Unicode."""

    def test_cer_case_senstive(self):
        refs = ['White House']
        preds = ['white house']
        # Two substitutions (W->w, H->h) over N = 11 reference characters
        # (the space is counted): CER = 2 / 11.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 2 / 11, delta=1e-6)

    def test_cer_whitespace(self):
        refs = ['were wolf']
        preds = ['werewolf']
        # One edit (the space is missing from the prediction), N = 9.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 1 / 9, delta=1e-6)

        # The three cases below are labelled "consecutive whitespaces" but
        # contained only single spaces; the runs of spaces are restored so
        # the cases exercise what their labels describe.

        # consecutive whitespaces case 0
        refs = ['werewolf']
        preds = ['weae  wolf']
        # Two edits (r->a, plus one extra space once the run is collapsed),
        # N = 8, CER = 0.25.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 0.25, delta=1e-6)

        # consecutive whitespaces case 1
        refs = ['were wolf']
        preds = ['were  wolf']
        # No edits once the run of spaces is collapsed, N = 9, CER = 0.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 0.0, delta=1e-6)

        # consecutive whitespaces case 2
        refs = ['were wolf']
        preds = ['were   wolf']
        # No edits once the run of spaces is collapsed, N = 9, CER = 0.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 0.0, delta=1e-6)

    def test_cer_sub(self):
        refs = ['werewolf']
        preds = ['weaewolf']
        # One substitution (r->a), N = 8, CER = 0.125.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 0.125, delta=1e-6)

    def test_cer_del(self):
        refs = ['werewolf']
        preds = ['wereawolf']
        # One edit (the prediction has an extra 'a'), N = 8, CER = 0.125.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 0.125, delta=1e-6)

    def test_cer_insert(self):
        refs = ['werewolf']
        preds = ['wereolf']
        # One edit (the prediction is missing a 'w'), N = 8, CER = 0.125.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 0.125, delta=1e-6)

    def test_cer_equal(self):
        refs = ['werewolf']
        char_error_rate = cer.compute(predictions=refs, references=refs)
        self.assertEqual(char_error_rate, 0.0)

    def test_cer_list_of_seqs(self):
        refs = ['werewolf', 'I am your father']
        char_error_rate = cer.compute(predictions=refs, references=refs)
        self.assertEqual(char_error_rate, 0.0)

        refs = ['werewolf', 'I am your father', 'doge']
        preds = ['werxwolf', 'I am your father', 'doge']
        # One substitution over N = 8 + 16 + 4 = 28 characters: CER = 1 / 28.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 1 / 28, delta=1e-6)

    def test_cer_unicode(self):
        refs = [u'我能吞下玻璃而不伤身体']
        preds = [u' 能吞虾玻璃而 不霜身体啦']
        # Expected: 5 edits over N = 11 reference characters, CER = 5 / 11.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 5 / 11, delta=1e-6)

        refs = [u'我能吞', u'下玻璃而不伤身体']
        preds = [u'我 能 吞 下 玻 璃', u'而不伤身体']
        # Same characters split differently across sentences; the only edits
        # are the 5 spaces in the first prediction. N = 11, CER = 5 / 11.
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 5 / 11, delta=1e-6)

        refs = [u'我能吞下玻璃而不伤身体']
        char_error_rate = cer.compute(predictions=refs, references=refs)
        # Was assertFalse(char_error_rate, 0.0), which treats 0.0 as the
        # failure message instead of comparing against it.
        self.assertEqual(char_error_rate, 0.0)

    def test_cer_empty(self):
        # NOTE(review): bare strings are passed here rather than lists of
        # strings, unlike every other test -- confirm this is intended.
        refs = ''
        preds = 'Hypothesis'
        with self.assertRaises(ValueError):
            cer.compute(predictions=preds, references=refs)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
1
tokenizer_config.json
Normal file
1
tokenizer_config.json
Normal file
@@ -0,0 +1 @@
|
||||
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}
|
||||
1010
trainer_state.json
Normal file
1010
trainer_state.json
Normal file
File diff suppressed because it is too large
Load Diff
3
training_args.bin
Normal file
3
training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:72bc532f588092a951ee7231632ada16afbbf1f720df843a7d3c93a5acd1c712
|
||||
size 2287
|
||||
1
vocab.json
Normal file
1
vocab.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user