Fix handling of the Unicode U+2581 (▁) word separator in model output

2026-02-11 16:39:16 +08:00
parent 4aebd0a465
commit a1609fb499
1 changed file with 5 additions and 2 deletions
--- a/fastapi_funasr.py
+++ b/fastapi_funasr.py
@@ -168,6 +168,7 @@ def test_funasr(audio_file, lang):
            segment_path = f"temp_seg_{i}.wav"
            torchaudio.save(segment_path, segment, sample_rate)
            ts1 = time.time()
+            text = None
            if model_type == "sensevoice":
                res = model.generate(
                    input=segment_path,
@@ -203,14 +204,12 @@ def test_funasr(audio_file, lang):
                # paraformer模型会一个字一个字输出，中间夹太多空格会影响1-cer的结果
                if lang == "zh":
                    text = text.replace(" ", "")
-                text = text.replace("_", " ")
            elif model_type == "conformer":
                res = model.generate(
                    input=segment_path,
                    batch_size_s=300
                )
                text = res[0]["text"]
-                text = text.replace("_", " ")
            # elif model_type == "uni_asr":
            #     if i == 0:
            #         os.remove(segment_path)
@@ -221,6 +220,10 @@ def test_funasr(audio_file, lang):
            #     text = res[0]["text"]
            else:
                raise RuntimeError("unknown model type")
+            if text is not None:
+                # Some models emit "_" or "▁" (U+2581, code point 9601) as the separator between words; replace both with a space for readability.
+                text = text.replace("_", " ")
+                text = text.replace(chr(9601), " ")
            ts2 = time.time()
            generated_text += text
            processing_time += (ts2 - ts1)