From a1609fb4996908e357670367fb9bfb40d0227604 Mon Sep 17 00:00:00 2001 From: sunjichen Date: Wed, 11 Feb 2026 16:39:16 +0800 Subject: [PATCH] Fix model output Unicode U+2581 separator related issues --- fastapi_funasr.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fastapi_funasr.py b/fastapi_funasr.py index 1f5273b..8f7870c 100644 --- a/fastapi_funasr.py +++ b/fastapi_funasr.py @@ -168,6 +168,7 @@ def test_funasr(audio_file, lang): segment_path = f"temp_seg_{i}.wav" torchaudio.save(segment_path, segment, sample_rate) ts1 = time.time() + text = None if model_type == "sensevoice": res = model.generate( input=segment_path, @@ -203,14 +204,12 @@ def test_funasr(audio_file, lang): # paraformer模型会一个字一个字输出,中间夹太多空格会影响1-cer的结果 if lang == "zh": text = text.replace(" ", "") - text = text.replace("_", " ") elif model_type == "conformer": res = model.generate( input=segment_path, batch_size_s=300 ) text = res[0]["text"] - text = text.replace("_", " ") # elif model_type == "uni_asr": # if i == 0: # os.remove(segment_path) @@ -221,6 +220,10 @@ def test_funasr(audio_file, lang): # text = res[0]["text"] else: raise RuntimeError("unknown model type") + if text is not None: + # some models output "▁" (9601, Unicode U+2581) as separator between words, replace them with space for better readability + text = text.replace("_", " ") + text = text.replace(chr(9601), " ") ts2 = time.time() generated_text += text processing_time += (ts2 - ts1)