From a1609fb4996908e357670367fb9bfb40d0227604 Mon Sep 17 00:00:00 2001
From: sunjichen <sunjichen@4paradigm.com>
Date: Wed, 11 Feb 2026 16:39:16 +0800
Subject: [PATCH] Fix model output Unicode U+2581 separator related issues

---
 fastapi_funasr.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fastapi_funasr.py b/fastapi_funasr.py
index 1f5273b..8f7870c 100644
--- a/fastapi_funasr.py
+++ b/fastapi_funasr.py
@@ -168,6 +168,7 @@ def test_funasr(audio_file, lang):
             segment_path = f"temp_seg_{i}.wav"
             torchaudio.save(segment_path, segment, sample_rate)
             ts1 = time.time()
+            text = None
             if model_type == "sensevoice":
                 res = model.generate(
                     input=segment_path,
@@ -203,14 +204,12 @@ def test_funasr(audio_file, lang):
                 # paraformer模型会一个字一个字输出，中间夹太多空格会影响1-cer的结果
                 if lang == "zh":
                     text = text.replace(" ", "")
-                text = text.replace("_", " ")
             elif model_type == "conformer":
                 res = model.generate(
                     input=segment_path,
                     batch_size_s=300
                 )
                 text = res[0]["text"]
-                text = text.replace("_", " ")
             # elif model_type == "uni_asr":
             #     if i == 0:
             #         os.remove(segment_path)
@@ -221,6 +220,10 @@ def test_funasr(audio_file, lang):
             #     text = res[0]["text"]
             else:
                 raise RuntimeError("unknown model type")
+            if text is not None:
+                # Some models emit "_" or "▁" (U+2581, code point 9601) as a separator between words; replace both with a space for better readability.
+                text = text.replace("_", " ")
+                text = text.replace(chr(9601), " ")
             ts2 = time.time()
             generated_text += text
             processing_time += (ts2 - ts1)