From 5729cef9487fd19290c71052242f0b7bd5565f1e Mon Sep 17 00:00:00 2001
From: x54-729
Date: Wed, 28 Feb 2024 13:33:29 +0800
Subject: [PATCH] fix no white space when using stream_chat with fast tokenizer

---
 tokenization_internlm2_fast.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tokenization_internlm2_fast.py b/tokenization_internlm2_fast.py
index 1506e11..4d9d5f1 100644
--- a/tokenization_internlm2_fast.py
+++ b/tokenization_internlm2_fast.py
@@ -56,14 +56,14 @@ class InternLM2Converter(SpmConverter):
         return unk_id
 
     def decoder(self, replacement, add_prefix_space):
-        return decoders.Sequence(
-            [
-                decoders.Replace("▁", " "),
-                decoders.ByteFallback(),
-                decoders.Fuse(),
-                decoders.Strip(content=" ", left=1),
-            ]
-        )
+        decoders_sequence = [
+            decoders.Replace("▁", " "),
+            decoders.ByteFallback(),
+            decoders.Fuse(),
+        ]
+        if self.proto.normalizer_spec.add_dummy_prefix:
+            decoders_sequence.append(decoders.Strip(content=" ", left=1))
+        return decoders.Sequence(decoders_sequence)
 
     def tokenizer(self, proto):
         model_type = proto.trainer_spec.model_type