diff --git a/paddlenlp/transformers/tokenizer_utils.py b/paddlenlp/transformers/tokenizer_utils.py
index 048c2fc40a7..a13a1f5698f 100644
--- a/paddlenlp/transformers/tokenizer_utils.py
+++ b/paddlenlp/transformers/tokenizer_utils.py
@@ -1881,10 +1881,14 @@ def decode_token(
         """tokenizer decoding for the streaming generation use case. This method can be overrided for tokenizer that doesn't follow this API"""
         # The prefix text is necessary only to defeat cleanup algorithms in the decode
         # which decide to add a space or not depending on the surrounding ids.
-        prefix_text = self.decode(all_input_ids[prefix_offset:read_offset], skip_special_tokens=False)
-        new_text = self.decode(all_input_ids[prefix_offset:], skip_special_tokens=False)
+        prefix_text = self.decode(
+            all_input_ids[prefix_offset:read_offset], skip_special_tokens=False, clean_up_tokenization_spaces=False
+        )
+        new_text = self.decode(
+            all_input_ids[prefix_offset:], skip_special_tokens=False, clean_up_tokenization_spaces=False
+        )
 
-        if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
+        if len(new_text) > len(prefix_text) and not prefix_text.endswith("�") and not new_text.endswith("�"):
             # utf-8 char at the end means it's a potential unfinished byte sequence
             # from byte fallback tokenization.
             # If it's in the middle, it's probably a real invalid id generated
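
For context, here is a minimal standalone sketch of the streaming-decode logic after this patch. It assumes a Hugging Face-style `tokenizer.decode` API; the function name `decode_token_incremental` and its free-function signature are illustrative, not the actual PaddleNLP method. It mirrors the two changes in the diff: decoding with `clean_up_tokenization_spaces=False` so the prefix/suffix comparison runs on raw tokenizer output, and holding text back while *either* string ends in the U+FFFD replacement character.

```python
# Hedged sketch of the patched streaming decode, not the PaddleNLP source.
# `all_input_ids`, `prefix_offset`, and `read_offset` mirror the names in the diff.

def decode_token_incremental(tokenizer, all_input_ids, prefix_offset=0, read_offset=0):
    """Return (emitted_text, new_prefix_offset, new_read_offset) for one streaming step."""
    # Decode with cleanup disabled: cleanup heuristics may add or drop spaces
    # depending on surrounding ids, which would make the prefix/suffix
    # comparison below unreliable.
    prefix_text = tokenizer.decode(
        all_input_ids[prefix_offset:read_offset],
        skip_special_tokens=False,
        clean_up_tokenization_spaces=False,
    )
    new_text = tokenizer.decode(
        all_input_ids[prefix_offset:],
        skip_special_tokens=False,
        clean_up_tokenization_spaces=False,
    )

    # A trailing "�" (U+FFFD) signals an unfinished UTF-8 byte sequence from
    # byte-fallback tokenization. The patch also checks prefix_text, so we
    # never compute a suffix against a prefix that itself ends mid-character.
    if len(new_text) > len(prefix_text) and not prefix_text.endswith("�") and not new_text.endswith("�"):
        emitted = new_text[len(prefix_text):]
        return emitted, read_offset, len(all_input_ids)

    # Otherwise hold everything back until more ids complete the sequence.
    return "", prefix_offset, read_offset
```

The design point of the extra `prefix_text.endswith("�")` guard: if the prefix slice ends on a partial multi-byte character, `prefix_text` and the start of `new_text` can disagree, so slicing off `len(prefix_text)` characters would emit corrupted text; waiting for more ids is the safe choice.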