[Tokenizer] Fix decode output with space in decode_token (#9010)
* fix

* fix
DrownFish19 committed Sep 19, 2024
1 parent 90cef20 commit c93bada
Showing 1 changed file with 7 additions and 3 deletions.
paddlenlp/transformers/tokenizer_utils.py
@@ -1881,10 +1881,14 @@ def decode_token(
         """tokenizer decoding for the streaming generation use case. This method can be overrided for tokenizer that doesn't follow this API"""
         # The prefix text is necessary only to defeat cleanup algorithms in the decode
         # which decide to add a space or not depending on the surrounding ids.
-        prefix_text = self.decode(all_input_ids[prefix_offset:read_offset], skip_special_tokens=False)
-        new_text = self.decode(all_input_ids[prefix_offset:], skip_special_tokens=False)
+        prefix_text = self.decode(
+            all_input_ids[prefix_offset:read_offset], skip_special_tokens=False, clean_up_tokenization_spaces=False
+        )
+        new_text = self.decode(
+            all_input_ids[prefix_offset:], skip_special_tokens=False, clean_up_tokenization_spaces=False
+        )

-        if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
+        if len(new_text) > len(prefix_text) and not prefix_text.endswith("�") and not new_text.endswith("�"):
             # utf-8 char at the end means it's a potential unfinished byte sequence
             # from byte fallback tokenization.
             # If it's in the middle, it's probably a real invalid id generated
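
Why the new clean_up_tokenization_spaces=False arguments matter: decode_token streams text by decoding a growing window of ids and emitting only the part of new_text that extends past prefix_text, so prefix_text must be a character-for-character prefix of new_text. The cleanup pass can rewrite spacing around the decoded ids (for example, removing the space a token contributes before punctuation), which makes the two decodes disagree and drops characters from the stream. A minimal illustration with hypothetical strings (the exact cleanup rules depend on the tokenizer; the emitting step is sketched here as a plain slice):

# With clean_up_tokenization_spaces=True (the old behaviour), cleanup can eat
# the space contributed by the newest token:
prefix_text = "Hello"   # decode(all_input_ids[prefix_offset:read_offset])
new_text = "Hello."     # decode(all_input_ids[prefix_offset:]) after cleanup of "Hello ."
print(new_text[len(prefix_text):])  # "."  -- the space never reaches the stream

# With clean_up_tokenization_spaces=False (this commit), raw spacing survives:
new_text = "Hello ."
print(new_text[len(prefix_text):])  # " ." -- matches the non-streaming decode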

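For context, a sketch of the streaming loop this method serves. It assumes decode_token returns the newly finalized text plus updated offsets, in the style of TGI-like incremental detokenizers; the checkpoint name and the next_token_ids() generator are hypothetical:

from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("some-checkpoint")  # hypothetical name

all_input_ids = []                 # grows by one id per generation step
prefix_offset = read_offset = 0

for token_id in next_token_ids():  # hypothetical per-step token source
    all_input_ids.append(token_id)
    new_text, prefix_offset, read_offset = tokenizer.decode_token(
        all_input_ids, prefix_offset, read_offset
    )
    # new_text is "" while the tail decodes to the replacement char "�",
    # i.e. an unfinished byte-fallback sequence; the second change above also
    # holds the offsets back while the prefix itself still ends mid-character.
    print(new_text, end="", flush=True)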