1 file changed: +7 -2 lines changed
Original file line number | Diff line number | Diff line change
@@ -3079,7 +3079,7 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
3079
3079
else if (collecting_special && (codepoint_type (utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type (utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type (utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
3080
3080
split_condition = true ;
3081
3081
}
3082
- else if (collecting_whitespace_lookahead && ( codepoint_type (utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type (utf_char_next) == CODEPOINT_TYPE_DIGIT) ) {
3082
+ else if (collecting_whitespace_lookahead && codepoint_type (utf_char_next) != CODEPOINT_TYPE_WHITESPACE ) {
3083
3083
split_condition = true ;
3084
3084
}
3085
3085
}
@@ -3101,7 +3101,12 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
3101
3101
collecting_whitespace_lookahead = false ;
3102
3102
}
3103
3103
else {
3104
- token += utf_char;
3104
+ if (codepoint_type (token) == CODEPOINT_TYPE_PUNCTUATION && codepoint_type (utf_char) == CODEPOINT_TYPE_LETTER) {
3105
+ bpe_words.emplace_back (token);
3106
+ token = utf_char;
3107
+ } else {
3108
+ token += utf_char;
3109
+ }
3105
3110
}
3106
3111
}
3107
3112
You can’t perform that action at this time.
0 commit comments