Skip to content

Commit 047ae5b

Browse files
authored
reduce error rate
1 parent 56a7a22 commit 047ae5b

File tree

1 file changed

+7
-2
lines changed

1 file changed

+7
-2
lines changed

whisper.cpp

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3079,7 +3079,7 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
3079 3079
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
3080 3080
split_condition = true;
3081 3081
}
3082-
else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
3082+
else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
3083 3083
split_condition = true;
3084 3084
}
3085 3085
}
@@ -3101,7 +3101,12 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
3101 3101
collecting_whitespace_lookahead = false;
3102 3102
}
3103 3103
else {
3104-
token += utf_char;
3104+
if (codepoint_type(token) == CODEPOINT_TYPE_PUNCTUATION && codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER) {
3105+
bpe_words.emplace_back(token);
3106+
token = utf_char;
3107+
} else {
3108+
token += utf_char;
3109+
}
3105 3110
}
3106 3111
}
3107 3112

0 commit comments

Comments
 (0)