Skip to content

Commit dcb4f79

Browse files
authored
Only match ASCII letters and digits (#2312)
* Move `isAsciiLetter`, `isDigit`, and `isHexDigit` to `StringUtil`
* Only check for ASCII digits in `StringUtil.isNumeric()`
* Replace calls to `Character.isLetter()`
1 parent c4dd25e commit dcb4f79

File tree

6 files changed

+97
-35
lines changed

6 files changed

+97
-35
lines changed

src/main/java/org/jsoup/internal/StringUtil.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ public static boolean startsWithNewline(final String string) {
172172
}
173173

174174
/**
175-
* Tests if a string is numeric, i.e. contains only digit characters
175+
* Tests if a string is numeric, i.e. contains only ASCII digit characters
176176
* @param string string to test
177177
* @return true if only digit chars, false if empty or null or contains non-digit chars
178178
*/
@@ -182,7 +182,7 @@ public static boolean isNumeric(String string) {
182182

183183
int l = string.length();
184184
for (int i = 0; i < l; i++) {
185-
if (!Character.isDigit(string.codePointAt(i)))
185+
if (!isDigit(string.charAt(i)))
186186
return false;
187187
}
188188
return true;
@@ -394,4 +394,16 @@ public static void releaseBuilderVoid(StringBuilder sb) {
394394
},
395395
StringJoiner::complete);
396396
}
397+
398+
public static boolean isAsciiLetter(char c) {
399+
return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z';
400+
}
401+
402+
public static boolean isDigit(char c) {
403+
return c >= '0' && c <= '9';
404+
}
405+
406+
public static boolean isHexDigit(char c) {
407+
return isDigit(c) || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F';
408+
}
397409
}

src/main/java/org/jsoup/parser/CharacterReader.java

Lines changed: 6 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import org.jsoup.helper.Validate;
44
import org.jsoup.internal.SoftPool;
5+
import org.jsoup.internal.StringUtil;
56
import org.jspecify.annotations.Nullable;
67

78
import java.io.IOException;
@@ -480,19 +481,19 @@ String consumeLetterThenDigitSequence() {
480481
bufferUp();
481482
int start = bufPos;
482483
while (bufPos < bufLength) {
483-
if (Character.isLetter(charBuf[bufPos])) bufPos++;
484+
if (StringUtil.isAsciiLetter(charBuf[bufPos])) bufPos++;
484485
else break;
485486
}
486487
while (!isEmptyNoBufferUp()) {
487-
if (isDigit(charBuf[bufPos])) bufPos++;
488+
if (StringUtil.isDigit(charBuf[bufPos])) bufPos++;
488489
else break;
489490
}
490491

491492
return cacheString(charBuf, stringCache, start, bufPos - start);
492493
}
493494

494495
String consumeHexSequence() {
495-
return consumeMatching(CharacterReader::isHexDigit);
496+
return consumeMatching(StringUtil::isHexDigit);
496497
}
497498

498499
String consumeDigitSequence() {
@@ -556,23 +557,18 @@ boolean matchesAnySorted(char[] seq) {
556557
return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
557558
}
558559

559-
boolean matchesLetter() {
560-
if (isEmpty()) return false;
561-
return Character.isLetter(charBuf[bufPos]);
562-
}
563-
564560
/**
565561
Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha
566562
@return if it matches or not
567563
*/
568564
boolean matchesAsciiAlpha() {
569565
if (isEmpty()) return false;
570-
return isAsciiLetter(charBuf[bufPos]);
566+
return StringUtil.isAsciiLetter(charBuf[bufPos]);
571567
}
572568

573569
boolean matchesDigit() {
574570
if (isEmpty()) return false;
575-
return isDigit(charBuf[bufPos]);
571+
return StringUtil.isDigit(charBuf[bufPos]);
576572
}
577573

578574
boolean matchConsume(String seq) {
@@ -686,17 +682,4 @@ boolean rangeEquals(final int start, final int count, final String cached) {
686682
interface CharPredicate {
687683
boolean test(char c);
688684
}
689-
690-
// char predicate functions
691-
static boolean isAsciiLetter(char c) {
692-
return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z';
693-
}
694-
695-
static boolean isDigit(char c) {
696-
return c >= '0' && c <= '9';
697-
}
698-
699-
static boolean isHexDigit(char c) {
700-
return isDigit(c) || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F';
701-
}
702685
}

src/main/java/org/jsoup/parser/TokenQueue.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -310,13 +310,13 @@ public static String escapeCssIdentifier(String in) {
310310
out.append(Hyphen_Minus);
311311

312312
char secondChar = q.current();
313-
if (CharacterReader.isDigit(secondChar)) {
313+
if (StringUtil.isDigit(secondChar)) {
314314
// If the character is the second character and is in the range [0-9] (U+0030 to U+0039) and the
315315
// first character is a "-" (U+002D), then the character escaped as code point.
316316
appendEscapedCodepoint(out, q.consume());
317317
}
318318
}
319-
} else if (CharacterReader.isDigit(firstChar)) {
319+
} else if (StringUtil.isDigit(firstChar)) {
320320
// If the character is the first character and is in the range [0-9] (U+0030 to U+0039), then the character
321321
// escaped as code point.
322322
appendEscapedCodepoint(out, q.consume());
@@ -447,11 +447,11 @@ private void consumeCssEscapeSequenceInto(StringBuilder out) {
447447
}
448448

449449
char firstEscaped = consume();
450-
if (!CharacterReader.isHexDigit(firstEscaped)) {
450+
if (!StringUtil.isHexDigit(firstEscaped)) {
451451
out.append(firstEscaped);
452452
} else {
453453
reader.unconsume(); // put back the first hex digit
454-
String hexString = reader.consumeMatching(CharacterReader::isHexDigit, 6); // consume up to 6 hex digits
454+
String hexString = reader.consumeMatching(StringUtil::isHexDigit, 6); // consume up to 6 hex digits
455455
int codePoint;
456456
try {
457457
codePoint = Integer.parseInt(hexString, 16);
@@ -487,12 +487,12 @@ private static boolean isNonAscii(char c) {
487487

488488
// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
489489
private static boolean isIdentStart(char c) {
490-
return c == '_' || CharacterReader.isAsciiLetter(c) || isNonAscii(c);
490+
return c == '_' || StringUtil.isAsciiLetter(c) || isNonAscii(c);
491491
}
492492

493493
// https://www.w3.org/TR/css-syntax-3/#ident-code-point
494494
private static boolean isIdent(char c) {
495-
return c == Hyphen_Minus || CharacterReader.isDigit(c) || isIdentStart(c);
495+
return c == Hyphen_Minus || StringUtil.isDigit(c) || isIdentStart(c);
496496
}
497497

498498
// https://www.w3.org/TR/css-syntax-3/#newline

src/main/java/org/jsoup/parser/Tokeniser.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ void advanceTransition(TokeniserState newState) {
197197
reader.matchConsume(prefix);
198198
nameRef = prefix;
199199
}
200-
if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
200+
if (inAttribute && (reader.matchesAsciiAlpha() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
201201
// don't want that to match
202202
reader.rewindToMark();
203203
return null;

src/main/java/org/jsoup/parser/TokeniserState.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,7 +1197,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
11971197
},
11981198
DoctypeName {
11991199
@Override void read(Tokeniser t, CharacterReader r) {
1200-
if (r.matchesLetter()) {
1200+
if (r.matchesAsciiAlpha()) {
12011201
String name = r.consumeLetterSequence();
12021202
t.doctypePending.name.append(name);
12031203
return;
@@ -1672,7 +1672,7 @@ else if (r.matches('>')) {
16721672
* different else exit transitions.
16731673
*/
16741674
private static void handleDataEndTag(Tokeniser t, CharacterReader r, TokeniserState elseTransition) {
1675-
if (r.matchesLetter()) {
1675+
if (r.matchesAsciiAlpha()) {
16761676
String name = r.consumeLetterSequence();
16771677
t.tagPending.appendTagName(name);
16781678
t.dataBuffer.append(name);
@@ -1752,7 +1752,7 @@ private static void readEndTag(Tokeniser t, CharacterReader r, TokeniserState a,
17521752
}
17531753

17541754
private static void handleDataDoubleEscapeTag(Tokeniser t, CharacterReader r, TokeniserState primary, TokeniserState fallback) {
1755-
if (r.matchesLetter()) {
1755+
if (r.matchesAsciiAlpha()) {
17561756
String name = r.consumeLetterSequence();
17571757
t.dataBuffer.append(name);
17581758
t.emit(name);

src/test/java/org/jsoup/internal/StringUtilTest.java

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,4 +165,71 @@ void isAscii() {
165165
assertFalse(StringUtil.isAscii("测试"));
166166
assertFalse(StringUtil.isAscii("测试.com"));
167167
}
168+
169+
@Test void isAsciiLetter() {
170+
assertTrue(StringUtil.isAsciiLetter('a'));
171+
assertTrue(StringUtil.isAsciiLetter('n'));
172+
assertTrue(StringUtil.isAsciiLetter('z'));
173+
assertTrue(StringUtil.isAsciiLetter('A'));
174+
assertTrue(StringUtil.isAsciiLetter('N'));
175+
assertTrue(StringUtil.isAsciiLetter('Z'));
176+
177+
assertFalse(StringUtil.isAsciiLetter(' '));
178+
assertFalse(StringUtil.isAsciiLetter('-'));
179+
assertFalse(StringUtil.isAsciiLetter('0'));
180+
assertFalse(StringUtil.isAsciiLetter('ß'));
181+
assertFalse(StringUtil.isAsciiLetter('Ě'));
182+
}
183+
184+
@Test void isDigit() {
185+
assertTrue(StringUtil.isDigit('0'));
186+
assertTrue(StringUtil.isDigit('1'));
187+
assertTrue(StringUtil.isDigit('2'));
188+
assertTrue(StringUtil.isDigit('3'));
189+
assertTrue(StringUtil.isDigit('4'));
190+
assertTrue(StringUtil.isDigit('5'));
191+
assertTrue(StringUtil.isDigit('6'));
192+
assertTrue(StringUtil.isDigit('7'));
193+
assertTrue(StringUtil.isDigit('8'));
194+
assertTrue(StringUtil.isDigit('9'));
195+
196+
assertFalse(StringUtil.isDigit('a'));
197+
assertFalse(StringUtil.isDigit('A'));
198+
assertFalse(StringUtil.isDigit('ä'));
199+
assertFalse(StringUtil.isDigit('Ä'));
200+
assertFalse(StringUtil.isDigit('١'));
201+
assertFalse(StringUtil.isDigit('୳'));
202+
}
203+
204+
@Test void isHexDigit() {
205+
assertTrue(StringUtil.isHexDigit('0'));
206+
assertTrue(StringUtil.isHexDigit('1'));
207+
assertTrue(StringUtil.isHexDigit('2'));
208+
assertTrue(StringUtil.isHexDigit('3'));
209+
assertTrue(StringUtil.isHexDigit('4'));
210+
assertTrue(StringUtil.isHexDigit('5'));
211+
assertTrue(StringUtil.isHexDigit('6'));
212+
assertTrue(StringUtil.isHexDigit('7'));
213+
assertTrue(StringUtil.isHexDigit('8'));
214+
assertTrue(StringUtil.isHexDigit('9'));
215+
assertTrue(StringUtil.isHexDigit('a'));
216+
assertTrue(StringUtil.isHexDigit('b'));
217+
assertTrue(StringUtil.isHexDigit('c'));
218+
assertTrue(StringUtil.isHexDigit('d'));
219+
assertTrue(StringUtil.isHexDigit('e'));
220+
assertTrue(StringUtil.isHexDigit('f'));
221+
assertTrue(StringUtil.isHexDigit('A'));
222+
assertTrue(StringUtil.isHexDigit('B'));
223+
assertTrue(StringUtil.isHexDigit('C'));
224+
assertTrue(StringUtil.isHexDigit('D'));
225+
assertTrue(StringUtil.isHexDigit('E'));
226+
assertTrue(StringUtil.isHexDigit('F'));
227+
228+
assertFalse(StringUtil.isHexDigit('g'));
229+
assertFalse(StringUtil.isHexDigit('G'));
230+
assertFalse(StringUtil.isHexDigit('ä'));
231+
assertFalse(StringUtil.isHexDigit('Ä'));
232+
assertFalse(StringUtil.isHexDigit('١'));
233+
assertFalse(StringUtil.isHexDigit('୳'));
234+
}
168235
}

0 commit comments

Comments
 (0)