Skip to content

Commit d8f5a18

Browse files
Perf/lexer faster slow get char and size (#70543)
Co-authored-by: serge-sans-paille <[email protected]>
1 parent 31b9121 commit d8f5a18

File tree

3 files changed

+60
-53
lines changed

3 files changed

+60
-53
lines changed

clang/include/clang/Lex/Lexer.h

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -575,19 +575,23 @@ class Lexer : public PreprocessorLexer {
575575
/// sequence.
576576
static bool isNewLineEscaped(const char *BufferStart, const char *Str);
577577

578+
/// Represents a char and the number of bytes parsed to produce it.
579+
struct SizedChar {
580+
char Char;
581+
unsigned Size;
582+
};
583+
578584
/// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
579585
/// emit a warning.
580-
static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
581-
const LangOptions &LangOpts) {
586+
static inline SizedChar getCharAndSizeNoWarn(const char *Ptr,
587+
const LangOptions &LangOpts) {
582588
// If this is not a trigraph and not a UCN or escaped newline, return
583589
// quickly.
584590
if (isObviouslySimpleCharacter(Ptr[0])) {
585-
Size = 1;
586-
return *Ptr;
591+
return {*Ptr, 1u};
587592
}
588593

589-
Size = 0;
590-
return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
594+
return getCharAndSizeSlowNoWarn(Ptr, LangOpts);
591595
}
592596

593597
/// Returns the leading whitespace for line that corresponds to the given
@@ -665,8 +669,7 @@ class Lexer : public PreprocessorLexer {
665669
// quickly.
666670
if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
667671

668-
unsigned Size = 0;
669-
char C = getCharAndSizeSlow(Ptr, Size, &Tok);
672+
auto [C, Size] = getCharAndSizeSlow(Ptr, &Tok);
670673
Ptr += Size;
671674
return C;
672675
}
@@ -682,9 +685,7 @@ class Lexer : public PreprocessorLexer {
682685

683686
// Otherwise, re-lex the character with a current token, allowing
684687
// diagnostics to be emitted and flags to be set.
685-
Size = 0;
686-
getCharAndSizeSlow(Ptr, Size, &Tok);
687-
return Ptr+Size;
688+
return Ptr + getCharAndSizeSlow(Ptr, &Tok).Size;
688689
}
689690

690691
/// getCharAndSize - Peek a single 'character' from the specified buffer,
@@ -699,14 +700,14 @@ class Lexer : public PreprocessorLexer {
699700
return *Ptr;
700701
}
701702

702-
Size = 0;
703-
return getCharAndSizeSlow(Ptr, Size);
703+
auto CharAndSize = getCharAndSizeSlow(Ptr);
704+
Size = CharAndSize.Size;
705+
return CharAndSize.Char;
704706
}
705707

706708
/// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
707709
/// method.
708-
char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
709-
Token *Tok = nullptr);
710+
SizedChar getCharAndSizeSlow(const char *Ptr, Token *Tok = nullptr);
710711

711712
/// getEscapedNewLineSize - Return the size of the specified escaped newline,
712713
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
@@ -720,8 +721,8 @@ class Lexer : public PreprocessorLexer {
720721

721722
/// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
722723
/// diagnostic.
723-
static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
724-
const LangOptions &LangOpts);
724+
static SizedChar getCharAndSizeSlowNoWarn(const char *Ptr,
725+
const LangOptions &LangOpts);
725726

726727
//===--------------------------------------------------------------------===//
727728
// Other lexer functions.

clang/lib/Lex/DependencyDirectivesScanner.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -565,9 +565,8 @@ Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
565565
const char *BufPtr = Input.begin() + Tok.Offset;
566566
const char *AfterIdent = Input.begin() + Tok.getEnd();
567567
while (BufPtr < AfterIdent) {
568-
unsigned Size;
569-
Spelling[SpellingLength++] =
570-
Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
568+
auto [Char, Size] = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
569+
Spelling[SpellingLength++] = Char;
571570
BufPtr += Size;
572571
}
573572

clang/lib/Lex/Lexer.cpp

Lines changed: 40 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -287,9 +287,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
287287
if (tok::isStringLiteral(Tok.getKind())) {
288288
// Munch the encoding-prefix and opening double-quote.
289289
while (BufPtr < BufEnd) {
290-
unsigned Size;
291-
Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
292-
BufPtr += Size;
290+
auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
291+
Spelling[Length++] = CharAndSize.Char;
292+
BufPtr += CharAndSize.Size;
293293

294294
if (Spelling[Length - 1] == '"')
295295
break;
@@ -316,9 +316,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
316316
}
317317

318318
while (BufPtr < BufEnd) {
319-
unsigned Size;
320-
Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
321-
BufPtr += Size;
319+
auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
320+
Spelling[Length++] = CharAndSize.Char;
321+
BufPtr += CharAndSize.Size;
322322
}
323323

324324
assert(Length < Tok.getLength() &&
@@ -772,10 +772,9 @@ unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
772772
// If we have a character that may be a trigraph or escaped newline, use a
773773
// lexer to parse it correctly.
774774
for (; CharNo; --CharNo) {
775-
unsigned Size;
776-
Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
777-
TokPtr += Size;
778-
PhysOffset += Size;
775+
auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
776+
TokPtr += CharAndSize.Size;
777+
PhysOffset += CharAndSize.Size;
779778
}
780779

781780
// Final detail: if we end up on an escaped newline, we want to return the
@@ -1357,15 +1356,16 @@ SourceLocation Lexer::findLocationAfterToken(
13571356
///
13581357
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
13591358
/// be updated to match.
1360-
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1361-
Token *Tok) {
1359+
Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
1360+
unsigned Size = 0;
13621361
// If we have a slash, look for an escaped newline.
13631362
if (Ptr[0] == '\\') {
13641363
++Size;
13651364
++Ptr;
13661365
Slash:
13671366
// Common case, backslash-char where the char is not whitespace.
1368-
if (!isWhitespace(Ptr[0])) return '\\';
1367+
if (!isWhitespace(Ptr[0]))
1368+
return {'\\', Size};
13691369

13701370
// See if we have optional whitespace characters between the slash and
13711371
// newline.
@@ -1382,11 +1382,13 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
13821382
Ptr += EscapedNewLineSize;
13831383

13841384
// Use slow version to accumulate a correct size field.
1385-
return getCharAndSizeSlow(Ptr, Size, Tok);
1385+
auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1386+
CharAndSize.Size += Size;
1387+
return CharAndSize;
13861388
}
13871389

13881390
// Otherwise, this is not an escaped newline, just return the slash.
1389-
return '\\';
1391+
return {'\\', Size};
13901392
}
13911393

13921394
// If this is a trigraph, process it.
@@ -1401,13 +1403,12 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
14011403
Ptr += 3;
14021404
Size += 3;
14031405
if (C == '\\') goto Slash;
1404-
return C;
1406+
return {C, Size};
14051407
}
14061408
}
14071409

14081410
// If this is neither, return a single character.
1409-
++Size;
1410-
return *Ptr;
1411+
return {*Ptr, Size + 1u};
14111412
}
14121413

14131414
/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
@@ -1416,15 +1417,18 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
14161417
///
14171418
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
14181419
/// be updated to match.
1419-
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1420-
const LangOptions &LangOpts) {
1420+
Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1421+
const LangOptions &LangOpts) {
1422+
1423+
unsigned Size = 0;
14211424
// If we have a slash, look for an escaped newline.
14221425
if (Ptr[0] == '\\') {
14231426
++Size;
14241427
++Ptr;
14251428
Slash:
14261429
// Common case, backslash-char where the char is not whitespace.
1427-
if (!isWhitespace(Ptr[0])) return '\\';
1430+
if (!isWhitespace(Ptr[0]))
1431+
return {'\\', Size};
14281432

14291433
// See if we have optional whitespace characters followed by a newline.
14301434
if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
@@ -1433,11 +1437,13 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
14331437
Ptr += EscapedNewLineSize;
14341438

14351439
// Use slow version to accumulate a correct size field.
1436-
return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1440+
auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1441+
CharAndSize.Size += Size;
1442+
return CharAndSize;
14371443
}
14381444

14391445
// Otherwise, this is not an escaped newline, just return the slash.
1440-
return '\\';
1446+
return {'\\', Size};
14411447
}
14421448

14431449
// If this is a trigraph, process it.
@@ -1448,13 +1454,12 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
14481454
Ptr += 3;
14491455
Size += 3;
14501456
if (C == '\\') goto Slash;
1451-
return C;
1457+
return {C, Size};
14521458
}
14531459
}
14541460

14551461
// If this is neither, return a single character.
1456-
++Size;
1457-
return *Ptr;
1462+
return {*Ptr, Size + 1u};
14581463
}
14591464

14601465
//===----------------------------------------------------------------------===//
@@ -1964,11 +1969,14 @@ bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
19641969
/// isHexaLiteral - Return true if Start points to a hex constant
19651970
/// in Microsoft mode (where this is supposed to be several different tokens).
19661971
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1967-
unsigned Size;
1968-
char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1972+
auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
1973+
char C1 = CharAndSize1.Char;
19691974
if (C1 != '0')
19701975
return false;
1971-
char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1976+
1977+
auto CharAndSize2 =
1978+
Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
1979+
char C2 = CharAndSize2.Char;
19721980
return (C2 == 'x' || C2 == 'X');
19731981
}
19741982

@@ -2012,8 +2020,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
20122020

20132021
// If we have a digit separator, continue.
20142022
if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2015-
unsigned NextSize;
2016-
char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
2023+
auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
20172024
if (isAsciiIdentifierContinue(Next)) {
20182025
if (!isLexingRawMode())
20192026
Diag(CurPtr, LangOpts.CPlusPlus
@@ -2085,8 +2092,8 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
20852092
unsigned Consumed = Size;
20862093
unsigned Chars = 1;
20872094
while (true) {
2088-
unsigned NextSize;
2089-
char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
2095+
auto [Next, NextSize] =
2096+
getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
20902097
if (!isAsciiIdentifierContinue(Next)) {
20912098
// End of suffix. Check whether this is on the allowed list.
20922099
const StringRef CompleteSuffix(Buffer, Chars);

0 commit comments

Comments
 (0)