Skip to content

Commit 8f0160b

Browse files
authored
Merge pull request #445 from hamishknight/trivia-pursuit-5.7
2 parents 3d44861 + 4328e73 commit 8f0160b

File tree

7 files changed

+236
-143
lines changed

7 files changed

+236
-143
lines changed

Sources/_RegexParser/Regex/AST/CustomCharClass.swift

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,16 @@ extension AST {
5151
public var lhs: Atom
5252
public var dashLoc: SourceLocation
5353
public var rhs: Atom
54+
public var trivia: [AST.Trivia]
5455

55-
public init(_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom) {
56+
public init(
57+
_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom,
58+
trivia: [AST.Trivia]
59+
) {
5660
self.lhs = lhs
5761
self.dashLoc = dashLoc
5862
self.rhs = rhs
63+
self.trivia = trivia
5964
}
6065
}
6166
public enum SetOp: String, Hashable {
@@ -95,6 +100,11 @@ extension CustomCC.Member {
95100
return false
96101
}
97102

103+
public var asTrivia: AST.Trivia? {
104+
guard case .trivia(let t) = self else { return nil }
105+
return t
106+
}
107+
98108
public var isSemantic: Bool {
99109
!isTrivia
100110
}

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 74 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,14 @@ extension Source {
149149
return result
150150
}
151151

152+
/// Perform a lookahead using a temporary source. Within the body of the
153+
/// lookahead, any modifications to the source will not be reflected outside
154+
/// the body.
155+
func lookahead<T>(_ body: (inout Source) throws -> T) rethrows -> T {
156+
var src = self
157+
return try body(&src)
158+
}
159+
152160
/// Attempt to eat the given character, returning its source location if
153161
/// successful, `nil` otherwise.
154162
mutating func tryEatWithLoc(_ c: Character) -> SourceLocation? {
@@ -413,9 +421,7 @@ extension Source {
413421
) throws -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
414422
var trivia: [AST.Trivia] = []
415423

416-
if let t = try lexNonSemanticWhitespace(context: context) {
417-
trivia.append(t)
418-
}
424+
if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) }
419425

420426
let amt: Located<Quant.Amount>? = try recordLoc { src in
421427
if src.tryEat("*") { return .zeroOrMore }
@@ -424,7 +430,7 @@ extension Source {
424430

425431
return try src.tryEating { src in
426432
guard src.tryEat("{"),
427-
let range = try src.lexRange(context: context),
433+
let range = try src.lexRange(context: context, trivia: &trivia),
428434
src.tryEat("}")
429435
else { return nil }
430436
return range.value
@@ -433,9 +439,7 @@ extension Source {
433439
guard let amt = amt else { return nil }
434440

435441
// PCRE allows non-semantic whitespace here in extended syntax mode.
436-
if let t = try lexNonSemanticWhitespace(context: context) {
437-
trivia.append(t)
438-
}
442+
if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) }
439443

440444
let kind: Located<Quant.Kind> = recordLoc { src in
441445
if src.tryEat("?") { return .reluctant }
@@ -452,11 +456,17 @@ extension Source {
452456
/// | ExpRange
453457
/// ExpRange -> '..<' <Int> | '...' <Int>
454458
/// | <Int> '..<' <Int> | <Int> '...' <Int>?
455-
mutating func lexRange(context: ParsingContext) throws -> Located<Quant.Amount>? {
459+
mutating func lexRange(
460+
context: ParsingContext, trivia: inout [AST.Trivia]
461+
) throws -> Located<Quant.Amount>? {
456462
try recordLoc { src in
457463
try src.tryEating { src in
464+
if let t = src.lexWhitespace() { trivia.append(t) }
465+
458466
let lowerOpt = try src.lexNumber()
459467

468+
if let t = src.lexWhitespace() { trivia.append(t) }
469+
460470
// ',' or '...' or '..<' or nothing
461471
// TODO: We ought to try and consume whitespace here and emit a
462472
// diagnostic for the user warning them that it would cause the range to
@@ -476,11 +486,15 @@ extension Source {
476486
closedRange = nil
477487
}
478488

489+
if let t = src.lexWhitespace() { trivia.append(t) }
490+
479491
let upperOpt = try src.lexNumber()?.map { upper in
480492
// If we have an open range, the upper bound should be adjusted down.
481493
closedRange == true ? upper : upper - 1
482494
}
483495

496+
if let t = src.lexWhitespace() { trivia.append(t) }
497+
484498
switch (lowerOpt, closedRange, upperOpt) {
485499
case let (l?, nil, nil):
486500
return .exactly(l)
@@ -625,11 +639,11 @@ extension Source {
625639
///
626640
mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? {
627641
let trivia: Located<String>? = try recordLoc { src in
628-
if src.tryEat(sequence: "(?#") {
629-
return try src.expectQuoted(endingWith: ")").value
642+
if !context.isInCustomCharacterClass && src.tryEat(sequence: "(?#") {
643+
return try src.lexUntil(eating: ")").value
630644
}
631645
if context.experimentalComments, src.tryEat(sequence: "/*") {
632-
return try src.expectQuoted(endingWith: "*/").value
646+
return try src.lexUntil(eating: "*/").value
633647
}
634648
if context.endOfLineComments, src.tryEat("#") {
635649
// Try eat until we either exhaust the input, or hit a newline. Note
@@ -667,7 +681,7 @@ extension Source {
667681
/// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set
668682
mutating func lexNonSemanticWhitespace(
669683
context: ParsingContext
670-
) throws -> AST.Trivia? {
684+
) -> AST.Trivia? {
671685
guard context.ignoreWhitespace else { return nil }
672686

673687
// FIXME: PCRE only treats space and tab characters as whitespace when
@@ -699,7 +713,7 @@ extension Source {
699713
if let comment = try lexComment(context: context) {
700714
return comment
701715
}
702-
if let whitespace = try lexNonSemanticWhitespace(context: context) {
716+
if let whitespace = lexNonSemanticWhitespace(context: context) {
703717
return whitespace
704718
}
705719
return nil
@@ -1178,8 +1192,7 @@ extension Source {
11781192
}
11791193
}
11801194

1181-
mutating func lexCustomCCStart(
1182-
) throws -> Located<CustomCC.Start>? {
1195+
mutating func lexCustomCCStart() -> Located<CustomCC.Start>? {
11831196
recordLoc { src in
11841197
// Make sure we don't have a POSIX character property. This may require
11851198
// walking to its ending to make sure we have a closing ':]', as otherwise
@@ -1240,8 +1253,9 @@ extension Source {
12401253

12411254
private func canLexPOSIXCharacterProperty() -> Bool {
12421255
do {
1243-
var src = self
1244-
return try src.lexPOSIXCharacterProperty() != nil
1256+
return try lookahead { src in
1257+
try src.lexPOSIXCharacterProperty() != nil
1258+
}
12451259
} catch {
12461260
// We want to err on the side of lexing a POSIX character property, so
12471261
// even if it is invalid in some way (e.g. invalid property names), still
@@ -1394,10 +1408,11 @@ extension Source {
13941408

13951409
/// Checks whether a numbered reference can be lexed.
13961410
private func canLexNumberedReference() -> Bool {
1397-
var src = self
1398-
_ = src.tryEat(anyOf: "+", "-")
1399-
guard let next = src.peek() else { return false }
1400-
return RadixKind.decimal.characterFilter(next)
1411+
lookahead { src in
1412+
_ = src.tryEat(anyOf: "+", "-")
1413+
guard let next = src.peek() else { return false }
1414+
return RadixKind.decimal.characterFilter(next)
1415+
}
14011416
}
14021417

14031418
/// Eat a named reference up to a given closing delimiter.
@@ -1587,53 +1602,55 @@ extension Source {
15871602

15881603
/// Whether we can lex a group-like reference after the specifier '(?'.
15891604
private func canLexGroupLikeReference() -> Bool {
1590-
var src = self
1591-
if src.tryEat("P") {
1592-
return src.tryEat(anyOf: "=", ">") != nil
1593-
}
1594-
if src.tryEat(anyOf: "&", "R") != nil {
1595-
return true
1605+
lookahead { src in
1606+
if src.tryEat("P") {
1607+
return src.tryEat(anyOf: "=", ">") != nil
1608+
}
1609+
if src.tryEat(anyOf: "&", "R") != nil {
1610+
return true
1611+
}
1612+
return src.canLexNumberedReference()
15961613
}
1597-
return src.canLexNumberedReference()
15981614
}
15991615

16001616
private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool {
1601-
var src = self
1602-
1603-
// See if we can lex a matching option sequence that terminates in ')'. Such
1604-
// a sequence is an atom. If an error is thrown, there are invalid elements
1605-
// of the matching option sequence. In such a case, we can lex as a group
1606-
// and diagnose the invalid group kind.
1607-
guard (try? src.lexMatchingOptionSequence(context: context)) != nil else {
1608-
return false
1617+
lookahead { src in
1618+
// See if we can lex a matching option sequence that terminates in ')'.
1619+
// Such a sequence is an atom. If an error is thrown, there are invalid
1620+
// elements of the matching option sequence. In such a case, we can lex as
1621+
// a group and diagnose the invalid group kind.
1622+
guard (try? src.lexMatchingOptionSequence(context: context)) != nil else {
1623+
return false
1624+
}
1625+
return src.tryEat(")")
16091626
}
1610-
return src.tryEat(")")
16111627
}
16121628

16131629
/// Whether a group specifier should be lexed as an atom instead of a group.
16141630
private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool {
1615-
var src = self
1616-
guard src.tryEat("(") else { return false }
1631+
lookahead { src in
1632+
guard src.tryEat("(") else { return false }
16171633

1618-
if src.tryEat("?") {
1619-
// The start of a reference '(?P=', '(?R', ...
1620-
if src.canLexGroupLikeReference() { return true }
1634+
if src.tryEat("?") {
1635+
// The start of a reference '(?P=', '(?R', ...
1636+
if src.canLexGroupLikeReference() { return true }
16211637

1622-
// The start of a PCRE callout.
1623-
if src.tryEat("C") { return true }
1638+
// The start of a PCRE callout.
1639+
if src.tryEat("C") { return true }
16241640

1625-
// The start of an Oniguruma 'of-contents' callout.
1626-
if src.tryEat("{") { return true }
1641+
// The start of an Oniguruma 'of-contents' callout.
1642+
if src.tryEat("{") { return true }
16271643

1628-
// A matching option atom (?x), (?i), ...
1629-
if src.canLexMatchingOptionsAsAtom(context: context) { return true }
1644+
// A matching option atom (?x), (?i), ...
1645+
if src.canLexMatchingOptionsAsAtom(context: context) { return true }
1646+
1647+
return false
1648+
}
1649+
// The start of a backreference directive or Oniguruma named callout.
1650+
if src.tryEat("*") { return true }
16301651

16311652
return false
16321653
}
1633-
// The start of a backreference directive or Oniguruma named callout.
1634-
if src.tryEat("*") { return true }
1635-
1636-
return false
16371654
}
16381655

16391656
/// Consume an escaped atom, starting from after the backslash
@@ -2022,20 +2039,11 @@ extension Source {
20222039
return AST.Atom(kind.value, kind.location)
20232040
}
20242041

2025-
/// Try to lex the end of a range in a custom character class, which consists
2026-
/// of a '-' character followed by an atom.
2027-
mutating func lexCustomCharClassRangeEnd(
2028-
context: ParsingContext
2029-
) throws -> (dashLoc: SourceLocation, AST.Atom)? {
2030-
// Make sure we don't have a binary operator e.g '--', and the '-' is not
2031-
// ending the custom character class (in which case it is literal).
2032-
guard peekCCBinOp() == nil, !starts(with: "-]"),
2033-
let dash = tryEatWithLoc("-"),
2034-
let end = try lexAtom(context: context)
2035-
else {
2036-
return nil
2037-
}
2038-
return (dash, end)
2042+
/// Try to lex the range operator '-' for a custom character class.
2043+
mutating func lexCustomCharacterClassRangeOperator() -> SourceLocation? {
2044+
// Eat a '-', making sure we don't have a binary op such as '--'.
2045+
guard peekCCBinOp() == nil else { return nil }
2046+
return tryEatWithLoc("-")
20392047
}
20402048

20412049
/// Try to consume a newline sequence matching option kind.

0 commit comments

Comments
 (0)