Skip to content

Some trivia fixes #431

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
May 25, 2022
Merged
12 changes: 11 additions & 1 deletion Sources/_RegexParser/Regex/AST/CustomCharClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,16 @@ extension AST {
public var lhs: Atom
public var dashLoc: SourceLocation
public var rhs: Atom
public var trivia: [AST.Trivia]

public init(_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom) {
public init(
_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom,
trivia: [AST.Trivia]
) {
self.lhs = lhs
self.dashLoc = dashLoc
self.rhs = rhs
self.trivia = trivia
}
}
public enum SetOp: String, Hashable {
Expand Down Expand Up @@ -95,6 +100,11 @@ extension CustomCC.Member {
return false
}

/// The wrapped trivia when this member is the `.trivia` case, otherwise `nil`.
public var asTrivia: AST.Trivia? {
  if case .trivia(let trivia) = self {
    return trivia
  }
  return nil
}

/// Whether this member is not trivia, i.e. the negation of `isTrivia`.
public var isSemantic: Bool {
  return isTrivia == false
}
Expand Down
140 changes: 74 additions & 66 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,14 @@ extension Source {
return result
}

/// Run `body` against a temporary copy of this source and return its result.
/// Because the closure only ever mutates the copy, none of its modifications
/// are reflected in `self` once the lookahead finishes. Any error thrown by
/// `body` is rethrown to the caller.
func lookahead<T>(_ body: (inout Source) throws -> T) rethrows -> T {
  var scratch = self
  let result = try body(&scratch)
  return result
}

/// Attempt to eat the given character, returning its source location if
/// successful, `nil` otherwise.
mutating func tryEatWithLoc(_ c: Character) -> SourceLocation? {
Expand Down Expand Up @@ -413,9 +421,7 @@ extension Source {
) throws -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
var trivia: [AST.Trivia] = []

if let t = try lexNonSemanticWhitespace(context: context) {
trivia.append(t)
}
if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) }

let amt: Located<Quant.Amount>? = try recordLoc { src in
if src.tryEat("*") { return .zeroOrMore }
Expand All @@ -424,7 +430,7 @@ extension Source {

return try src.tryEating { src in
guard src.tryEat("{"),
let range = try src.lexRange(context: context),
let range = try src.lexRange(context: context, trivia: &trivia),
src.tryEat("}")
else { return nil }
return range.value
Expand All @@ -433,9 +439,7 @@ extension Source {
guard let amt = amt else { return nil }

// PCRE allows non-semantic whitespace here in extended syntax mode.
if let t = try lexNonSemanticWhitespace(context: context) {
trivia.append(t)
}
if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) }

let kind: Located<Quant.Kind> = recordLoc { src in
if src.tryEat("?") { return .reluctant }
Expand All @@ -452,11 +456,17 @@ extension Source {
/// | ExpRange
/// ExpRange -> '..<' <Int> | '...' <Int>
/// | <Int> '..<' <Int> | <Int> '...' <Int>?
mutating func lexRange(context: ParsingContext) throws -> Located<Quant.Amount>? {
mutating func lexRange(
context: ParsingContext, trivia: inout [AST.Trivia]
) throws -> Located<Quant.Amount>? {
try recordLoc { src in
try src.tryEating { src in
if let t = src.lexWhitespace() { trivia.append(t) }

let lowerOpt = try src.lexNumber()

if let t = src.lexWhitespace() { trivia.append(t) }

// ',' or '...' or '..<' or nothing
// TODO: We ought to try and consume whitespace here and emit a
// diagnostic for the user warning them that it would cause the range to
Expand All @@ -476,11 +486,15 @@ extension Source {
closedRange = nil
}

if let t = src.lexWhitespace() { trivia.append(t) }

let upperOpt = try src.lexNumber()?.map { upper in
// If we have an open range, the upper bound should be adjusted down.
closedRange == true ? upper : upper - 1
}

if let t = src.lexWhitespace() { trivia.append(t) }

switch (lowerOpt, closedRange, upperOpt) {
case let (l?, nil, nil):
return .exactly(l)
Expand Down Expand Up @@ -625,11 +639,11 @@ extension Source {
///
mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? {
let trivia: Located<String>? = try recordLoc { src in
if src.tryEat(sequence: "(?#") {
return try src.expectQuoted(endingWith: ")").value
if !context.isInCustomCharacterClass && src.tryEat(sequence: "(?#") {
return try src.lexUntil(eating: ")").value
}
if context.experimentalComments, src.tryEat(sequence: "/*") {
return try src.expectQuoted(endingWith: "*/").value
return try src.lexUntil(eating: "*/").value
}
if context.endOfLineComments, src.tryEat("#") {
// Try eat until we either exhaust the input, or hit a newline. Note
Expand Down Expand Up @@ -667,7 +681,7 @@ extension Source {
/// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set
mutating func lexNonSemanticWhitespace(
context: ParsingContext
) throws -> AST.Trivia? {
) -> AST.Trivia? {
guard context.ignoreWhitespace else { return nil }

// FIXME: PCRE only treats space and tab characters as whitespace when
Expand Down Expand Up @@ -699,7 +713,7 @@ extension Source {
if let comment = try lexComment(context: context) {
return comment
}
if let whitespace = try lexNonSemanticWhitespace(context: context) {
if let whitespace = lexNonSemanticWhitespace(context: context) {
return whitespace
}
return nil
Expand Down Expand Up @@ -1178,8 +1192,7 @@ extension Source {
}
}

mutating func lexCustomCCStart(
) throws -> Located<CustomCC.Start>? {
mutating func lexCustomCCStart() -> Located<CustomCC.Start>? {
recordLoc { src in
// Make sure we don't have a POSIX character property. This may require
// walking to its ending to make sure we have a closing ':]', as otherwise
Expand Down Expand Up @@ -1240,8 +1253,9 @@ extension Source {

private func canLexPOSIXCharacterProperty() -> Bool {
do {
var src = self
return try src.lexPOSIXCharacterProperty() != nil
return try lookahead { src in
try src.lexPOSIXCharacterProperty() != nil
}
} catch {
// We want to err on the side of lexing a POSIX character property, so
// even if it is invalid in some way (e.g. invalid property names), still
Expand Down Expand Up @@ -1394,10 +1408,11 @@ extension Source {

/// Checks whether a numbered reference can be lexed.
private func canLexNumberedReference() -> Bool {
var src = self
_ = src.tryEat(anyOf: "+", "-")
guard let next = src.peek() else { return false }
return RadixKind.decimal.characterFilter(next)
lookahead { src in
_ = src.tryEat(anyOf: "+", "-")
guard let next = src.peek() else { return false }
return RadixKind.decimal.characterFilter(next)
}
}

/// Eat a named reference up to a given closing delimiter.
Expand Down Expand Up @@ -1587,53 +1602,55 @@ extension Source {

/// Whether we can lex a group-like reference after the specifier '(?'.
private func canLexGroupLikeReference() -> Bool {
var src = self
if src.tryEat("P") {
return src.tryEat(anyOf: "=", ">") != nil
}
if src.tryEat(anyOf: "&", "R") != nil {
return true
lookahead { src in
if src.tryEat("P") {
return src.tryEat(anyOf: "=", ">") != nil
}
if src.tryEat(anyOf: "&", "R") != nil {
return true
}
return src.canLexNumberedReference()
}
return src.canLexNumberedReference()
}

private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool {
var src = self

// See if we can lex a matching option sequence that terminates in ')'. Such
// a sequence is an atom. If an error is thrown, there are invalid elements
// of the matching option sequence. In such a case, we can lex as a group
// and diagnose the invalid group kind.
guard (try? src.lexMatchingOptionSequence(context: context)) != nil else {
return false
lookahead { src in
// See if we can lex a matching option sequence that terminates in ')'.
// Such a sequence is an atom. If an error is thrown, there are invalid
// elements of the matching option sequence. In such a case, we can lex as
// a group and diagnose the invalid group kind.
guard (try? src.lexMatchingOptionSequence(context: context)) != nil else {
return false
}
return src.tryEat(")")
}
return src.tryEat(")")
}

/// Whether a group specifier should be lexed as an atom instead of a group.
private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool {
var src = self
guard src.tryEat("(") else { return false }
lookahead { src in
guard src.tryEat("(") else { return false }

if src.tryEat("?") {
// The start of a reference '(?P=', '(?R', ...
if src.canLexGroupLikeReference() { return true }
if src.tryEat("?") {
// The start of a reference '(?P=', '(?R', ...
if src.canLexGroupLikeReference() { return true }

// The start of a PCRE callout.
if src.tryEat("C") { return true }
// The start of a PCRE callout.
if src.tryEat("C") { return true }

// The start of an Oniguruma 'of-contents' callout.
if src.tryEat("{") { return true }
// The start of an Oniguruma 'of-contents' callout.
if src.tryEat("{") { return true }

// A matching option atom (?x), (?i), ...
if src.canLexMatchingOptionsAsAtom(context: context) { return true }
// A matching option atom (?x), (?i), ...
if src.canLexMatchingOptionsAsAtom(context: context) { return true }

return false
}
// The start of a backreference directive or Oniguruma named callout.
if src.tryEat("*") { return true }

return false
}
// The start of a backreference directive or Oniguruma named callout.
if src.tryEat("*") { return true }

return false
}

/// Consume an escaped atom, starting from after the backslash
Expand Down Expand Up @@ -2022,20 +2039,11 @@ extension Source {
return AST.Atom(kind.value, kind.location)
}

/// Try to lex the end of a range in a custom character class, which consists
/// of a '-' character followed by an atom.
mutating func lexCustomCharClassRangeEnd(
context: ParsingContext
) throws -> (dashLoc: SourceLocation, AST.Atom)? {
// Make sure we don't have a binary operator e.g '--', and the '-' is not
// ending the custom character class (in which case it is literal).
guard peekCCBinOp() == nil, !starts(with: "-]"),
let dash = tryEatWithLoc("-"),
let end = try lexAtom(context: context)
else {
return nil
}
return (dash, end)
/// Attempt to consume the range operator '-' of a custom character class,
/// returning its source location on success and `nil` otherwise.
mutating func lexCustomCharacterClassRangeOperator() -> SourceLocation? {
  // A binary operator such as '--' must not be mistaken for a range '-'.
  if peekCCBinOp() != nil {
    return nil
  }
  return tryEatWithLoc("-")
}

/// Try to consume a newline sequence matching option kind.
Expand Down
Loading