Skip to content

[5.7] Cherry-pick some syntax tweaks #493

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ enum ParseError: Error, Hashable {
case invalidEscape(Character)
case confusableCharacter(Character)

case quoteMayNotSpanMultipleLines
case unsetExtendedSyntaxMayNotSpanMultipleLines

case cannotReferToWholePattern

case quantifierRequiresOperand(String)
Expand Down Expand Up @@ -79,6 +82,7 @@ enum ParseError: Error, Hashable {
case cannotRemoveTextSegmentOptions
case cannotRemoveSemanticsOptions
case cannotRemoveExtendedSyntaxInMultilineMode
case cannotResetExtendedSyntaxInMultilineMode

case expectedCalloutArgument

Expand Down Expand Up @@ -139,6 +143,10 @@ extension ParseError: CustomStringConvertible {
return "invalid escape sequence '\\\(c)'"
case .confusableCharacter(let c):
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
case .quoteMayNotSpanMultipleLines:
return "quoted sequence may not span multiple lines in multi-line literal"
case .unsetExtendedSyntaxMayNotSpanMultipleLines:
return "group that unsets extended syntax may not span multiple lines in multi-line literal"
case .cannotReferToWholePattern:
return "cannot refer to whole pattern here"
case .quantifierRequiresOperand(let q):
Expand Down Expand Up @@ -190,6 +198,8 @@ extension ParseError: CustomStringConvertible {
return "semantic level cannot be unset, only changed"
case .cannotRemoveExtendedSyntaxInMultilineMode:
return "extended syntax may not be disabled in multi-line mode"
case .cannotResetExtendedSyntaxInMultilineMode:
return "extended syntax may not be disabled in multi-line mode; use '(?^x)' instead"
case .expectedCalloutArgument:
return "expected argument to callout"
case .unrecognizedScript(let value):
Expand Down
142 changes: 77 additions & 65 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -342,8 +342,8 @@ extension Source {
}.value
}

/// Eat a scalar off the front, starting from after the
/// backslash and base character (e.g. `\u` or `\x`).
/// Try to eat a scalar off the front, starting from after the backslash and
/// base character (e.g. `\u` or `\x`).
///
/// UniScalar -> 'u{' UniScalarSequence '}'
/// | 'u' HexDigit{4}
Expand All @@ -353,60 +353,60 @@ extension Source {
/// | 'o{' OctalDigit{1...} '}'
/// | '0' OctalDigit{0...3}
///
mutating func expectUnicodeScalar(
escapedCharacter base: Character
) throws -> AST.Atom.Kind {
mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? {
try recordLoc { src in
try src.tryEating { src in

func nullScalar() -> AST.Atom.Kind {
let pos = src.currentPosition
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
}

// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
switch base {
// Hex numbers.
case "u" where src.tryEat("{"):
return try src.expectUnicodeScalarSequence(eating: "}")

case "x" where src.tryEat("{"):
let str = try src.lexUntil(eating: "}")
return .scalar(try Source.validateUnicodeScalar(str, .hex))

case "x":
// \x expects *up to* 2 digits.
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
else {
// In PCRE, \x without any valid hex digits is \u{0}.
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
// could be changed to throw an error if we had a parsing mode for
// them.
return nullScalar()
func nullScalar() -> AST.Atom.Kind {
let pos = src.currentPosition
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
}
return .scalar(try Source.validateUnicodeScalar(digits, .hex))

case "u":
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
case "U":
return .scalar(try src.expectUnicodeScalar(numDigits: 8))
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
switch src.tryEat() {
// Hex numbers.
case "u" where src.tryEat("{"):
return try src.expectUnicodeScalarSequence(eating: "}")

case "x" where src.tryEat("{"):
let str = try src.lexUntil(eating: "}")
return .scalar(try Source.validateUnicodeScalar(str, .hex))

case "x":
// \x expects *up to* 2 digits.
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
else {
// In PCRE, \x without any valid hex digits is \u{0}.
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
// could be changed to throw an error if we had a parsing mode for
// them.
return nullScalar()
}
return .scalar(try Source.validateUnicodeScalar(digits, .hex))

case "u":
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
case "U":
return .scalar(try src.expectUnicodeScalar(numDigits: 8))

// Octal numbers.
case "o" where src.tryEat("{"):
let str = try src.lexUntil(eating: "}")
return .scalar(try Source.validateUnicodeScalar(str, .octal))

case "0":
// We can read *up to* 3 more octal digits.
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
// PCRE mode, we should limit it here.
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
else {
return nullScalar()
}
return .scalar(try Source.validateUnicodeScalar(digits, .octal))

// Octal numbers.
case "o" where src.tryEat("{"):
let str = try src.lexUntil(eating: "}")
return .scalar(try Source.validateUnicodeScalar(str, .octal))

case "0":
// We can read *up to* 3 more octal digits.
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
// PCRE mode, we should limit it here.
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
else {
return nullScalar()
default:
return nil
}
return .scalar(try Source.validateUnicodeScalar(digits, .octal))

default:
fatalError("Unexpected scalar start")
}
}.value
}
Expand Down Expand Up @@ -579,7 +579,7 @@ extension Source {

/// Try to consume quoted content
///
/// Quote -> '\Q' (!'\E' .)* '\E'
/// Quote -> '\Q' (!'\E' .)* '\E'?
///
/// With `SyntaxOptions.experimentalQuotes`, also accepts
///
Expand All @@ -592,9 +592,24 @@ extension Source {
mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? {
let str = try recordLoc { src -> String? in
if src.tryEat(sequence: #"\Q"#) {
return try src.expectQuoted(endingWith: #"\E"#).value
let contents = src.lexUntil { src in
src.isEmpty || src.tryEat(sequence: #"\E"#)
}.value

// In multi-line literals, the quote may not span multiple lines.
if context.syntax.contains(.multilineCompilerLiteral),
contents.spansMultipleLinesInRegexLiteral {
throw ParseError.quoteMayNotSpanMultipleLines
}

// The sequence must not be empty in a custom character class.
if context.isInCustomCharacterClass && contents.isEmpty {
throw ParseError.expectedNonEmptyContents
}
return contents
}
if context.experimentalQuotes, src.tryEat("\"") {
// TODO: Can experimental quotes be empty?
return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value
}
return nil
Expand Down Expand Up @@ -787,6 +802,11 @@ extension Source {
mutating func lexMatchingOptionSequence(
context: ParsingContext
) throws -> AST.MatchingOptionSequence? {
// PCRE accepts '(?)'
// TODO: This is a no-op, should we warn?
if peek() == ")" {
return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: [])
}
let ateCaret = recordLoc { $0.tryEat("^") }

// TODO: Warn on duplicate options, and options appearing in both adding
Expand Down Expand Up @@ -820,11 +840,6 @@ extension Source {
if opt.isSemanticMatchingLevel {
throw ParseError.cannotRemoveSemanticsOptions
}
// Extended syntax may not be removed if in multi-line mode.
if context.syntax.contains(.multilineExtendedSyntax) &&
opt.isAnyExtended {
throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode
}
removing.append(opt)
}
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,
Expand Down Expand Up @@ -1692,6 +1707,11 @@ extension Source {
return ref
}

// Hexadecimal and octal unicode scalars.
if let scalar = try src.lexUnicodeScalar() {
return scalar
}

guard let char = src.tryEat() else {
throw ParseError.expectedEscape
}
Expand All @@ -1703,14 +1723,6 @@ extension Source {
return .escaped(builtin)
}

switch char {
// Hexadecimal and octal unicode scalars.
case "u", "x", "U", "o", "0":
return try src.expectUnicodeScalar(escapedCharacter: char)
default:
break
}

// We only allow unknown escape sequences for non-letter non-number ASCII,
// and non-ASCII whitespace.
// TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.
Expand Down
62 changes: 49 additions & 13 deletions Sources/_RegexParser/Regex/Parse/Parse.swift
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,8 @@ extension Parser {
/// Apply the syntax options of a given matching option sequence to the
/// current set of options.
private mutating func applySyntaxOptions(
of opts: AST.MatchingOptionSequence
) {
of opts: AST.MatchingOptionSequence, isScoped: Bool
) throws {
func mapOption(_ option: SyntaxOptions,
_ pred: (AST.MatchingOption) -> Bool) {
if opts.resetsCurrentOptions {
Expand All @@ -311,22 +311,41 @@ extension Parser {
mapOption(.namedCapturesOnly, .namedCapturesOnly)

// (?x), (?xx)
// We skip this for multi-line, as extended syntax is always enabled there.
// This cannot be unset in a multi-line literal, unless in a scoped group
// e.g. (?-x:...). We later enforce that such a group does not span multiple
// lines.
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
// handles non-semantic whitespace in a custom character class. Other
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
// treat (?x) and (?xx) as the same option here. If we ever get a strict
// PCRE mode, we will need to change this to handle that.
if !context.syntax.contains(.multilineExtendedSyntax) {
if !isScoped && context.syntax.contains(.multilineCompilerLiteral) {
// An unscoped removal of extended syntax is not allowed in a multi-line
// literal.
if let opt = opts.removing.first(where: \.isAnyExtended) {
throw Source.LocatedError(
ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location)
}
if opts.resetsCurrentOptions {
throw Source.LocatedError(
ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!)
}
// The only remaining case is an unscoped addition of extended syntax,
// which is a no-op.
} else {
// We either have a scoped change of extended syntax, or this is a
// single-line literal.
mapOption(.extendedSyntax, \.isAnyExtended)
}
}

/// Apply the syntax options of a matching option changing group to the
/// current set of options.
private mutating func applySyntaxOptions(of group: AST.Group.Kind) {
private mutating func applySyntaxOptions(
of group: AST.Group.Kind, isScoped: Bool
) throws {
if case .changeMatchingOptions(let seq) = group {
applySyntaxOptions(of: seq)
try applySyntaxOptions(of: seq, isScoped: isScoped)
}
}

Expand All @@ -337,14 +356,25 @@ extension Parser {
context.recordGroup(kind.value)

let currentSyntax = context.syntax
applySyntaxOptions(of: kind.value)
try applySyntaxOptions(of: kind.value, isScoped: true)
defer {
context.syntax = currentSyntax
}

let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) &&
!context.syntax.contains(.extendedSyntax)
let child = try parseNode()
try source.expect(")")
return .init(kind, child, loc(start))
let groupLoc = loc(start)

// In multi-line literals, the body of a group that unsets extended syntax
// may not span multiple lines.
if unsetsExtendedSyntax &&
context.syntax.contains(.multilineCompilerLiteral) &&
source[child.location.range].spansMultipleLinesInRegexLiteral {
throw Source.LocatedError(
ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc)
}
return .init(kind, child, groupLoc)
}

/// Consume the body of an absent function.
Expand Down Expand Up @@ -438,7 +468,7 @@ extension Parser {
// If we have a change matching options atom, apply the syntax options. We
// already take care of scoping syntax options within a group.
if case .changeMatchingOptions(let opts) = atom.kind {
applySyntaxOptions(of: opts)
try applySyntaxOptions(of: opts, isScoped: false)
}
// TODO: track source locations
return .atom(atom)
Expand Down Expand Up @@ -592,6 +622,13 @@ public func parse<S: StringProtocol>(
return ast
}

extension StringProtocol {
  /// Reports whether this string counts as multi-line for the purposes of a
  /// regex literal, i.e. whether any of its Unicode scalars is a line feed
  /// (`\n`) or a carriage return (`\r`).
  var spansMultipleLinesInRegexLiteral: Bool {
    // Inspect scalars rather than characters so that a "\r\n" pair, which
    // forms a single `Character`, is still seen as its individual scalars.
    for scalar in unicodeScalars where scalar == "\n" || scalar == "\r" {
      return true
    }
    return false
  }
}

/// Retrieve the default set of syntax options that a delimiter and literal
/// contents indicates.
fileprivate func defaultSyntaxOptions(
Expand All @@ -601,9 +638,8 @@ fileprivate func defaultSyntaxOptions(
case .forwardSlash:
// For a forward slash literal with extended delimiters, e.g. #/.../#, the
// literal is multi-line with extended syntax if it spans multiple lines.
if delim.poundCount > 0 &&
contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) {
return .multilineExtendedSyntax
if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral {
return [.multilineCompilerLiteral, .extendedSyntax]
}
return .traditional
case .reSingleQuote:
Expand Down
12 changes: 6 additions & 6 deletions Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ public struct SyntaxOptions: OptionSet {
/// `(_: .*)` == `(?:.*)`
public static var experimentalCaptures: Self { Self(1 << 5) }

/// The default syntax for a multi-line regex literal.
public static var multilineExtendedSyntax: Self {
return [Self(1 << 6), .extendedSyntax]
}
/// The syntax kind of a multi-line literal. This will always be set when
/// parsing a multi-line `#/.../#` literal. Note this does not imply extended
/// syntax, as that may be temporarily disabled while parsing.
public static var multilineCompilerLiteral: Self { Self(1 << 6) }

/// `(?n)`
public static var namedCapturesOnly: Self { Self(1 << 7) }
Expand All @@ -76,8 +76,8 @@ public struct SyntaxOptions: OptionSet {
public static var traditional: Self { Self(0) }

public static var experimental: Self {
// Experimental syntax enables everything except end-of-line comments.
Self(~0).subtracting(.endOfLineComments)
[.nonSemanticWhitespace, .experimentalQuotes, .experimentalComments,
.experimentalRanges, .experimentalCaptures]
}

// TODO: Probably want to model strict-PCRE etc. options too.
Expand Down
Loading