Skip to content

Allow scoped removal of extended syntax in multi-line literals #484

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ enum ParseError: Error, Hashable {
case confusableCharacter(Character)

case quoteMayNotSpanMultipleLines
case unsetExtendedSyntaxMayNotSpanMultipleLines

case cannotReferToWholePattern

Expand Down Expand Up @@ -81,6 +82,7 @@ enum ParseError: Error, Hashable {
case cannotRemoveTextSegmentOptions
case cannotRemoveSemanticsOptions
case cannotRemoveExtendedSyntaxInMultilineMode
case cannotResetExtendedSyntaxInMultilineMode

case expectedCalloutArgument

Expand Down Expand Up @@ -143,6 +145,8 @@ extension ParseError: CustomStringConvertible {
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
case .quoteMayNotSpanMultipleLines:
return "quoted sequence may not span multiple lines in multi-line literal"
case .unsetExtendedSyntaxMayNotSpanMultipleLines:
return "group that unsets extended syntax may not span multiple lines in multi-line literal"
case .cannotReferToWholePattern:
return "cannot refer to whole pattern here"
case .quantifierRequiresOperand(let q):
Expand Down Expand Up @@ -194,6 +198,8 @@ extension ParseError: CustomStringConvertible {
return "semantic level cannot be unset, only changed"
case .cannotRemoveExtendedSyntaxInMultilineMode:
return "extended syntax may not be disabled in multi-line mode"
case .cannotResetExtendedSyntaxInMultilineMode:
return "extended syntax may not be disabled in multi-line mode; use '(?^x)' instead"
case .expectedCalloutArgument:
return "expected argument to callout"
case .unrecognizedScript(let value):
Expand Down
7 changes: 1 addition & 6 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,7 @@ extension Source {
}.value

// In multi-line literals, the quote may not span multiple lines.
if context.syntax.contains(.multilineExtendedSyntax),
if context.syntax.contains(.multilineCompilerLiteral),
contents.spansMultipleLinesInRegexLiteral {
throw ParseError.quoteMayNotSpanMultipleLines
}
Expand Down Expand Up @@ -840,11 +840,6 @@ extension Source {
if opt.isSemanticMatchingLevel {
throw ParseError.cannotRemoveSemanticsOptions
}
// Extended syntax may not be removed if in multi-line mode.
if context.syntax.contains(.multilineExtendedSyntax) &&
opt.isAnyExtended {
throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode
}
removing.append(opt)
}
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,
Expand Down
54 changes: 42 additions & 12 deletions Sources/_RegexParser/Regex/Parse/Parse.swift
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,8 @@ extension Parser {
/// Apply the syntax options of a given matching option sequence to the
/// current set of options.
private mutating func applySyntaxOptions(
of opts: AST.MatchingOptionSequence
) {
of opts: AST.MatchingOptionSequence, isScoped: Bool
) throws {
func mapOption(_ option: SyntaxOptions,
_ pred: (AST.MatchingOption) -> Bool) {
if opts.resetsCurrentOptions {
Expand All @@ -311,22 +311,41 @@ extension Parser {
mapOption(.namedCapturesOnly, .namedCapturesOnly)

// (?x), (?xx)
// We skip this for multi-line, as extended syntax is always enabled there.
// This cannot be unset in a multi-line literal, unless in a scoped group
// e.g. (?-x:...). We later enforce that such a group does not span multiple
// lines.
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
// handles non-semantic whitespace in a custom character class. Other
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
// treat (?x) and (?xx) as the same option here. If we ever get a strict
// PCRE mode, we will need to change this to handle that.
if !context.syntax.contains(.multilineExtendedSyntax) {
if !isScoped && context.syntax.contains(.multilineCompilerLiteral) {
// An unscoped removal of extended syntax is not allowed in a multi-line
// literal.
if let opt = opts.removing.first(where: \.isAnyExtended) {
throw Source.LocatedError(
ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location)
}
if opts.resetsCurrentOptions {
throw Source.LocatedError(
ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!)
}
// The only remaining case is an unscoped addition of extended syntax,
// which is a no-op.
} else {
// We either have a scoped change of extended syntax, or this is a
// single-line literal.
mapOption(.extendedSyntax, \.isAnyExtended)
}
}

/// Apply the syntax options of a matching option changing group to the
/// current set of options.
private mutating func applySyntaxOptions(of group: AST.Group.Kind) {
private mutating func applySyntaxOptions(
of group: AST.Group.Kind, isScoped: Bool
) throws {
if case .changeMatchingOptions(let seq) = group {
applySyntaxOptions(of: seq)
try applySyntaxOptions(of: seq, isScoped: isScoped)
}
}

Expand All @@ -337,14 +356,25 @@ extension Parser {
context.recordGroup(kind.value)

let currentSyntax = context.syntax
applySyntaxOptions(of: kind.value)
try applySyntaxOptions(of: kind.value, isScoped: true)
defer {
context.syntax = currentSyntax
}

let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) &&
!context.syntax.contains(.extendedSyntax)
let child = try parseNode()
try source.expect(")")
return .init(kind, child, loc(start))
let groupLoc = loc(start)

// In multi-line literals, the body of a group that unsets extended syntax
// may not span multiple lines.
if unsetsExtendedSyntax &&
context.syntax.contains(.multilineCompilerLiteral) &&
source[child.location.range].spansMultipleLinesInRegexLiteral {
throw Source.LocatedError(
ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc)
}
return .init(kind, child, groupLoc)
}

/// Consume the body of an absent function.
Expand Down Expand Up @@ -438,7 +468,7 @@ extension Parser {
// If we have a change matching options atom, apply the syntax options. We
// already take care of scoping syntax options within a group.
if case .changeMatchingOptions(let opts) = atom.kind {
applySyntaxOptions(of: opts)
try applySyntaxOptions(of: opts, isScoped: false)
}
// TODO: track source locations
return .atom(atom)
Expand Down Expand Up @@ -592,7 +622,7 @@ public func parse<S: StringProtocol>(
return ast
}

extension String {
extension StringProtocol {
/// Whether the given string is considered multi-line for a regex literal.
var spansMultipleLinesInRegexLiteral: Bool {
unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" })
Expand All @@ -609,7 +639,7 @@ fileprivate func defaultSyntaxOptions(
// For an extended syntax forward slash, e.g. #/.../#, extended syntax is
// permitted if it spans multiple lines.
if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral {
return .multilineExtendedSyntax
return [.multilineCompilerLiteral, .extendedSyntax]
}
return .traditional
case .reSingleQuote:
Expand Down
12 changes: 6 additions & 6 deletions Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ public struct SyntaxOptions: OptionSet {
/// `(_: .*)` == `(?:.*)`
public static var experimentalCaptures: Self { Self(1 << 5) }

/// The default syntax for a multi-line regex literal.
public static var multilineExtendedSyntax: Self {
return [Self(1 << 6), .extendedSyntax]
}
/// The syntax kind of a multi-line literal. This will always be set when
/// parsing a multi-line `#/.../#` literal. Note this does not imply extended
/// syntax, as that may be temporarily disabled while parsing.
public static var multilineCompilerLiteral: Self { Self(1 << 6) }

/// `(?n)`
public static var namedCapturesOnly: Self { Self(1 << 7) }
Expand All @@ -76,8 +76,8 @@ public struct SyntaxOptions: OptionSet {
public static var traditional: Self { Self(0) }

public static var experimental: Self {
// Experimental syntax enables everything except end-of-line comments.
Self(~0).subtracting(.endOfLineComments)
[.nonSemanticWhitespace, .experimentalQuotes, .experimentalComments,
.experimentalRanges, .experimentalCaptures]
}

// TODO: Probably want to model strict-PCRE etc. options too.
Expand Down
94 changes: 83 additions & 11 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,9 @@ extension RegexTests {
syntax: .experimental)
parseTest(#""\"""#, quote("\""), syntax: .experimental)

parseTest(#"(abc)"#, capture(concat("a", "b", "c")),
syntax: .experimental, captures: [.cap])

// Quotes in character classes.
parseTest(#"[\Q-\E]"#, charClass(quote_m("-")))
parseTest(#"[\Qa-b[[*+\\E]"#, charClass(quote_m("a-b[[*+\\")))
Expand Down Expand Up @@ -1777,6 +1780,13 @@ extension RegexTests {
" ", "b"
)
)
parseTest(
"(?x) a (?^: b)", concat(
changeMatchingOptions(matchingOptions(adding: .extended)),
"a",
changeMatchingOptions(unsetMatchingOptions(), concat(" ", "b"))
)
)

parseTest("[ # abc]", charClass(" ", "#", " ", "a", "b", "c"))
parseTest("[#]", charClass("#"))
Expand Down Expand Up @@ -2099,6 +2109,17 @@ extension RegexTests {
throwsError: .unsupported, syntax: .extendedSyntax
)

parseWithDelimitersTest(
#"""
#/
a\
b\
c
/#
"""#,
concat("a", "\n", "b", "\n", "c")
)

// MARK: Parse with delimiters

parseWithDelimitersTest("/a b/", concat("a", " ", "b"))
Expand Down Expand Up @@ -2174,22 +2195,40 @@ extension RegexTests {
/#
""", concat("a", "b"))

// Make sure (?^) is ignored.
// (?x) has no effect.
parseWithDelimitersTest("""
#/
(?^)
(?x)
# comment
/#
""", changeMatchingOptions(unsetMatchingOptions())
""", changeMatchingOptions(matchingOptions(adding: .extended))
)

// (?x) has no effect.
// Scoped removal of extended syntax is allowed as long as it does not span
// multiple lines.
parseWithDelimitersTest("""
#/
(?x)
# comment
(?-x:a b)
/#
""", changeMatchingOptions(matchingOptions(adding: .extended))
""", changeMatchingOptions(
matchingOptions(removing: .extended),
concat("a", " ", "b")
)
)
parseWithDelimitersTest("""
#/
(?-xx:a b)
/#
""", changeMatchingOptions(
matchingOptions(removing: .extraExtended),
concat("a", " ", "b")
)
)
parseWithDelimitersTest("""
#/
(?^: a b ) # comment
/#
""", changeMatchingOptions(unsetMatchingOptions(), concat(" ", "a", " ", "b", " "))
)

parseWithDelimitersTest(#"""
Expand Down Expand Up @@ -2773,17 +2812,50 @@ extension RegexTests {
/#
""", .cannotRemoveExtendedSyntaxInMultilineMode
)

// Scoped removal of extended syntax may not span multiple lines
diagnosticWithDelimitersTest("""
#/
(?-x:a b)
(?-x:a b
)
/#
""", .cannotRemoveExtendedSyntaxInMultilineMode
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
)
diagnosticWithDelimitersTest("""
#/
(?-xx:a b)
(?-x:a
b)
/#
""", .cannotRemoveExtendedSyntaxInMultilineMode
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
)
diagnosticWithDelimitersTest("""
#/
(?-xx:
a b)
/#
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
)
diagnosticWithDelimitersTest("""
#/
(?x-x:
a b)
/#
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
)
diagnosticWithDelimitersTest("""
#/
(?^)
# comment
/#
""", .cannotResetExtendedSyntaxInMultilineMode
)
diagnosticWithDelimitersTest("""
#/
(?^:
# comment
)
/#
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
)

diagnosticWithDelimitersTest(#"""
Expand Down