Skip to content

Commit 72adeba

Browse files
committed
Allow scoped removal of extended syntax in multi-line literals
Relax the ban on unsetting extended syntax in a multi-line literal so that it does not apply to a scoped unset, e.g. `(?-x:...)`, as long as the group does not span multiple lines. This commit also bans the use of `(?^)` in a multi-line literal unless it is scoped and does not span multiple lines. Instead, `(?^x)` must be written, as PCRE defines `(?^)` to be equivalent to `(?-imnsx)`.
1 parent 7edce9a commit 72adeba

File tree

4 files changed

+116
-27
lines changed

4 files changed

+116
-27
lines changed

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ enum ParseError: Error, Hashable {
4545
case confusableCharacter(Character)
4646

4747
case quoteMayNotSpanMultipleLines
48+
case unsetExtendedSyntaxMayNotSpanMultipleLines
4849

4950
case cannotReferToWholePattern
5051

@@ -81,6 +82,7 @@ enum ParseError: Error, Hashable {
8182
case cannotRemoveTextSegmentOptions
8283
case cannotRemoveSemanticsOptions
8384
case cannotRemoveExtendedSyntaxInMultilineMode
85+
case cannotResetExtendedSyntaxInMultilineMode
8486

8587
case expectedCalloutArgument
8688

@@ -143,6 +145,8 @@ extension ParseError: CustomStringConvertible {
143145
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
144146
case .quoteMayNotSpanMultipleLines:
145147
return "quoted sequence may not span multiple lines in multi-line literal"
148+
case .unsetExtendedSyntaxMayNotSpanMultipleLines:
149+
return "group that unsets extended syntax may not span multiple lines in multi-line literal"
146150
case .cannotReferToWholePattern:
147151
return "cannot refer to whole pattern here"
148152
case .quantifierRequiresOperand(let q):
@@ -194,6 +198,8 @@ extension ParseError: CustomStringConvertible {
194198
return "semantic level cannot be unset, only changed"
195199
case .cannotRemoveExtendedSyntaxInMultilineMode:
196200
return "extended syntax may not be disabled in multi-line mode"
201+
case .cannotResetExtendedSyntaxInMultilineMode:
202+
return "extended syntax may not be disabled in multi-line mode; use '(?^x)' instead"
197203
case .expectedCalloutArgument:
198204
return "expected argument to callout"
199205
case .unrecognizedScript(let value):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -840,11 +840,6 @@ extension Source {
840840
if opt.isSemanticMatchingLevel {
841841
throw ParseError.cannotRemoveSemanticsOptions
842842
}
843-
// Extended syntax may not be removed if in multi-line mode.
844-
if context.syntax.contains(.multilineCompilerLiteral) &&
845-
opt.isAnyExtended {
846-
throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode
847-
}
848843
removing.append(opt)
849844
}
850845
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -289,8 +289,8 @@ extension Parser {
289289
/// Apply the syntax options of a given matching option sequence to the
290290
/// current set of options.
291291
private mutating func applySyntaxOptions(
292-
of opts: AST.MatchingOptionSequence
293-
) {
292+
of opts: AST.MatchingOptionSequence, isScoped: Bool
293+
) throws {
294294
func mapOption(_ option: SyntaxOptions,
295295
_ pred: (AST.MatchingOption) -> Bool) {
296296
if opts.resetsCurrentOptions {
@@ -311,22 +311,41 @@ extension Parser {
311311
mapOption(.namedCapturesOnly, .namedCapturesOnly)
312312

313313
// (?x), (?xx)
314-
// We skip this for multi-line, as extended syntax is always enabled there.
314+
// This cannot be unset in a multi-line literal, unless in a scoped group
315+
// e.g. (?-x:...). We later enforce that such a group does not span multiple
316+
// lines.
315317
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
316318
// handles non-semantic whitespace in a custom character class. Other
317319
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
318320
// treat (?x) and (?xx) as the same option here. If we ever get a strict
319321
// PCRE mode, we will need to change this to handle that.
320-
if !context.syntax.contains(.multilineCompilerLiteral) {
322+
if !isScoped && context.syntax.contains(.multilineCompilerLiteral) {
323+
// An unscoped removal of extended syntax is not allowed in a multi-line
324+
// literal.
325+
if let opt = opts.removing.first(where: \.isAnyExtended) {
326+
throw Source.LocatedError(
327+
ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location)
328+
}
329+
if opts.resetsCurrentOptions {
330+
throw Source.LocatedError(
331+
ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!)
332+
}
333+
// The only remaining case is an unscoped addition of extended syntax,
334+
// which is a no-op.
335+
} else {
336+
// We either have a scoped change of extended syntax, or this is a
337+
// single-line literal.
321338
mapOption(.extendedSyntax, \.isAnyExtended)
322339
}
323340
}
324341

325342
/// Apply the syntax options of a matching option changing group to the
326343
/// current set of options.
327-
private mutating func applySyntaxOptions(of group: AST.Group.Kind) {
344+
private mutating func applySyntaxOptions(
345+
of group: AST.Group.Kind, isScoped: Bool
346+
) throws {
328347
if case .changeMatchingOptions(let seq) = group {
329-
applySyntaxOptions(of: seq)
348+
try applySyntaxOptions(of: seq, isScoped: isScoped)
330349
}
331350
}
332351

@@ -337,14 +356,25 @@ extension Parser {
337356
context.recordGroup(kind.value)
338357

339358
let currentSyntax = context.syntax
340-
applySyntaxOptions(of: kind.value)
359+
try applySyntaxOptions(of: kind.value, isScoped: true)
341360
defer {
342361
context.syntax = currentSyntax
343362
}
344-
363+
let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) &&
364+
!context.syntax.contains(.extendedSyntax)
345365
let child = try parseNode()
346366
try source.expect(")")
347-
return .init(kind, child, loc(start))
367+
let groupLoc = loc(start)
368+
369+
// In multi-line literals, the body of a group that unsets extended syntax
370+
// may not span multiple lines.
371+
if unsetsExtendedSyntax &&
372+
context.syntax.contains(.multilineCompilerLiteral) &&
373+
source[child.location.range].spansMultipleLinesInRegexLiteral {
374+
throw Source.LocatedError(
375+
ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc)
376+
}
377+
return .init(kind, child, groupLoc)
348378
}
349379

350380
/// Consume the body of an absent function.
@@ -438,7 +468,7 @@ extension Parser {
438468
// If we have a change matching options atom, apply the syntax options. We
439469
// already take care of scoping syntax options within a group.
440470
if case .changeMatchingOptions(let opts) = atom.kind {
441-
applySyntaxOptions(of: opts)
471+
try applySyntaxOptions(of: opts, isScoped: false)
442472
}
443473
// TODO: track source locations
444474
return .atom(atom)
@@ -592,7 +622,7 @@ public func parse<S: StringProtocol>(
592622
return ast
593623
}
594624

595-
extension String {
625+
extension StringProtocol {
596626
/// Whether the given string is considered multi-line for a regex literal.
597627
var spansMultipleLinesInRegexLiteral: Bool {
598628
unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" })

Tests/RegexTests/ParseTests.swift

Lines changed: 69 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1780,6 +1780,13 @@ extension RegexTests {
17801780
" ", "b"
17811781
)
17821782
)
1783+
parseTest(
1784+
"(?x) a (?^: b)", concat(
1785+
changeMatchingOptions(matchingOptions(adding: .extended)),
1786+
"a",
1787+
changeMatchingOptions(unsetMatchingOptions(), concat(" ", "b"))
1788+
)
1789+
)
17831790

17841791
parseTest("[ # abc]", charClass(" ", "#", " ", "a", "b", "c"))
17851792
parseTest("[#]", charClass("#"))
@@ -2188,22 +2195,40 @@ extension RegexTests {
21882195
/#
21892196
""", concat("a", "b"))
21902197

2191-
// Make sure (?^) is ignored.
2198+
// (?x) has no effect.
21922199
parseWithDelimitersTest("""
21932200
#/
2194-
(?^)
2201+
(?x)
21952202
# comment
21962203
/#
2197-
""", changeMatchingOptions(unsetMatchingOptions())
2204+
""", changeMatchingOptions(matchingOptions(adding: .extended))
21982205
)
21992206

2200-
// (?x) has no effect.
2207+
// Scoped removal of extended syntax is allowed as long as it does not span
2208+
// multiple lines.
22012209
parseWithDelimitersTest("""
22022210
#/
2203-
(?x)
2204-
# comment
2211+
(?-x:a b)
22052212
/#
2206-
""", changeMatchingOptions(matchingOptions(adding: .extended))
2213+
""", changeMatchingOptions(
2214+
matchingOptions(removing: .extended),
2215+
concat("a", " ", "b")
2216+
)
2217+
)
2218+
parseWithDelimitersTest("""
2219+
#/
2220+
(?-xx:a b)
2221+
/#
2222+
""", changeMatchingOptions(
2223+
matchingOptions(removing: .extraExtended),
2224+
concat("a", " ", "b")
2225+
)
2226+
)
2227+
parseWithDelimitersTest("""
2228+
#/
2229+
(?^: a b ) # comment
2230+
/#
2231+
""", changeMatchingOptions(unsetMatchingOptions(), concat(" ", "a", " ", "b", " "))
22072232
)
22082233

22092234
parseWithDelimitersTest(#"""
@@ -2787,17 +2812,50 @@ extension RegexTests {
27872812
/#
27882813
""", .cannotRemoveExtendedSyntaxInMultilineMode
27892814
)
2815+
2816+
// Scoped removal of extended syntax may not span multiple lines
27902817
diagnosticWithDelimitersTest("""
27912818
#/
2792-
(?-x:a b)
2819+
(?-x:a b
2820+
)
27932821
/#
2794-
""", .cannotRemoveExtendedSyntaxInMultilineMode
2822+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
27952823
)
27962824
diagnosticWithDelimitersTest("""
27972825
#/
2798-
(?-xx:a b)
2826+
(?-x:a
2827+
b)
27992828
/#
2800-
""", .cannotRemoveExtendedSyntaxInMultilineMode
2829+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
2830+
)
2831+
diagnosticWithDelimitersTest("""
2832+
#/
2833+
(?-xx:
2834+
a b)
2835+
/#
2836+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
2837+
)
2838+
diagnosticWithDelimitersTest("""
2839+
#/
2840+
(?x-x:
2841+
a b)
2842+
/#
2843+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
2844+
)
2845+
diagnosticWithDelimitersTest("""
2846+
#/
2847+
(?^)
2848+
# comment
2849+
/#
2850+
""", .cannotResetExtendedSyntaxInMultilineMode
2851+
)
2852+
diagnosticWithDelimitersTest("""
2853+
#/
2854+
(?^:
2855+
# comment
2856+
)
2857+
/#
2858+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
28012859
)
28022860

28032861
diagnosticWithDelimitersTest(#"""

0 commit comments

Comments
 (0)