Skip to content

Commit 3076eba

Browse files
authored
Merge pull request #484 from hamishknight/limited-run-syntax
2 parents ed5aedb + 2aa035e commit 3076eba

File tree

5 files changed

+138
-35
lines changed

5 files changed

+138
-35
lines changed

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ enum ParseError: Error, Hashable {
4545
case confusableCharacter(Character)
4646

4747
case quoteMayNotSpanMultipleLines
48+
case unsetExtendedSyntaxMayNotSpanMultipleLines
4849

4950
case cannotReferToWholePattern
5051

@@ -81,6 +82,7 @@ enum ParseError: Error, Hashable {
8182
case cannotRemoveTextSegmentOptions
8283
case cannotRemoveSemanticsOptions
8384
case cannotRemoveExtendedSyntaxInMultilineMode
85+
case cannotResetExtendedSyntaxInMultilineMode
8486

8587
case expectedCalloutArgument
8688

@@ -143,6 +145,8 @@ extension ParseError: CustomStringConvertible {
143145
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
144146
case .quoteMayNotSpanMultipleLines:
145147
return "quoted sequence may not span multiple lines in multi-line literal"
148+
case .unsetExtendedSyntaxMayNotSpanMultipleLines:
149+
return "group that unsets extended syntax may not span multiple lines in multi-line literal"
146150
case .cannotReferToWholePattern:
147151
return "cannot refer to whole pattern here"
148152
case .quantifierRequiresOperand(let q):
@@ -194,6 +198,8 @@ extension ParseError: CustomStringConvertible {
194198
return "semantic level cannot be unset, only changed"
195199
case .cannotRemoveExtendedSyntaxInMultilineMode:
196200
return "extended syntax may not be disabled in multi-line mode"
201+
case .cannotResetExtendedSyntaxInMultilineMode:
202+
return "extended syntax may not be disabled in multi-line mode; use '(?^x)' instead"
197203
case .expectedCalloutArgument:
198204
return "expected argument to callout"
199205
case .unrecognizedScript(let value):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -597,7 +597,7 @@ extension Source {
597597
}.value
598598

599599
// In multi-line literals, the quote may not span multiple lines.
600-
if context.syntax.contains(.multilineExtendedSyntax),
600+
if context.syntax.contains(.multilineCompilerLiteral),
601601
contents.spansMultipleLinesInRegexLiteral {
602602
throw ParseError.quoteMayNotSpanMultipleLines
603603
}
@@ -840,11 +840,6 @@ extension Source {
840840
if opt.isSemanticMatchingLevel {
841841
throw ParseError.cannotRemoveSemanticsOptions
842842
}
843-
// Extended syntax may not be removed if in multi-line mode.
844-
if context.syntax.contains(.multilineExtendedSyntax) &&
845-
opt.isAnyExtended {
846-
throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode
847-
}
848843
removing.append(opt)
849844
}
850845
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -289,8 +289,8 @@ extension Parser {
289289
/// Apply the syntax options of a given matching option sequence to the
290290
/// current set of options.
291291
private mutating func applySyntaxOptions(
292-
of opts: AST.MatchingOptionSequence
293-
) {
292+
of opts: AST.MatchingOptionSequence, isScoped: Bool
293+
) throws {
294294
func mapOption(_ option: SyntaxOptions,
295295
_ pred: (AST.MatchingOption) -> Bool) {
296296
if opts.resetsCurrentOptions {
@@ -311,22 +311,41 @@ extension Parser {
311311
mapOption(.namedCapturesOnly, .namedCapturesOnly)
312312

313313
// (?x), (?xx)
314-
// We skip this for multi-line, as extended syntax is always enabled there.
314+
// This cannot be unset in a multi-line literal, unless in a scoped group
315+
// e.g. (?-x:...). We later enforce that such a group does not span multiple
316+
// lines.
315317
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
316318
// handles non-semantic whitespace in a custom character class. Other
317319
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
318320
// treat (?x) and (?xx) as the same option here. If we ever get a strict
319321
// PCRE mode, we will need to change this to handle that.
320-
if !context.syntax.contains(.multilineExtendedSyntax) {
322+
if !isScoped && context.syntax.contains(.multilineCompilerLiteral) {
323+
// An unscoped removal of extended syntax is not allowed in a multi-line
324+
// literal.
325+
if let opt = opts.removing.first(where: \.isAnyExtended) {
326+
throw Source.LocatedError(
327+
ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location)
328+
}
329+
if opts.resetsCurrentOptions {
330+
throw Source.LocatedError(
331+
ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!)
332+
}
333+
// The only remaining case is an unscoped addition of extended syntax,
334+
// which is a no-op.
335+
} else {
336+
// We either have a scoped change of extended syntax, or this is a
337+
// single-line literal.
321338
mapOption(.extendedSyntax, \.isAnyExtended)
322339
}
323340
}
324341

325342
/// Apply the syntax options of a matching option changing group to the
326343
/// current set of options.
327-
private mutating func applySyntaxOptions(of group: AST.Group.Kind) {
344+
private mutating func applySyntaxOptions(
345+
of group: AST.Group.Kind, isScoped: Bool
346+
) throws {
328347
if case .changeMatchingOptions(let seq) = group {
329-
applySyntaxOptions(of: seq)
348+
try applySyntaxOptions(of: seq, isScoped: isScoped)
330349
}
331350
}
332351

@@ -337,14 +356,25 @@ extension Parser {
337356
context.recordGroup(kind.value)
338357

339358
let currentSyntax = context.syntax
340-
applySyntaxOptions(of: kind.value)
359+
try applySyntaxOptions(of: kind.value, isScoped: true)
341360
defer {
342361
context.syntax = currentSyntax
343362
}
344-
363+
let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) &&
364+
!context.syntax.contains(.extendedSyntax)
345365
let child = try parseNode()
346366
try source.expect(")")
347-
return .init(kind, child, loc(start))
367+
let groupLoc = loc(start)
368+
369+
// In multi-line literals, the body of a group that unsets extended syntax
370+
// may not span multiple lines.
371+
if unsetsExtendedSyntax &&
372+
context.syntax.contains(.multilineCompilerLiteral) &&
373+
source[child.location.range].spansMultipleLinesInRegexLiteral {
374+
throw Source.LocatedError(
375+
ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc)
376+
}
377+
return .init(kind, child, groupLoc)
348378
}
349379

350380
/// Consume the body of an absent function.
@@ -438,7 +468,7 @@ extension Parser {
438468
// If we have a change matching options atom, apply the syntax options. We
439469
// already take care of scoping syntax options within a group.
440470
if case .changeMatchingOptions(let opts) = atom.kind {
441-
applySyntaxOptions(of: opts)
471+
try applySyntaxOptions(of: opts, isScoped: false)
442472
}
443473
// TODO: track source locations
444474
return .atom(atom)
@@ -592,7 +622,7 @@ public func parse<S: StringProtocol>(
592622
return ast
593623
}
594624

595-
extension String {
625+
extension StringProtocol {
596626
/// Whether the given string is considered multi-line for a regex literal.
597627
var spansMultipleLinesInRegexLiteral: Bool {
598628
unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" })
@@ -609,7 +639,7 @@ fileprivate func defaultSyntaxOptions(
609639
// For an extended syntax forward slash e.g #/.../#, extended syntax is
610640
// permitted if it spans multiple lines.
611641
if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral {
612-
return .multilineExtendedSyntax
642+
return [.multilineCompilerLiteral, .extendedSyntax]
613643
}
614644
return .traditional
615645
case .reSingleQuote:

Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,10 @@ public struct SyntaxOptions: OptionSet {
5858
/// `(_: .*)` == `(?:.*)`
5959
public static var experimentalCaptures: Self { Self(1 << 5) }
6060

61-
/// The default syntax for a multi-line regex literal.
62-
public static var multilineExtendedSyntax: Self {
63-
return [Self(1 << 6), .extendedSyntax]
64-
}
61+
/// The syntax kind of a multi-line literal. This will always be set when
62+
/// parsing a multi-line `#/.../#` literal. Note this does not imply extended
63+
/// syntax, as that may be temporarily disabled while parsing.
64+
public static var multilineCompilerLiteral: Self { Self(1 << 6) }
6565

6666
/// `(?n)`
6767
public static var namedCapturesOnly: Self { Self(1 << 7) }
@@ -76,8 +76,8 @@ public struct SyntaxOptions: OptionSet {
7676
public static var traditional: Self { Self(0) }
7777

7878
public static var experimental: Self {
79-
// Experimental syntax enables everything except end-of-line comments.
80-
Self(~0).subtracting(.endOfLineComments)
79+
[.nonSemanticWhitespace, .experimentalQuotes, .experimentalComments,
80+
.experimentalRanges, .experimentalCaptures]
8181
}
8282

8383
// TODO: Probably want to model strict-PCRE etc. options too.

Tests/RegexTests/ParseTests.swift

Lines changed: 83 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,9 @@ extension RegexTests {
772772
syntax: .experimental)
773773
parseTest(#""\"""#, quote("\""), syntax: .experimental)
774774

775+
parseTest(#"(abc)"#, capture(concat("a", "b", "c")),
776+
syntax: .experimental, captures: [.cap])
777+
775778
// Quotes in character classes.
776779
parseTest(#"[\Q-\E]"#, charClass(quote_m("-")))
777780
parseTest(#"[\Qa-b[[*+\\E]"#, charClass(quote_m("a-b[[*+\\")))
@@ -1777,6 +1780,13 @@ extension RegexTests {
17771780
" ", "b"
17781781
)
17791782
)
1783+
parseTest(
1784+
"(?x) a (?^: b)", concat(
1785+
changeMatchingOptions(matchingOptions(adding: .extended)),
1786+
"a",
1787+
changeMatchingOptions(unsetMatchingOptions(), concat(" ", "b"))
1788+
)
1789+
)
17801790

17811791
parseTest("[ # abc]", charClass(" ", "#", " ", "a", "b", "c"))
17821792
parseTest("[#]", charClass("#"))
@@ -2099,6 +2109,17 @@ extension RegexTests {
20992109
throwsError: .unsupported, syntax: .extendedSyntax
21002110
)
21012111

2112+
parseWithDelimitersTest(
2113+
#"""
2114+
#/
2115+
a\
2116+
b\
2117+
c
2118+
/#
2119+
"""#,
2120+
concat("a", "\n", "b", "\n", "c")
2121+
)
2122+
21022123
// MARK: Parse with delimiters
21032124

21042125
parseWithDelimitersTest("/a b/", concat("a", " ", "b"))
@@ -2174,22 +2195,40 @@ extension RegexTests {
21742195
/#
21752196
""", concat("a", "b"))
21762197

2177-
// Make sure (?^) is ignored.
2198+
// (?x) has no effect.
21782199
parseWithDelimitersTest("""
21792200
#/
2180-
(?^)
2201+
(?x)
21812202
# comment
21822203
/#
2183-
""", changeMatchingOptions(unsetMatchingOptions())
2204+
""", changeMatchingOptions(matchingOptions(adding: .extended))
21842205
)
21852206

2186-
// (?x) has no effect.
2207+
// Scoped removal of extended syntax is allowed as long as it does not span
2208+
// multiple lines.
21872209
parseWithDelimitersTest("""
21882210
#/
2189-
(?x)
2190-
# comment
2211+
(?-x:a b)
21912212
/#
2192-
""", changeMatchingOptions(matchingOptions(adding: .extended))
2213+
""", changeMatchingOptions(
2214+
matchingOptions(removing: .extended),
2215+
concat("a", " ", "b")
2216+
)
2217+
)
2218+
parseWithDelimitersTest("""
2219+
#/
2220+
(?-xx:a b)
2221+
/#
2222+
""", changeMatchingOptions(
2223+
matchingOptions(removing: .extraExtended),
2224+
concat("a", " ", "b")
2225+
)
2226+
)
2227+
parseWithDelimitersTest("""
2228+
#/
2229+
(?^: a b ) # comment
2230+
/#
2231+
""", changeMatchingOptions(unsetMatchingOptions(), concat(" ", "a", " ", "b", " "))
21932232
)
21942233

21952234
parseWithDelimitersTest(#"""
@@ -2773,17 +2812,50 @@ extension RegexTests {
27732812
/#
27742813
""", .cannotRemoveExtendedSyntaxInMultilineMode
27752814
)
2815+
2816+
// Scoped removal of extended syntax may not span multiple lines
27762817
diagnosticWithDelimitersTest("""
27772818
#/
2778-
(?-x:a b)
2819+
(?-x:a b
2820+
)
27792821
/#
2780-
""", .cannotRemoveExtendedSyntaxInMultilineMode
2822+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
27812823
)
27822824
diagnosticWithDelimitersTest("""
27832825
#/
2784-
(?-xx:a b)
2826+
(?-x:a
2827+
b)
27852828
/#
2786-
""", .cannotRemoveExtendedSyntaxInMultilineMode
2829+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
2830+
)
2831+
diagnosticWithDelimitersTest("""
2832+
#/
2833+
(?-xx:
2834+
a b)
2835+
/#
2836+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
2837+
)
2838+
diagnosticWithDelimitersTest("""
2839+
#/
2840+
(?x-x:
2841+
a b)
2842+
/#
2843+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
2844+
)
2845+
diagnosticWithDelimitersTest("""
2846+
#/
2847+
(?^)
2848+
# comment
2849+
/#
2850+
""", .cannotResetExtendedSyntaxInMultilineMode
2851+
)
2852+
diagnosticWithDelimitersTest("""
2853+
#/
2854+
(?^:
2855+
# comment
2856+
)
2857+
/#
2858+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
27872859
)
27882860

27892861
diagnosticWithDelimitersTest(#"""

0 commit comments

Comments
 (0)