Skip to content

Commit 57a5092

Browse files
committed
Allow unbounded quoted sequences \Q...
PCRE and ICU both support quoted sequences that don't have a terminating `\E`; such a sequence extends to the end of the pattern. Update the parser to allow this. Additionally, allow empty quoted sequences outside of custom character classes, which is consistent with ICU. Finally, don't allow quoted sequences to span multiple lines in multi-line (extended syntax) literals.
1 parent 471e073 commit 57a5092

File tree

4 files changed

+60
-6
lines changed

4 files changed

+60
-6
lines changed

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ enum ParseError: Error, Hashable {
4444
case invalidEscape(Character)
4545
case confusableCharacter(Character)
4646

47+
case quoteMayNotSpanMultipleLines
48+
4749
case cannotReferToWholePattern
4850

4951
case quantifierRequiresOperand(String)
@@ -138,6 +140,8 @@ extension ParseError: CustomStringConvertible {
138140
return "invalid escape sequence '\\\(c)'"
139141
case .confusableCharacter(let c):
140142
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
143+
case .quoteMayNotSpanMultipleLines:
144+
return "quoted sequence may not span multiple lines in multi-line literal"
141145
case .cannotReferToWholePattern:
142146
return "cannot refer to whole pattern here"
143147
case .quantifierRequiresOperand(let q):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,7 @@ extension Source {
579579

580580
/// Try to consume quoted content
581581
///
582-
/// Quote -> '\Q' (!'\E' .)* '\E'
582+
/// Quote -> '\Q' (!'\E' .)* '\E'?
583583
///
584584
/// With `SyntaxOptions.experimentalQuotes`, also accepts
585585
///
@@ -592,9 +592,24 @@ extension Source {
592592
mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? {
593593
let str = try recordLoc { src -> String? in
594594
if src.tryEat(sequence: #"\Q"#) {
595-
return try src.expectQuoted(endingWith: #"\E"#).value
595+
let contents = src.lexUntil { src in
596+
src.isEmpty || src.tryEat(sequence: #"\E"#)
597+
}.value
598+
599+
// In multi-line literals, the quote may not span multiple lines.
600+
if context.syntax.contains(.multilineExtendedSyntax),
601+
contents.spansMultipleLinesInRegexLiteral {
602+
throw ParseError.quoteMayNotSpanMultipleLines
603+
}
604+
605+
// The sequence must not be empty in a custom character class.
606+
if context.isInCustomCharacterClass && contents.isEmpty {
607+
throw ParseError.expectedNonEmptyContents
608+
}
609+
return contents
596610
}
597611
if context.experimentalQuotes, src.tryEat("\"") {
612+
// TODO: Can experimental quotes be empty?
598613
return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value
599614
}
600615
return nil

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,13 @@ public func parse<S: StringProtocol>(
592592
return ast
593593
}
594594

595+
extension String {
596+
/// Whether the given string is considered multi-line for a regex literal.
597+
var spansMultipleLinesInRegexLiteral: Bool {
598+
unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" })
599+
}
600+
}
601+
595602
/// Retrieve the default set of syntax options that a delimiter and literal
596603
/// contents indicates.
597604
fileprivate func defaultSyntaxOptions(
@@ -601,8 +608,7 @@ fileprivate func defaultSyntaxOptions(
601608
case .forwardSlash:
602609
// For an extended syntax forward slash e.g #/.../#, extended syntax is
603610
// permitted if it spans multiple lines.
604-
if delim.poundCount > 0 &&
605-
contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) {
611+
if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral {
606612
return .multilineExtendedSyntax
607613
}
608614
return .traditional

Tests/RegexTests/ParseTests.swift

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,14 @@ extension RegexTests {
754754
// This follows the PCRE behavior.
755755
parseTest(#"\Q\\E"#, quote("\\"))
756756

757+
// ICU allows quotes to be empty outside of custom character classes.
758+
parseTest(#"\Q\E"#, quote(""))
759+
760+
// Quotes may be unterminated.
761+
parseTest(#"\Qab"#, quote("ab"))
762+
parseTest(#"\Q"#, quote(""))
763+
parseTest("\\Qab\\", quote("ab\\"))
764+
757765
parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
758766
syntax: .experimental)
759767
parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")),
@@ -2539,8 +2547,6 @@ extension RegexTests {
25392547
diagnosticTest(#"(?P"#, .expected(")"))
25402548
diagnosticTest(#"(?R"#, .expected(")"))
25412549

2542-
diagnosticTest(#"\Qab"#, .expected("\\E"))
2543-
diagnosticTest("\\Qab\\", .expected("\\E"))
25442550
diagnosticTest(#""ab"#, .expected("\""), syntax: .experimental)
25452551
diagnosticTest(#""ab\""#, .expected("\""), syntax: .experimental)
25462552
diagnosticTest("\"ab\\", .expectedEscape, syntax: .experimental)
@@ -2619,6 +2625,9 @@ extension RegexTests {
26192625
// TODO: Custom diagnostic for missing '\Q'
26202626
diagnosticTest(#"\E"#, .invalidEscape("E"))
26212627

2628+
diagnosticTest(#"[\Q\E]"#, .expectedNonEmptyContents)
2629+
diagnosticTest(#"[\Q]"#, .expected("]"))
2630+
26222631
// PCRE treats these as octal, but we require a `0` prefix.
26232632
diagnosticTest(#"[\1]"#, .invalidEscape("1"))
26242633
diagnosticTest(#"[\123]"#, .invalidEscape("1"))
@@ -2711,6 +2720,26 @@ extension RegexTests {
27112720
""", .cannotRemoveExtendedSyntaxInMultilineMode
27122721
)
27132722

2723+
diagnosticWithDelimitersTest(#"""
2724+
#/
2725+
\Q
2726+
\E
2727+
/#
2728+
"""#, .quoteMayNotSpanMultipleLines)
2729+
2730+
diagnosticWithDelimitersTest(#"""
2731+
#/
2732+
\Qabc
2733+
\E
2734+
/#
2735+
"""#, .quoteMayNotSpanMultipleLines)
2736+
2737+
diagnosticWithDelimitersTest(#"""
2738+
#/
2739+
\Q
2740+
/#
2741+
"""#, .quoteMayNotSpanMultipleLines)
2742+
27142743
// MARK: Group specifiers
27152744

27162745
diagnosticTest(#"(*"#, .unknownGroupKind("*"))

0 commit comments

Comments
 (0)