Remove re'...' and rx'...' delimiters #594

Merged: 1 commit, Aug 2, 2022

100 changes: 2 additions & 98 deletions Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
@@ -31,7 +31,7 @@ public struct Delimiter: Hashable {
switch kind {
case .forwardSlash:
return poundCount > 0
case .experimental, .reSingleQuote, .rxSingleQuote:
case .experimental:
return false
}
}
@@ -47,15 +47,11 @@ extension Delimiter {
enum Kind: Hashable, CaseIterable {
case forwardSlash
case experimental
case reSingleQuote
case rxSingleQuote

var openingAndClosing: (opening: String, closing: String) {
switch self {
case .forwardSlash: return ("/", "/")
case .experimental: return ("#|", "|#")
case .reSingleQuote: return ("re'", "'")
case .rxSingleQuote: return ("rx'", "'")
}
}
var opening: String { openingAndClosing.opening }
@@ -67,7 +63,7 @@ extension Delimiter {
switch self {
case .forwardSlash:
return true
case .experimental, .reSingleQuote, .rxSingleQuote:
case .experimental:
return false
}
}
@@ -150,14 +146,6 @@ fileprivate struct DelimiterLexer {
slice(at: cursor, count)
}

/// Return the slice of `count` bytes preceding the current cursor, or `nil`
/// if there are fewer than `count` bytes before the cursor.
func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? {
let priorCursor = cursor - count
guard priorCursor >= start else { return nil }
return slice(at: priorCursor, count)
}

/// Advance the cursor `n` bytes.
mutating func advanceCursor(_ n: Int = 1) {
cursor += n
@@ -186,86 +174,6 @@ fileprivate struct DelimiterLexer {
return true
}

/// Attempt to skip over a closing delimiter character that is unlikely to be
/// the actual closing delimiter.
mutating func trySkipDelimiter(_ delimiter: Delimiter) {
// Only the closing `'` for re'...'/rx'...' can potentially be skipped over.
switch delimiter.kind {
case .forwardSlash, .experimental:
return
case .reSingleQuote, .rxSingleQuote:
break
}
guard load() == ascii("'") else { return }

/// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those
/// are the cases that could use single quotes. Note that none of these
/// would be valid regex endings anyway.
let calloutPrefix = "(?C"
let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in
guard let priorSlice = sliceBehind(prior.utf8.count),
priorSlice.elementsEqual(prior.utf8)
else { return false }

// Make sure the slice isn't preceded by a '\', as that invalidates this
// analysis.
if let prior = sliceBehind(priorSlice.count + 1) {
return prior[0] != ascii("\\")
}
return true
}
guard let prefix = prefix else { return }
let isCallout = prefix == calloutPrefix

func isPossiblyGroupReference(_ c: UInt8) -> Bool {
// If this is an ASCII character, make sure it's for a group name. Leave
// other UTF-8 encoded scalars alone, this should at least catch cases
// where we run into a symbol such as `{`, `.`, `;` that would indicate
// we've likely advanced out of the bounds of the regex.
let scalar = UnicodeScalar(c)
guard scalar.isASCII else { return true }
switch scalar {
// Include '-' and '+' which may be used in recursion levels and relative
// references.
case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+":
return true
default:
return false
}
}

// Make a note of the current lexing position, as we may need to revert
// back to it.
let originalCursor = cursor
advanceCursor()

// Try skip over what would be the contents of a group identifier/reference.
while let next = load() {
// Found the ending, we're done. Return so we can continue to lex to the
// real delimiter.
if next == ascii("'") {
advanceCursor()
return
}

// If this isn't a callout, make sure we have something that could be a
// group reference. We limit the character set here to improve diagnostic
// behavior in the case where the literal is actually unterminated. We
// ideally don't want to go wandering off into Swift source code. We can't
// do the same for callouts, as they take arbitrary strings.
guard isCallout || isPossiblyGroupReference(next) else { break }
do {
try advance()
} catch {
break
}
}
// We bailed out, either because we ran into something that didn't look like
// an identifier, or we reached the end of the line. Revert back to the
// original guess of delimiter.
cursor = originalCursor
}

/// Attempt to eat a particular closing delimiter, returning the contents of
/// the literal, and ending pointer, or `nil` if this is not a delimiter
/// ending.
@@ -401,10 +309,6 @@ fileprivate struct DelimiterLexer {
}
}
while true {
// Check to see if we're at a character that looks like a delimiter, but
// likely isn't. In such a case, we can attempt to skip over it.
trySkipDelimiter(delimiter)

// Try to lex the closing delimiter.
if let (contents, end) = try tryEatEnding(delimiter,
contentsStart: contentsStart) {
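
For quick reference, a minimal sketch of what this change means for literal spellings (illustrative only, not part of the diff; assumes Swift 5.7+ with bare-slash regex literals enabled, and the variable names are hypothetical):

// Spellings that continue to lex as regex literals:
let bare  = /a b/      // traditional syntax; the space matches literally
let pound = #/a b/#    // pound-extended delimiter, same traditional syntax

// Spellings removed by this PR; they no longer lex as regex literals:
// let re = re'a b'    // was traditional syntax
// let rx = rx'a b'    // was experimental, whitespace-insensitive syntax
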
4 changes: 1 addition & 3 deletions Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -672,9 +672,7 @@ fileprivate func defaultSyntaxOptions(
return [.multilineCompilerLiteral, .extendedSyntax]
}
return .traditional
case .reSingleQuote:
return .traditional
case .experimental, .rxSingleQuote:
case .experimental:
return .experimental
}
}
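
As context for the defaultSyntaxOptions hunk above, a hedged sketch of the mapping it encodes: a forward-slash literal parses with traditional syntax, unless it is a multi-line pound-delimited literal, which also opts into extended (whitespace-insensitive) syntax. Illustrative only; the variable names are assumptions, not part of the diff.

// Assumes Swift 5.7+ regex literals.
let singleLine = #/a b/#   // traditional syntax: the space is significant
let multiLine = #/
  a b   # extended syntax: whitespace is non-semantic, so this matches "ab"
  /#
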
3 changes: 0 additions & 3 deletions Tests/RegexTests/LexTests.swift
@@ -96,9 +96,6 @@ extension RegexTests {
("#|abc/#def#", nil),
("#/abc\n/#", nil),
("#/abc\r/#", nil),

(#"re'abcre\''"#, (#"abcre\'"#, delim(.reSingleQuote))),
(#"re'\'"#, nil)
]

for (input, expected) in testCases {
89 changes: 34 additions & 55 deletions Tests/RegexTests/ParseTests.swift
@@ -2151,9 +2151,6 @@ extension RegexTests {
parseWithDelimitersTest("##/a b/##", concat("a", " ", "b"))
parseWithDelimitersTest("#|a b|#", concat("a", "b"))

parseWithDelimitersTest("re'a b'", concat("a", " ", "b"))
parseWithDelimitersTest("rx'a b'", concat("a", "b"))

parseWithDelimitersTest("#|[a b]|#", charClass("a", "b"))
parseWithDelimitersTest(
"#|(?-x)[a b]|#", concat(
@@ -2176,13 +2173,13 @@
parseWithDelimitersTest("#||||#", alt(empty(), empty(), empty()))
parseWithDelimitersTest("#|a||#", alt("a", empty()))

parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x"))
parseWithDelimitersTest("/x*/", zeroOrMore(of: "x"))

parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰"))
parseWithDelimitersTest(#"re'🔥✅'"#, concat("🔥", "✅"))
parseWithDelimitersTest(#"/🔥🇩🇰/"#, concat("🔥", "🇩🇰"))
parseWithDelimitersTest(#"/🔥✅/"#, concat("🔥", "✅"))

// Printable ASCII characters.
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
delimiterLexingTest(##"#/ !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~/#"##)

// Make sure we can handle a combining accent as first character.
parseWithDelimitersTest("/\u{301}/", "\u{301}")
@@ -2294,72 +2291,61 @@
/#
"""#, charClass(range_m("a", "b")))


// MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
// if it's clear that it's part of the regex syntax.

parseWithDelimitersTest(
#"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'"))
#"/(?'a_bcA0'\')/"#, namedCapture("a_bcA0", "'"))
parseWithDelimitersTest(
#"re'(?'a_bcA0-c1A'x*)'"#,
#"/(?'a_bcA0-c1A'x*)/"#,
balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")),
unsupported: true)

parseWithDelimitersTest(
#"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b"))))
#"/ (?'a_bcA0' a b)/"#, concat(" ", namedCapture("a_bcA0", concat(" ", "a", " ", "b"))))

parseWithDelimitersTest(
#"re'(?('a_bcA0')x|y)'"#, conditional(
#"/(?('a_bcA0')x|y)/"#, conditional(
.groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"),
unsupported: true
)
parseWithDelimitersTest(
#"re'(?('+20')\')'"#, conditional(
#"/(?('+20')\')/"#, conditional(
.groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()),
unsupported: true
)
parseWithDelimitersTest(
#"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalidNamedReference("b0A"))
#"/a\k'b0A'/"#, concat("a", backreference(.named("b0A"))), throwsError: .invalidNamedReference("b0A"))
parseWithDelimitersTest(
#"re'\k'+2-1''"#, backreference(ref(plus: 2), recursionLevel: -1),
#"/\k'+2-1'/"#, backreference(ref(plus: 2), recursionLevel: -1),
unsupported: true
)

parseWithDelimitersTest(
#"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), unsupported: true)
#"/a\g'b0A'/"#, concat("a", subpattern(.named("b0A"))), unsupported: true)
parseWithDelimitersTest(
#"re'\g'-1'\''"#, concat(subpattern(ref(minus: 1)), "'"), unsupported: true)
#"/\g'-1'\'/"#, concat(subpattern(ref(minus: 1)), "'"), unsupported: true)

parseWithDelimitersTest(
#"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(string: #"a*b\c 🔥_ ;"#),
#"/(?C'a*b\c 🔥_ ;')/"#, pcreCallout(string: #"a*b\c 🔥_ ;"#),
unsupported: true)

// Fine, because we don't end up skipping.
delimiterLexingTest(#"re'(?'"#)
delimiterLexingTest(#"re'(?('"#)
delimiterLexingTest(#"re'\k'"#)
delimiterLexingTest(#"re'\g'"#)
delimiterLexingTest(#"re'(?C'"#)
delimiterLexingTest(#"/(?/"#)
delimiterLexingTest(#"/(?(/"#)
delimiterLexingTest(#"/\k/"#)
delimiterLexingTest(#"/\g/"#)
delimiterLexingTest(#"/(?C/"#)

// Not a valid group name, but we can still skip over it.
delimiterLexingTest(#"re'(?'🔥')'"#)
delimiterLexingTest(#"/(?'🔥')/"#)

// Escaped, so don't skip. These will ignore the ending `'` as we've already
// closed the literal.
parseWithDelimitersTest(
#"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true
)
#"/\(?/"#, zeroOrOne(of: "("))
parseWithDelimitersTest(
#"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true
)
#"/\\k/"#, concat("\\", "k"))
parseWithDelimitersTest(
#"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true
)
#"/\\g/"#, concat("\\", "g"))
parseWithDelimitersTest(
#"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true
)
delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true)
delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true)
#"/\(?C/"#, concat(zeroOrOne(of: "("), "C"))

delimiterLexingTest(#"/(\?/"#)
delimiterLexingTest(#"/\(?(/"#)

// MARK: Parse not-equal

@@ -3322,31 +3308,24 @@ extension RegexTests {

// MARK: Printable ASCII

delimiterLexingDiagnosticTest(#"re'\\#n'"#, .unterminated)
for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r.
delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII)
delimiterLexingDiagnosticTest("/\(UnicodeScalar(i))/", .unprintableASCII)
}
delimiterLexingDiagnosticTest("re'\n'", .unterminated)
delimiterLexingDiagnosticTest("re'\r'", .unterminated)
delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)

// MARK: Delimiter skipping
// Can only be done if pound signs are used.
delimiterLexingDiagnosticTest("/\n/", .unterminated)
delimiterLexingDiagnosticTest("/\r/", .unterminated)
delimiterLexingDiagnosticTest("/\u{7F}/", .unprintableASCII)

delimiterLexingDiagnosticTest("re'(?''", .unterminated)
delimiterLexingDiagnosticTest("re'(?'abc'", .unterminated)
delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated)
delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated)
delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated)
delimiterLexingDiagnosticTest("/", .unterminated)
delimiterLexingDiagnosticTest("/x", .unterminated)

// MARK: Unbalanced extended syntax
delimiterLexingDiagnosticTest("#/a/", .unterminated)
delimiterLexingDiagnosticTest("##/a/#", .unterminated)

// MARK: Multiline

// Can only be done if pound signs are used.
delimiterLexingDiagnosticTest("/\n/", .unterminated)

// Opening and closing delimiters must be on a newline.
delimiterLexingDiagnosticTest("#/a\n/#", .unterminated)
delimiterLexingDiagnosticTest("#/\na/#", .multilineClosingNotOnNewline)
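
To round out the multi-line diagnostics above, a small sketch of the rules they exercise (restated from the test cases, not new behavior; names are hypothetical): multi-line literals require pound delimiters, the contents must start on a fresh line, and the closing delimiter must sit on its own line.

// Accepted: opening delimiter followed by a newline, closing delimiter on its own line.
let ok = #/
  abc
  /#

// Rejected, per the diagnostics above:
// #/a
// /#        .unterminated: contents starting on the opening line may not span lines
// #/
// abc/#     .multilineClosingNotOnNewline: the closing delimiter must be on its own line
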