Skip to content

Commit 415b080

Browse files
authored
Merge pull request #594 from hamishknight/limited-edition
2 parents 9762399 + 1b7779a commit 415b080

File tree

4 files changed

+37
-159
lines changed

4 files changed

+37
-159
lines changed

Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift

Lines changed: 2 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ public struct Delimiter: Hashable {
3131
switch kind {
3232
case .forwardSlash:
3333
return poundCount > 0
34-
case .experimental, .reSingleQuote, .rxSingleQuote:
34+
case .experimental:
3535
return false
3636
}
3737
}
@@ -47,15 +47,11 @@ extension Delimiter {
4747
enum Kind: Hashable, CaseIterable {
4848
case forwardSlash
4949
case experimental
50-
case reSingleQuote
51-
case rxSingleQuote
5250

5351
var openingAndClosing: (opening: String, closing: String) {
5452
switch self {
5553
case .forwardSlash: return ("/", "/")
5654
case .experimental: return ("#|", "|#")
57-
case .reSingleQuote: return ("re'", "'")
58-
case .rxSingleQuote: return ("rx'", "'")
5955
}
6056
}
6157
var opening: String { openingAndClosing.opening }
@@ -67,7 +63,7 @@ extension Delimiter {
6763
switch self {
6864
case .forwardSlash:
6965
return true
70-
case .experimental, .reSingleQuote, .rxSingleQuote:
66+
case .experimental:
7167
return false
7268
}
7369
}
@@ -150,14 +146,6 @@ fileprivate struct DelimiterLexer {
150146
slice(at: cursor, count)
151147
}
152148

153-
/// Return the slice of `count` bytes preceding the current cursor, or `nil`
154-
/// if there are fewer than `count` bytes before the cursor.
155-
func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? {
156-
let priorCursor = cursor - count
157-
guard priorCursor >= start else { return nil }
158-
return slice(at: priorCursor, count)
159-
}
160-
161149
/// Advance the cursor `n` bytes.
162150
mutating func advanceCursor(_ n: Int = 1) {
163151
cursor += n
@@ -186,86 +174,6 @@ fileprivate struct DelimiterLexer {
186174
return true
187175
}
188176

189-
/// Attempt to skip over a closing delimiter character that is unlikely to be
190-
/// the actual closing delimiter.
191-
mutating func trySkipDelimiter(_ delimiter: Delimiter) {
192-
// Only the closing `'` for re'...'/rx'...' can potentially be skipped over.
193-
switch delimiter.kind {
194-
case .forwardSlash, .experimental:
195-
return
196-
case .reSingleQuote, .rxSingleQuote:
197-
break
198-
}
199-
guard load() == ascii("'") else { return }
200-
201-
/// Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those
202-
/// are the cases that could use single quotes. Note that none of these
203-
/// would be valid regex endings anyway.
204-
let calloutPrefix = "(?C"
205-
let prefix = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix].first { prior in
206-
guard let priorSlice = sliceBehind(prior.utf8.count),
207-
priorSlice.elementsEqual(prior.utf8)
208-
else { return false }
209-
210-
// Make sure the slice isn't preceded by a '\', as that invalidates this
211-
// analysis.
212-
if let prior = sliceBehind(priorSlice.count + 1) {
213-
return prior[0] != ascii("\\")
214-
}
215-
return true
216-
}
217-
guard let prefix = prefix else { return }
218-
let isCallout = prefix == calloutPrefix
219-
220-
func isPossiblyGroupReference(_ c: UInt8) -> Bool {
221-
// If this is an ASCII character, make sure it's for a group name. Leave
222-
// other UTF-8 encoded scalars alone, this should at least catch cases
223-
// where we run into a symbol such as `{`, `.`, `;` that would indicate
224-
// we've likely advanced out of the bounds of the regex.
225-
let scalar = UnicodeScalar(c)
226-
guard scalar.isASCII else { return true }
227-
switch scalar {
228-
// Include '-' and '+' which may be used in recursion levels and relative
229-
// references.
230-
case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+":
231-
return true
232-
default:
233-
return false
234-
}
235-
}
236-
237-
// Make a note of the current lexing position, as we may need to revert
238-
// back to it.
239-
let originalCursor = cursor
240-
advanceCursor()
241-
242-
// Try skip over what would be the contents of a group identifier/reference.
243-
while let next = load() {
244-
// Found the ending, we're done. Return so we can continue to lex to the
245-
// real delimiter.
246-
if next == ascii("'") {
247-
advanceCursor()
248-
return
249-
}
250-
251-
// If this isn't a callout, make sure we have something that could be a
252-
// group reference. We limit the character set here to improve diagnostic
253-
// behavior in the case where the literal is actually unterminated. We
254-
// ideally don't want to go wandering off into Swift source code. We can't
255-
// do the same for callouts, as they take arbitrary strings.
256-
guard isCallout || isPossiblyGroupReference(next) else { break }
257-
do {
258-
try advance()
259-
} catch {
260-
break
261-
}
262-
}
263-
// We bailed out, either because we ran into something that didn't look like
264-
// an identifier, or we reached the end of the line. Revert back to the
265-
// original guess of delimiter.
266-
cursor = originalCursor
267-
}
268-
269177
/// Attempt to eat a particular closing delimiter, returning the contents of
270178
/// the literal, and ending pointer, or `nil` if this is not a delimiter
271179
/// ending.
@@ -401,10 +309,6 @@ fileprivate struct DelimiterLexer {
401309
}
402310
}
403311
while true {
404-
// Check to see if we're at a character that looks like a delimiter, but
405-
// likely isn't. In such a case, we can attempt to skip over it.
406-
trySkipDelimiter(delimiter)
407-
408312
// Try to lex the closing delimiter.
409313
if let (contents, end) = try tryEatEnding(delimiter,
410314
contentsStart: contentsStart) {

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -672,9 +672,7 @@ fileprivate func defaultSyntaxOptions(
672672
return [.multilineCompilerLiteral, .extendedSyntax]
673673
}
674674
return .traditional
675-
case .reSingleQuote:
676-
return .traditional
677-
case .experimental, .rxSingleQuote:
675+
case .experimental:
678676
return .experimental
679677
}
680678
}

Tests/RegexTests/LexTests.swift

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,6 @@ extension RegexTests {
9696
("#|abc/#def#", nil),
9797
("#/abc\n/#", nil),
9898
("#/abc\r/#", nil),
99-
100-
(#"re'abcre\''"#, (#"abcre\'"#, delim(.reSingleQuote))),
101-
(#"re'\'"#, nil)
10299
]
103100

104101
for (input, expected) in testCases {

Tests/RegexTests/ParseTests.swift

Lines changed: 34 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2151,9 +2151,6 @@ extension RegexTests {
21512151
parseWithDelimitersTest("##/a b/##", concat("a", " ", "b"))
21522152
parseWithDelimitersTest("#|a b|#", concat("a", "b"))
21532153

2154-
parseWithDelimitersTest("re'a b'", concat("a", " ", "b"))
2155-
parseWithDelimitersTest("rx'a b'", concat("a", "b"))
2156-
21572154
parseWithDelimitersTest("#|[a b]|#", charClass("a", "b"))
21582155
parseWithDelimitersTest(
21592156
"#|(?-x)[a b]|#", concat(
@@ -2176,13 +2173,13 @@ extension RegexTests {
21762173
parseWithDelimitersTest("#||||#", alt(empty(), empty(), empty()))
21772174
parseWithDelimitersTest("#|a||#", alt("a", empty()))
21782175

2179-
parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x"))
2176+
parseWithDelimitersTest("/x*/", zeroOrMore(of: "x"))
21802177

2181-
parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰"))
2182-
parseWithDelimitersTest(#"re'🔥✅'"#, concat("🔥", "✅"))
2178+
parseWithDelimitersTest(#"/🔥🇩🇰/"#, concat("🔥", "🇩🇰"))
2179+
parseWithDelimitersTest(#"/🔥✅/"#, concat("🔥", "✅"))
21832180

21842181
// Printable ASCII characters.
2185-
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
2182+
delimiterLexingTest(##"#/ !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~/#"##)
21862183

21872184
// Make sure we can handle a combining accent as first character.
21882185
parseWithDelimitersTest("/\u{301}/", "\u{301}")
@@ -2294,72 +2291,61 @@ extension RegexTests {
22942291
/#
22952292
"""#, charClass(range_m("a", "b")))
22962293

2297-
2298-
// MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
2299-
// if it's clear that it's part of the regex syntax.
2300-
23012294
parseWithDelimitersTest(
2302-
#"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'"))
2295+
#"/(?'a_bcA0'\')/"#, namedCapture("a_bcA0", "'"))
23032296
parseWithDelimitersTest(
2304-
#"re'(?'a_bcA0-c1A'x*)'"#,
2297+
#"/(?'a_bcA0-c1A'x*)/"#,
23052298
balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")),
23062299
unsupported: true)
23072300

23082301
parseWithDelimitersTest(
2309-
#"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b"))))
2302+
#"/ (?'a_bcA0' a b)/"#, concat(" ", namedCapture("a_bcA0", concat(" ", "a", " ", "b"))))
23102303

23112304
parseWithDelimitersTest(
2312-
#"re'(?('a_bcA0')x|y)'"#, conditional(
2305+
#"/(?('a_bcA0')x|y)/"#, conditional(
23132306
.groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"),
23142307
unsupported: true
23152308
)
23162309
parseWithDelimitersTest(
2317-
#"re'(?('+20')\')'"#, conditional(
2310+
#"/(?('+20')\')/"#, conditional(
23182311
.groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()),
23192312
unsupported: true
23202313
)
23212314
parseWithDelimitersTest(
2322-
#"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalidNamedReference("b0A"))
2315+
#"/a\k'b0A'/"#, concat("a", backreference(.named("b0A"))), throwsError: .invalidNamedReference("b0A"))
23232316
parseWithDelimitersTest(
2324-
#"re'\k'+2-1''"#, backreference(ref(plus: 2), recursionLevel: -1),
2317+
#"/\k'+2-1'/"#, backreference(ref(plus: 2), recursionLevel: -1),
23252318
unsupported: true
23262319
)
23272320

23282321
parseWithDelimitersTest(
2329-
#"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), unsupported: true)
2322+
#"/a\g'b0A'/"#, concat("a", subpattern(.named("b0A"))), unsupported: true)
23302323
parseWithDelimitersTest(
2331-
#"re'\g'-1'\''"#, concat(subpattern(ref(minus: 1)), "'"), unsupported: true)
2324+
#"/\g'-1'\'/"#, concat(subpattern(ref(minus: 1)), "'"), unsupported: true)
23322325

23332326
parseWithDelimitersTest(
2334-
#"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(string: #"a*b\c 🔥_ ;"#),
2327+
#"/(?C'a*b\c 🔥_ ;')/"#, pcreCallout(string: #"a*b\c 🔥_ ;"#),
23352328
unsupported: true)
23362329

2337-
// Fine, because we don't end up skipping.
2338-
delimiterLexingTest(#"re'(?'"#)
2339-
delimiterLexingTest(#"re'(?('"#)
2340-
delimiterLexingTest(#"re'\k'"#)
2341-
delimiterLexingTest(#"re'\g'"#)
2342-
delimiterLexingTest(#"re'(?C'"#)
2330+
delimiterLexingTest(#"/(?/"#)
2331+
delimiterLexingTest(#"/(?(/"#)
2332+
delimiterLexingTest(#"/\k/"#)
2333+
delimiterLexingTest(#"/\g/"#)
2334+
delimiterLexingTest(#"/(?C/"#)
23432335

2344-
// Not a valid group name, but we can still skip over it.
2345-
delimiterLexingTest(#"re'(?'🔥')'"#)
2336+
delimiterLexingTest(#"/(?'🔥')/"#)
23462337

2347-
// Escaped, so don't skip. These will ignore the ending `'` as we've already
2348-
// closed the literal.
23492338
parseWithDelimitersTest(
2350-
#"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true
2351-
)
2339+
#"/\(?/"#, zeroOrOne(of: "("))
23522340
parseWithDelimitersTest(
2353-
#"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true
2354-
)
2341+
#"/\\k/"#, concat("\\", "k"))
23552342
parseWithDelimitersTest(
2356-
#"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true
2357-
)
2343+
#"/\\g/"#, concat("\\", "g"))
23582344
parseWithDelimitersTest(
2359-
#"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true
2360-
)
2361-
delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true)
2362-
delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true)
2345+
#"/\(?C/"#, concat(zeroOrOne(of: "("), "C"))
2346+
2347+
delimiterLexingTest(#"/(\?/"#)
2348+
delimiterLexingTest(#"/\(?(/"#)
23632349

23642350
// MARK: Parse not-equal
23652351

@@ -3322,31 +3308,24 @@ extension RegexTests {
33223308

33233309
// MARK: Printable ASCII
33243310

3325-
delimiterLexingDiagnosticTest(#"re'\\#n'"#, .unterminated)
33263311
for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r.
3327-
delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII)
3312+
delimiterLexingDiagnosticTest("/\(UnicodeScalar(i))/", .unprintableASCII)
33283313
}
3329-
delimiterLexingDiagnosticTest("re'\n'", .unterminated)
3330-
delimiterLexingDiagnosticTest("re'\r'", .unterminated)
3331-
delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)
33323314

3333-
// MARK: Delimiter skipping
3315+
// A literal can only span multiple lines if pound signs are used.
3316+
delimiterLexingDiagnosticTest("/\n/", .unterminated)
3317+
delimiterLexingDiagnosticTest("/\r/", .unterminated)
3318+
delimiterLexingDiagnosticTest("/\u{7F}/", .unprintableASCII)
33343319

3335-
delimiterLexingDiagnosticTest("re'(?''", .unterminated)
3336-
delimiterLexingDiagnosticTest("re'(?'abc'", .unterminated)
3337-
delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated)
3338-
delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated)
3339-
delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated)
3320+
delimiterLexingDiagnosticTest("/", .unterminated)
3321+
delimiterLexingDiagnosticTest("/x", .unterminated)
33403322

33413323
// MARK: Unbalanced extended syntax
33423324
delimiterLexingDiagnosticTest("#/a/", .unterminated)
33433325
delimiterLexingDiagnosticTest("##/a/#", .unterminated)
33443326

33453327
// MARK: Multiline
33463328

3347-
// Can only be done if pound signs are used.
3348-
delimiterLexingDiagnosticTest("/\n/", .unterminated)
3349-
33503329
// Opening and closing delimiters must be on a newline.
33513330
delimiterLexingDiagnosticTest("#/a\n/#", .unterminated)
33523331
delimiterLexingDiagnosticTest("#/\na/#", .multilineClosingNotOnNewline)

0 commit comments

Comments
 (0)