Skip to content

Commit ed5aedb

Browse files
authored
Implement atomic non-capturing groups (#488)
* Add a `clearThrough` instruction: This will let us fix lookahead assertions that have leftover save points in the subpattern on success, and also allow us to implement atomic groups.
* Fix lookaheads with quantifiers: On success, the subpatterns in lookaheads like (?=.*e) had a save point that persisted, causing the logic in the lookahead group to be invalid.
* Implement atomic non-capturing group support: In addition to the (?>...) syntax, this is what's underneath `Local`.
1 parent 005e0fb commit ed5aedb

File tree

8 files changed

+183
-15
lines changed

8 files changed

+183
-15
lines changed

Sources/_RegexParser/Regex/Parse/Sema.swift

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,8 @@ extension RegexValidator {
325325
func validateGroup(_ group: AST.Group) throws {
326326
let kind = group.kind
327327
switch kind.value {
328-
case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead:
328+
case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead,
329+
.atomicNonCapturing:
329330
break
330331

331332
case .balancedCapture:
@@ -336,9 +337,6 @@ extension RegexValidator {
336337
// We need to figure out how these interact with typed captures.
337338
throw error(.unsupported("branch reset group"), at: kind.location)
338339

339-
case .atomicNonCapturing:
340-
throw error(.unsupported("atomic group"), at: kind.location)
341-
342340
case .nonAtomicLookahead:
343341
throw error(.unsupported("non-atomic lookahead"), at: kind.location)
344342

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ fileprivate extension Compiler.ByteCodeGen {
306306
save(restoringAt: success)
307307
save(restoringAt: intercept)
308308
<sub-pattern> // failure restores at intercept
309-
clearSavePoint // remove intercept
309+
clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
310310
<if negative>:
311311
clearSavePoint // remove success
312312
fail // positive->success, negative propagates
@@ -324,7 +324,7 @@ fileprivate extension Compiler.ByteCodeGen {
324324
builder.buildSave(success)
325325
builder.buildSave(intercept)
326326
try emitNode(child)
327-
builder.buildClear()
327+
builder.buildClearThrough(intercept)
328328
if !positive {
329329
builder.buildClear()
330330
}
@@ -339,6 +339,38 @@ fileprivate extension Compiler.ByteCodeGen {
339339
builder.label(success)
340340
}
341341

342+
mutating func emitAtomicNoncapturingGroup(
343+
_ child: DSLTree.Node
344+
) throws {
345+
/*
346+
save(continuingAt: success)
347+
save(restoringAt: intercept)
348+
<sub-pattern> // failure restores at intercept
349+
clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
350+
fail // ->success
351+
intercept:
352+
clearSavePoint // remove success
353+
fail // propagate failure
354+
success:
355+
...
356+
*/
357+
358+
let intercept = builder.makeAddress()
359+
let success = builder.makeAddress()
360+
361+
builder.buildSaveAddress(success)
362+
builder.buildSave(intercept)
363+
try emitNode(child)
364+
builder.buildClearThrough(intercept)
365+
builder.buildFail()
366+
367+
builder.label(intercept)
368+
builder.buildClear()
369+
builder.buildFail()
370+
371+
builder.label(success)
372+
}
373+
342374
mutating func emitMatcher(
343375
_ matcher: @escaping _MatcherInterface
344376
) -> ValueRegister {
@@ -384,6 +416,9 @@ fileprivate extension Compiler.ByteCodeGen {
384416
}
385417
options.apply(optionSequence)
386418
try emitNode(child)
419+
420+
case .atomicNonCapturing:
421+
try emitAtomicNoncapturingGroup(child)
387422

388423
default:
389424
// FIXME: Other kinds...

Sources/_StringProcessing/Engine/Instruction.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,13 @@ extension Instruction {
228228
/// Precondition: There is a save point to remove
229229
case clear
230230

231+
/// Remove save points up to and including the operand
232+
///
233+
/// Operand: instruction address to look for
234+
///
235+
/// Precondition: The operand is in the save point list
236+
case clearThrough
237+
231238
/// View the most recently saved point
232239
///
233240
/// UNIMPLEMENTED

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,10 @@ extension MEProgram.Builder {
153153
mutating func buildClear() {
154154
instructions.append(.init(.clear))
155155
}
156+
mutating func buildClearThrough(_ t: AddressToken) {
157+
instructions.append(.init(.clearThrough))
158+
fixup(to: t)
159+
}
156160
mutating func buildRestore() {
157161
instructions.append(.init(.restore))
158162
}
@@ -317,7 +321,7 @@ extension MEProgram.Builder {
317321
case .condBranchZeroElseDecrement:
318322
payload = .init(addr: addr, int: inst.payload.int)
319323

320-
case .branch, .save, .saveAddress, .call:
324+
case .branch, .save, .saveAddress, .call, .clearThrough:
321325
payload = .init(addr: addr)
322326

323327
case .splitSaving:

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,17 @@ extension Processor {
204204
}
205205
}
206206

207+
mutating func clearThrough(_ address: InstructionAddress) {
208+
while let sp = savePoints.popLast() {
209+
if sp.pc == address {
210+
controller.step()
211+
return
212+
}
213+
}
214+
// TODO: What should we do here?
215+
fatalError("Invalid code: Tried to clear save points when empty")
216+
}
217+
207218
mutating func cycle() {
208219
_checkInvariants()
209220
assert(state == .inProgress)
@@ -288,9 +299,13 @@ extension Processor {
288299
if let _ = savePoints.popLast() {
289300
controller.step()
290301
} else {
291-
fatalError("TODO: What should we do here?")
302+
// TODO: What should we do here?
303+
fatalError("Invalid code: Tried to clear save points when empty")
292304
}
293305

306+
case .clearThrough:
307+
clearThrough(payload.addr)
308+
294309
case .peek:
295310
fatalError()
296311

Tests/RegexBuilderTests/RegexDSLTests.swift

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,29 @@ class RegexDSLTests: XCTestCase {
467467
XCTAssertEqual("ab12".firstMatch(of: octoDecimalRegex)!.output.1, 61904)
468468
}
469469

470+
func testLocal() throws {
471+
try _testDSLCaptures(
472+
("aaaaa", nil),
473+
matchType: Substring.self, ==)
474+
{
475+
Local {
476+
OneOrMore("a")
477+
}
478+
"a"
479+
}
480+
481+
try _testDSLCaptures(
482+
("aa", "aa"),
483+
("aaa", nil),
484+
matchType: Substring.self, ==)
485+
{
486+
Local {
487+
OneOrMore("a", .reluctant)
488+
}
489+
"a"
490+
}
491+
}
492+
470493
func testAssertions() throws {
471494
try _testDSLCaptures(
472495
("aaaaab", "aaaaab"),

Tests/RegexTests/MatchTests.swift

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -891,8 +891,7 @@ extension RegexTests {
891891
input: "Price: 100 dollars", match: nil)
892892
firstMatchTest(
893893
#"(?=\d+ dollars)\d+"#,
894-
input: "Price: 100 dollars", match: "100",
895-
xfail: true) // TODO
894+
input: "Price: 100 dollars", match: "100")
896895

897896
firstMatchTest(
898897
#"\d+(*pla: dollars)"#,
@@ -917,6 +916,14 @@ extension RegexTests {
917916
#"\d+(*negative_lookahead: dollars)"#,
918917
input: "Price: 100 pesos", match: "100")
919918

919+
// More complex lookaheads
920+
firstMatchTests(
921+
#"(?=.*e)(?=.*o)(?!.*z)."#,
922+
(input: "hello", match: "h"),
923+
(input: "hzello", match: "e"),
924+
(input: "hezllo", match: nil),
925+
(input: "helloz", match: nil))
926+
920927
firstMatchTest(
921928
#"(?<=USD)\d+"#, input: "Price: USD100", match: "100", xfail: true)
922929
firstMatchTest(
@@ -1050,14 +1057,93 @@ extension RegexTests {
10501057
firstMatchTest(
10511058
#"(?:a|.b)c"#, input: "123abcacxyz", match: "abc")
10521059
firstMatchTest(
1053-
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac", xfail: true)
1060+
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac")
10541061
firstMatchTest(
1055-
"(*atomic:a|.b)c", input: "123abcacxyz", match: "ac", xfail: true)
1062+
"(*atomic:a|.b)c", input: "123abcacxyz", match: "ac")
10561063
firstMatchTest(
10571064
#"(?:a+)[a-z]c"#, input: "123aacacxyz", match: "aac")
10581065
firstMatchTest(
1059-
#"(?>a+)[a-z]c"#, input: "123aacacxyz", match: "ac", xfail: true)
1066+
#"(?>a+)[a-z]c"#, input: "123aacacxyz", match: nil)
1067+
1068+
// Atomicity should stay in the atomic group
1069+
firstMatchTest(
1070+
#"(?:(?>a)|.b)c"#, input: "123abcacxyz", match: "abc")
1071+
1072+
// Quantifier behavior inside atomic groups
1073+
1074+
// (?:a+?) matches as few 'a's as possible, after matching the first
1075+
// (?>a+?) always matches exactly one 'a'
1076+
firstMatchTests(
1077+
#"^(?:a+?)a$"#,
1078+
(input: "a", match: nil),
1079+
(input: "aa", match: "aa"),
1080+
(input: "aaa", match: "aaa"))
1081+
firstMatchTests(
1082+
#"^(?>a+?)a$"#,
1083+
(input: "a", match: nil),
1084+
(input: "aa", match: "aa"),
1085+
(input: "aaa", match: nil))
1086+
1087+
// (?:a?+) and (?>a?+) are equivalent: they match one 'a' if available
1088+
firstMatchTests(
1089+
#"^(?:a?+)a$"#,
1090+
(input: "a", match: nil),
1091+
xfail: true)
1092+
firstMatchTests(
1093+
#"^(?:a?+)a$"#,
1094+
(input: "aa", match: "aa"),
1095+
(input: "aaa", match: nil))
1096+
firstMatchTests(
1097+
#"^(?>a?+)a$"#,
1098+
(input: "a", match: nil),
1099+
(input: "aa", match: "aa"),
1100+
(input: "aaa", match: nil))
10601101

1102+
// Capture behavior in non-atomic vs atomic groups
1103+
firstMatchTests(
1104+
#"(\d+)\w+\1"#,
1105+
(input: "123x12", match: "123x12"), // `\w+` matches "3x" in this case
1106+
(input: "23x23", match: "23x23"),
1107+
(input: "123x23", match: "23x23"))
1108+
firstMatchTests(
1109+
#"(?>(\d+))\w+\1"#,
1110+
(input: "123x12", match: nil))
1111+
firstMatchTests(
1112+
#"(?>(\d+))\w+\1"#,
1113+
(input: "23x23", match: "23x23"),
1114+
(input: "123x23", match: "23x23"),
1115+
xfail: true)
1116+
1117+
// Backreferences in lookaheads
1118+
firstMatchTests(
1119+
#"^(?=.*(.)(.)\2\1).+$"#,
1120+
(input: "abbba", match: nil),
1121+
(input: "ABBA", match: "ABBA"),
1122+
(input: "defABBAdef", match: "defABBAdef"))
1123+
firstMatchTests(
1124+
#"^(?=.*(.)(.)\2\1).+\2$"#,
1125+
(input: "abbba", match: nil),
1126+
(input: "ABBA", match: nil),
1127+
(input: "defABBAdef", match: nil))
1128+
// FIXME: Backreferences don't escape positive lookaheads
1129+
firstMatchTests(
1130+
#"^(?=.*(.)(.)\2\1).+\2$"#,
1131+
(input: "ABBAB", match: "ABBAB"),
1132+
(input: "defABBAdefB", match: "defABBAdefB"),
1133+
xfail: true)
1134+
1135+
firstMatchTests(
1136+
#"^(?!.*(.)(.)\2\1).+$"#,
1137+
(input: "abbba", match: "abbba"),
1138+
(input: "ABBA", match: nil),
1139+
(input: "defABBAdef", match: nil))
1140+
// Backreferences don't escape negative lookaheads;
1141+
// matching only proceeds when the lookahead fails
1142+
firstMatchTests(
1143+
#"^(?!.*(.)(.)\2\1).+\2$"#,
1144+
(input: "abbba", match: nil),
1145+
(input: "abbbab", match: nil),
1146+
(input: "ABBAB", match: nil))
10611147

10621148
// TODO: Test example where non-atomic is significant
10631149
firstMatchTest(

Tests/RegexTests/ParseTests.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -950,10 +950,10 @@ extension RegexTests {
950950
concat("a", nonCaptureReset("b"), "c"), throwsError: .unsupported)
951951
parseTest(
952952
#"a(?>b)c"#,
953-
concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported)
953+
concat("a", atomicNonCapturing("b"), "c"))
954954
parseTest(
955955
"a(*atomic:b)c",
956-
concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported)
956+
concat("a", atomicNonCapturing("b"), "c"))
957957

958958
parseTest("a(?=b)c", concat("a", lookahead("b"), "c"))
959959
parseTest("a(*pla:b)c", concat("a", lookahead("b"), "c"))

0 commit comments

Comments
 (0)