Skip to content

[5.7] Implement atomic non-capturing groups #523

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions Sources/_RegexParser/Regex/Parse/Sema.swift
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,8 @@ extension RegexValidator {
func validateGroup(_ group: AST.Group) throws {
let kind = group.kind
switch kind.value {
case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead:
case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead,
.atomicNonCapturing:
break

case .balancedCapture:
Expand All @@ -336,9 +337,6 @@ extension RegexValidator {
// We need to figure out how these interact with typed captures.
throw error(.unsupported("branch reset group"), at: kind.location)

case .atomicNonCapturing:
throw error(.unsupported("atomic group"), at: kind.location)

case .nonAtomicLookahead:
throw error(.unsupported("non-atomic lookahead"), at: kind.location)

Expand Down
39 changes: 37 additions & 2 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ fileprivate extension Compiler.ByteCodeGen {
save(restoringAt: success)
save(restoringAt: intercept)
<sub-pattern> // failure restores at intercept
clearSavePoint // remove intercept
clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
<if negative>:
clearSavePoint // remove success
fail // positive->success, negative propagates
Expand All @@ -324,7 +324,7 @@ fileprivate extension Compiler.ByteCodeGen {
builder.buildSave(success)
builder.buildSave(intercept)
try emitNode(child)
builder.buildClear()
builder.buildClearThrough(intercept)
if !positive {
builder.buildClear()
}
Expand All @@ -339,6 +339,38 @@ fileprivate extension Compiler.ByteCodeGen {
builder.label(success)
}

/// Emits the instruction sequence for an atomic non-capturing group
/// wrapping `child`.
///
/// Two save points bracket the sub-pattern. Once the sub-pattern has been
/// emitted, a `clearThrough` removes the inner save point together with
/// anything the sub-pattern left behind, so later failures cannot resume
/// inside the group.
///
/// - Parameter child: The node forming the body of the group.
/// - Throws: Whatever `emitNode(child)` throws.
mutating func emitAtomicNoncapturingGroup(
  _ child: DSLTree.Node
) throws {
  // Layout of the emitted code:
  //
  //     save(continuingAt: done)
  //     save(restoringAt: bail)
  //     <sub-pattern>          // a failure in here restores at `bail`
  //     clearThrough(bail)     // drop `bail` and any leftovers from <sub-pattern>
  //     fail                   // -> done
  //   bail:
  //     clearSavePoint         // drop `done`
  //     fail                   // propagate the failure
  //   done:
  //     ...

  // Addresses are requested in the same order as before: failure path
  // first, then the continuation.
  let bail = builder.makeAddress()
  let done = builder.makeAddress()

  // Prologue: outer continuation save point, then the inner intercept.
  builder.buildSaveAddress(done)
  builder.buildSave(bail)

  // Body of the group.
  try emitNode(child)

  // Sub-pattern matched: discard everything through `bail`, then `fail`
  // so execution resumes via the remaining `done` save point.
  builder.buildClearThrough(bail)
  builder.buildFail()

  // Sub-pattern failed: remove `done` and let the failure propagate.
  builder.label(bail)
  builder.buildClear()
  builder.buildFail()

  builder.label(done)
}

mutating func emitMatcher(
_ matcher: @escaping _MatcherInterface
) -> ValueRegister {
Expand Down Expand Up @@ -384,6 +416,9 @@ fileprivate extension Compiler.ByteCodeGen {
}
options.apply(optionSequence)
try emitNode(child)

case .atomicNonCapturing:
try emitAtomicNoncapturingGroup(child)

default:
// FIXME: Other kinds...
Expand Down
7 changes: 7 additions & 0 deletions Sources/_StringProcessing/Engine/Instruction.swift
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,13 @@ extension Instruction {
/// Precondition: There is a save point to remove
case clear

/// Remove save points up to and including the operand
///
/// Operand: instruction address to look for
///
/// Precondition: The operand is in the save point list
case clearThrough

/// View the most recently saved point
///
/// UNIMPLEMENTED
Expand Down
6 changes: 5 additions & 1 deletion Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,10 @@ extension MEProgram.Builder {
/// Appends a `.clear` instruction to the program being built.
///
/// Per the instruction's own documentation, `.clear` requires that there is
/// a save point to remove when it executes.
mutating func buildClear() {
instructions.append(.init(.clear))
}
/// Appends a `.clearThrough` instruction whose operand is the address
/// identified by `t`.
///
/// `.clearThrough` removes save points up to and including the one at the
/// operand address; that address must be in the save point list when the
/// instruction executes.
///
/// - Parameter t: Token for the instruction address to clear through.
mutating func buildClearThrough(_ t: AddressToken) {
instructions.append(.init(.clearThrough))
// Must follow the append: registers a fixup for the instruction just added
// so its address payload is filled in later.
// NOTE(review): assumes `fixup(to:)` targets the most recently appended
// instruction — confirm against its definition.
fixup(to: t)
}
/// Appends a `.restore` instruction to the program being built.
mutating func buildRestore() {
instructions.append(.init(.restore))
}
Expand Down Expand Up @@ -317,7 +321,7 @@ extension MEProgram.Builder {
case .condBranchZeroElseDecrement:
payload = .init(addr: addr, int: inst.payload.int)

case .branch, .save, .saveAddress, .call:
case .branch, .save, .saveAddress, .call, .clearThrough:
payload = .init(addr: addr)

case .splitSaving:
Expand Down
17 changes: 16 additions & 1 deletion Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,17 @@ extension Processor {
}
}

/// Discards save points from the top of the stack until the one whose `pc`
/// equals `address` has been removed, then steps the controller.
///
/// Every save point sitting above the target is dropped along with it.
///
/// - Parameter address: Instruction address identifying the save point to
///   clear through.
/// - Precondition: A save point with `pc == address` is in the save point
///   list. If none is found the whole list has been drained and the
///   program traps.
mutating func clearThrough(_ address: InstructionAddress) {
  while let sp = savePoints.popLast() {
    if sp.pc == address {
      controller.step()
      return
    }
  }
  // Reaching here means every save point was popped without finding
  // `address`. That is a missing target, which is not the same as calling
  // `clear` on an empty list, so the diagnostic says so.
  // TODO: What should we do here?
  fatalError(
    "Invalid code: Tried to clear through a save point that does not exist")
}

mutating func cycle() {
_checkInvariants()
assert(state == .inProgress)
Expand Down Expand Up @@ -323,9 +334,13 @@ extension Processor {
if let _ = savePoints.popLast() {
controller.step()
} else {
fatalError("TODO: What should we do here?")
// TODO: What should we do here?
fatalError("Invalid code: Tried to clear save points when empty")
}

case .clearThrough:
clearThrough(payload.addr)

case .peek:
fatalError()

Expand Down
23 changes: 23 additions & 0 deletions Tests/RegexBuilderTests/RegexDSLTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,29 @@ class RegexDSLTests: XCTestCase {
XCTAssertEqual("ab12".firstMatch(of: octoDecimalRegex)!.output.1, 61904)
}

// Checks `Local { ... }` followed by a literal "a", once with an eager and
// once with a reluctant `OneOrMore` inside the group.
// NOTE(review): `Local` is presumably the builder spelling of an atomic
// group; the expectations mirror the `(?>a+?)a` regex-literal tests in this
// change — confirm.
func testLocal() throws {
// Eager quantifier inside `Local`: "aaaaa" is expected to yield no match.
// Presumably the group takes every "a" and cannot give one back for the
// trailing literal — confirm against the matcher's semantics.
try _testDSLCaptures(
("aaaaa", nil),
matchType: Substring.self, ==)
{
Local {
OneOrMore("a")
}
"a"
}

// Reluctant quantifier inside `Local`: "aa" is expected to match in full,
// while "aaa" is expected to yield no match.
try _testDSLCaptures(
("aa", "aa"),
("aaa", nil),
matchType: Substring.self, ==)
{
Local {
OneOrMore("a", .reluctant)
}
"a"
}
}

func testAssertions() throws {
try _testDSLCaptures(
("aaaaab", "aaaaab"),
Expand Down
96 changes: 91 additions & 5 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -891,8 +891,7 @@ extension RegexTests {
input: "Price: 100 dollars", match: nil)
firstMatchTest(
#"(?=\d+ dollars)\d+"#,
input: "Price: 100 dollars", match: "100",
xfail: true) // TODO
input: "Price: 100 dollars", match: "100")

firstMatchTest(
#"\d+(*pla: dollars)"#,
Expand All @@ -917,6 +916,14 @@ extension RegexTests {
#"\d+(*negative_lookahead: dollars)"#,
input: "Price: 100 pesos", match: "100")

// More complex lookaheads
firstMatchTests(
#"(?=.*e)(?=.*o)(?!.*z)."#,
(input: "hello", match: "h"),
(input: "hzello", match: "e"),
(input: "hezllo", match: nil),
(input: "helloz", match: nil))

firstMatchTest(
#"(?<=USD)\d+"#, input: "Price: USD100", match: "100", xfail: true)
firstMatchTest(
Expand Down Expand Up @@ -1050,14 +1057,93 @@ extension RegexTests {
firstMatchTest(
#"(?:a|.b)c"#, input: "123abcacxyz", match: "abc")
firstMatchTest(
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac", xfail: true)
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac")
firstMatchTest(
"(*atomic:a|.b)c", input: "123abcacxyz", match: "ac", xfail: true)
"(*atomic:a|.b)c", input: "123abcacxyz", match: "ac")
firstMatchTest(
#"(?:a+)[a-z]c"#, input: "123aacacxyz", match: "aac")
firstMatchTest(
#"(?>a+)[a-z]c"#, input: "123aacacxyz", match: "ac", xfail: true)
#"(?>a+)[a-z]c"#, input: "123aacacxyz", match: nil)

// Atomicity should stay in the atomic group
firstMatchTest(
#"(?:(?>a)|.b)c"#, input: "123abcacxyz", match: "abc")

// Quantifier behavior inside atomic groups

// (?:a+?) matches as few 'a's as possible, after matching the first
// (?>a+?) always matches exactly one 'a'
firstMatchTests(
#"^(?:a+?)a$"#,
(input: "a", match: nil),
(input: "aa", match: "aa"),
(input: "aaa", match: "aaa"))
firstMatchTests(
#"^(?>a+?)a$"#,
(input: "a", match: nil),
(input: "aa", match: "aa"),
(input: "aaa", match: nil))

// (?:a?+) and (?>a?+) are equivalent: they match one 'a' if available
firstMatchTests(
#"^(?:a?+)a$"#,
(input: "a", match: nil),
xfail: true)
firstMatchTests(
#"^(?:a?+)a$"#,
(input: "aa", match: "aa"),
(input: "aaa", match: nil))
firstMatchTests(
#"^(?>a?+)a$"#,
(input: "a", match: nil),
(input: "aa", match: "aa"),
(input: "aaa", match: nil))

// Capture behavior in non-atomic vs atomic groups
firstMatchTests(
#"(\d+)\w+\1"#,
(input: "123x12", match: "123x12"), // `\w+` matches "3x" in this case
(input: "23x23", match: "23x23"),
(input: "123x23", match: "23x23"))
firstMatchTests(
#"(?>(\d+))\w+\1"#,
(input: "123x12", match: nil))
firstMatchTests(
#"(?>(\d+))\w+\1"#,
(input: "23x23", match: "23x23"),
(input: "123x23", match: "23x23"),
xfail: true)

// Backreferences in lookaheads
firstMatchTests(
#"^(?=.*(.)(.)\2\1).+$"#,
(input: "abbba", match: nil),
(input: "ABBA", match: "ABBA"),
(input: "defABBAdef", match: "defABBAdef"))
firstMatchTests(
#"^(?=.*(.)(.)\2\1).+\2$"#,
(input: "abbba", match: nil),
(input: "ABBA", match: nil),
(input: "defABBAdef", match: nil))
// FIXME: Backreferences don't escape positive lookaheads
firstMatchTests(
#"^(?=.*(.)(.)\2\1).+\2$"#,
(input: "ABBAB", match: "ABBAB"),
(input: "defABBAdefB", match: "defABBAdefB"),
xfail: true)

firstMatchTests(
#"^(?!.*(.)(.)\2\1).+$"#,
(input: "abbba", match: "abbba"),
(input: "ABBA", match: nil),
(input: "defABBAdef", match: nil))
// Backreferences don't escape negative lookaheads;
// matching only proceeds when the lookahead fails
firstMatchTests(
#"^(?!.*(.)(.)\2\1).+\2$"#,
(input: "abbba", match: nil),
(input: "abbbab", match: nil),
(input: "ABBAB", match: nil))

// TODO: Test example where non-atomic is significant
firstMatchTest(
Expand Down
4 changes: 2 additions & 2 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -979,10 +979,10 @@ extension RegexTests {
concat("a", nonCaptureReset("b"), "c"), throwsError: .unsupported)
parseTest(
#"a(?>b)c"#,
concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported)
concat("a", atomicNonCapturing("b"), "c"))
parseTest(
"a(*atomic:b)c",
concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported)
concat("a", atomicNonCapturing("b"), "c"))

parseTest("a(?=b)c", concat("a", lookahead("b"), "c"))
parseTest("a(*pla:b)c", concat("a", lookahead("b"), "c"))
Expand Down