Skip to content

Commit 99d01fc

Browse files
committed
Fully generalize "whole match" in the engine and enable transforming custom types
* Track the whole match as an element of the "capture list" in the matching engine. Do so by emitting code as an implicit `capture` around the root node.
* No longer handle `matcher` as a special case within `capture` lowering, because the matcher can be arbitrarily nested within "output-forwarding" nodes, such as a `changeMatchingOptions` non-capturing group. Instead, make the bytecode emitter carry a result value so that a custom output can be propagated through any forwarding nodes.

  ```swift
  Regex {
    Capture(
      SemanticVersionParser()
        .ignoringCase()
        .matchingSemantics(.unicodeScalar)
    ) // This would not work previously.
  }
  ```
* Collapse DSLTree node `transform` into `capture`, because a transform can never be standalone (without a `capture` parent). This greatly simplifies `capture` lowering.
* Make the bytecode's capture transform use type `(Input, _StoredCapture) -> Any` so that it can transform any whole match, not just `Substring`. This means you can now transform any captured value, including a custom-consuming regex component's result!

  ```swift
  Regex {
    "version:"
    OneOrMore(.whitespace)
    Capture {
      SemanticVersionParser() // Regex<SemanticVersion>
    } transform: {
      // (SemanticVersion) -> SomethingElse
    }
  }
  ```

  The transforms of `Capture` and `TryCapture` are now generalized from taking `Substring` to taking generic parameter `W` (the whole match).
1 parent 5e05b61 commit 99d01fc

23 files changed

+623
-899
lines changed

Sources/RegexBuilder/Variadics.swift

Lines changed: 308 additions & 528 deletions
Large diffs are not rendered by default.

Sources/VariadicsGenerator/VariadicsGenerator.swift

Lines changed: 28 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -646,28 +646,23 @@ struct VariadicsGenerator: ParsableCommand {
646646
\(disfavored)\
647647
public init<\(genericParams), NewCapture>(
648648
_ component: R,
649-
transform: @escaping (Substring) throws -> NewCapture
649+
transform: @escaping @Sendable (W) throws -> NewCapture
650650
) \(whereClauseTransformed) {
651-
self.init(node: .capture(.transform(
652-
CaptureTransform(resultType: NewCapture.self) {
653-
try transform($0) as Any
654-
},
655-
component.regex.root)))
651+
self.init(node: .capture(
652+
component.regex.root,
653+
CaptureTransform(transform)))
656654
}
657655
658656
\(disfavored)\
659657
public init<\(genericParams), NewCapture>(
660658
_ component: R,
661659
as reference: Reference<NewCapture>,
662-
transform: @escaping (Substring) throws -> NewCapture
660+
transform: @escaping @Sendable (W) throws -> NewCapture
663661
) \(whereClauseTransformed) {
664662
self.init(node: .capture(
665663
reference: reference.id,
666-
.transform(
667-
CaptureTransform(resultType: NewCapture.self) {
668-
try transform($0) as Any
669-
},
670-
component.regex.root)))
664+
component.regex.root,
665+
CaptureTransform(transform)))
671666
}
672667
}
673668
@@ -676,28 +671,23 @@ struct VariadicsGenerator: ParsableCommand {
676671
\(disfavored)\
677672
public init<\(genericParams), NewCapture>(
678673
_ component: R,
679-
transform: @escaping (Substring) throws -> NewCapture?
674+
transform: @escaping @Sendable (W) throws -> NewCapture?
680675
) \(whereClauseTransformed) {
681-
self.init(node: .capture(.transform(
682-
CaptureTransform(resultType: NewCapture.self) {
683-
try transform($0) as Any?
684-
},
685-
component.regex.root)))
676+
self.init(node: .capture(
677+
component.regex.root,
678+
CaptureTransform(transform)))
686679
}
687680
688681
\(disfavored)\
689682
public init<\(genericParams), NewCapture>(
690683
_ component: R,
691684
as reference: Reference<NewCapture>,
692-
transform: @escaping (Substring) throws -> NewCapture?
685+
transform: @escaping @Sendable (W) throws -> NewCapture?
693686
) \(whereClauseTransformed) {
694687
self.init(node: .capture(
695688
reference: reference.id,
696-
.transform(
697-
CaptureTransform(resultType: NewCapture.self) {
698-
try transform($0) as Any?
699-
},
700-
component.regex.root)))
689+
component.regex.root,
690+
CaptureTransform(transform)))
701691
}
702692
}
703693
@@ -725,28 +715,23 @@ struct VariadicsGenerator: ParsableCommand {
725715
\(disfavored)\
726716
public init<\(genericParams), NewCapture>(
727717
@\(concatBuilderName) _ component: () -> R,
728-
transform: @escaping (Substring) throws -> NewCapture
718+
transform: @escaping @Sendable (W) throws -> NewCapture
729719
) \(whereClauseTransformed) {
730-
self.init(node: .capture(.transform(
731-
CaptureTransform(resultType: NewCapture.self) {
732-
try transform($0) as Any
733-
},
734-
component().regex.root)))
720+
self.init(node: .capture(
721+
component().regex.root,
722+
CaptureTransform(transform)))
735723
}
736724
737725
\(disfavored)\
738726
public init<\(genericParams), NewCapture>(
739727
as reference: Reference<NewCapture>,
740728
@\(concatBuilderName) _ component: () -> R,
741-
transform: @escaping (Substring) throws -> NewCapture
729+
transform: @escaping @Sendable (W) throws -> NewCapture
742730
) \(whereClauseTransformed) {
743731
self.init(node: .capture(
744732
reference: reference.id,
745-
.transform(
746-
CaptureTransform(resultType: NewCapture.self) {
747-
try transform($0) as Any
748-
},
749-
component().regex.root)))
733+
component().regex.root,
734+
CaptureTransform(transform)))
750735
}
751736
}
752737
@@ -755,28 +740,23 @@ struct VariadicsGenerator: ParsableCommand {
755740
\(disfavored)\
756741
public init<\(genericParams), NewCapture>(
757742
@\(concatBuilderName) _ component: () -> R,
758-
transform: @escaping (Substring) throws -> NewCapture?
743+
transform: @escaping @Sendable (W) throws -> NewCapture?
759744
) \(whereClauseTransformed) {
760-
self.init(node: .capture(.transform(
761-
CaptureTransform(resultType: NewCapture.self) {
762-
try transform($0) as Any?
763-
},
764-
component().regex.root)))
745+
self.init(node: .capture(
746+
component().regex.root,
747+
CaptureTransform(transform)))
765748
}
766749
767750
\(disfavored)\
768751
public init<\(genericParams), NewCapture>(
769752
as reference: Reference<NewCapture>,
770753
@\(concatBuilderName) _ component: () -> R,
771-
transform: @escaping (Substring) throws -> NewCapture?
754+
transform: @escaping @Sendable (W) throws -> NewCapture?
772755
) \(whereClauseTransformed) {
773756
self.init(node: .capture(
774757
reference: reference.id,
775-
.transform(
776-
CaptureTransform(resultType: NewCapture.self) {
777-
try transform($0) as Any?
778-
},
779-
component().regex.root)))
758+
component().regex.root,
759+
CaptureTransform(transform)))
780760
}
781761
}
782762

Sources/_RegexParser/Regex/Parse/CaptureList.swift

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@ public struct CaptureList {
2424
extension CaptureList {
2525
public struct Capture {
2626
public var name: String?
27-
public var type: Any.Type?
27+
public var type: Any.Type
2828
public var optionalDepth: Int
2929
public var location: SourceLocation
3030

3131
public init(
3232
name: String? = nil,
33-
type: Any.Type? = nil,
33+
type: Any.Type = Substring.self,
3434
optionalDepth: Int,
3535
_ location: SourceLocation
3636
) {
@@ -125,6 +125,7 @@ extension AST.Node {
125125

126126
public var _captureList: CaptureList {
127127
var caps = CaptureList()
128+
caps.append(.init(optionalDepth: 0, .fake))
128129
self._addCaptures(to: &caps, optionalNesting: 0)
129130
return caps
130131
}
@@ -151,12 +152,7 @@ extension CaptureList: Equatable {}
151152

152153
extension CaptureList.Capture: CustomStringConvertible {
153154
public var description: String {
154-
let typeStr: String
155-
if let ty = type {
156-
typeStr = "\(ty)"
157-
} else {
158-
typeStr = "Substring"
159-
}
155+
let typeStr = String(describing: type)
160156
let suffix = String(repeating: "?", count: optionalDepth)
161157
return typeStr + suffix
162158
}

Sources/_RegexParser/Regex/Parse/CaptureStructure.swift

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -246,10 +246,7 @@ extension CaptureList {
246246
extension CaptureList.Capture {
247247
func _captureStructure(nestOptionals: Bool) -> CaptureStructure {
248248
if optionalDepth == 0 {
249-
if let ty = type {
250-
return .atom(name: name, type: .init(ty))
251-
}
252-
return .atom(name: name)
249+
return .atom(name: name, type: type == Substring.self ? nil : .init(type))
253250
}
254251
var copy = self
255252
copy.optionalDepth = 0

Sources/_RegexParser/Regex/Parse/Sema.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ extension RegexValidator {
7777
}
7878
switch ref.kind {
7979
case .absolute(let i):
80-
guard i <= captures.captures.count else {
80+
guard i < captures.captures.count else {
8181
throw error(.invalidReference(i), at: ref.innerLoc)
8282
}
8383
case .named(let name):

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 45 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,20 @@ extension Compiler {
99
self.options = options
1010
self.builder.captureList = captureList
1111
}
12-
13-
mutating func finish(
14-
) throws -> Program {
15-
builder.buildAccept()
16-
return try builder.assemble()
17-
}
1812
}
1913
}
2014

2115
extension Compiler.ByteCodeGen {
16+
mutating func emitRoot(_ root: DSLTree.Node) throws -> Program {
17+
// The whole match (`.0` element of output) is equivalent to an implicit
18+
// capture over the entire regex.
19+
try emitNode(.capture(name: nil, reference: nil, root))
20+
builder.buildAccept()
21+
return try builder.assemble()
22+
}
23+
}
24+
25+
fileprivate extension Compiler.ByteCodeGen {
2226
mutating func emitAtom(_ a: DSLTree.Atom) throws {
2327
switch a {
2428
case .any:
@@ -65,8 +69,7 @@ extension Compiler.ByteCodeGen {
6569

6670
switch ref.kind {
6771
case .absolute(let i):
68-
// Backreferences number starting at 1
69-
builder.buildBackreference(.init(i-1))
72+
builder.buildBackreference(.init(i))
7073
case .named(let name):
7174
try builder.buildNamedReference(name)
7275
case .relative:
@@ -329,9 +332,8 @@ extension Compiler.ByteCodeGen {
329332
}
330333

331334
mutating func emitMatcher(
332-
_ matcher: @escaping _MatcherInterface,
333-
into capture: CaptureRegister? = nil
334-
) {
335+
_ matcher: @escaping _MatcherInterface
336+
) -> ValueRegister {
335337

336338
// TODO: Consider emitting consumer interface if
337339
// not captured. This may mean we should store
@@ -343,26 +345,7 @@ extension Compiler.ByteCodeGen {
343345

344346
let valReg = builder.makeValueRegister()
345347
builder.buildMatcher(matcher, into: valReg)
346-
347-
// TODO: Instruction to store directly
348-
if let cap = capture {
349-
builder.buildMove(valReg, into: cap)
350-
}
351-
}
352-
353-
mutating func emitTransform(
354-
_ t: CaptureTransform,
355-
_ child: DSLTree.Node,
356-
into cap: CaptureRegister
357-
) throws {
358-
let transform = builder.makeTransformFunction {
359-
input, range in
360-
try t(input[range])
361-
}
362-
builder.buildBeginCapture(cap)
363-
try emitNode(child)
364-
builder.buildEndCapture(cap)
365-
builder.buildTransformCapture(cap, transform)
348+
return valReg
366349
}
367350

368351
mutating func emitNoncapturingGroup(
@@ -612,7 +595,8 @@ extension Compiler.ByteCodeGen {
612595
builder.buildConsume(by: consumer)
613596
}
614597

615-
mutating func emitNode(_ node: DSLTree.Node) throws {
598+
@discardableResult
599+
mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? {
616600
switch node {
617601

618602
case let .orderedChoice(children):
@@ -623,20 +607,34 @@ extension Compiler.ByteCodeGen {
623607
try emitConcatenationComponent(child)
624608
}
625609

626-
case let .capture(name, refId, child):
610+
case let .capture(name, refId, child, transform):
627611
options.beginScope()
628612
defer { options.endScope() }
629613

630614
let cap = builder.makeCapture(id: refId, name: name)
631-
switch child {
632-
case let .matcher(_, m):
633-
emitMatcher(m, into: cap)
634-
case let .transform(t, child):
635-
try emitTransform(t, child, into: cap)
636-
default:
637-
builder.buildBeginCapture(cap)
638-
try emitNode(child)
639-
builder.buildEndCapture(cap)
615+
builder.buildBeginCapture(cap)
616+
let value = try emitNode(child)
617+
builder.buildEndCapture(cap)
618+
// If the child node produced a custom capture value, e.g. the result of
619+
// a matcher, this should override the captured substring.
620+
if let value {
621+
builder.buildMove(value, into: cap)
622+
}
623+
// If there's a capture transform, apply it now.
624+
if let transform = transform {
625+
let fn = builder.makeTransformFunction { input, storedCapture in
626+
// If it's a substring capture with no custom value, apply the
627+
// transform directly to the substring to avoid existential traffic.
628+
if let cap = storedCapture.latest, cap.value == nil {
629+
return try transform(input[cap.range])
630+
}
631+
let value = constructExistentialOutputComponent(
632+
from: input,
633+
component: storedCapture.latest,
634+
optionalCount: 0)
635+
return try transform(value)
636+
}
637+
builder.buildTransformCapture(cap, fn)
640638
}
641639

642640
case let .nonCapturingGroup(kind, child):
@@ -704,29 +702,25 @@ extension Compiler.ByteCodeGen {
704702
}
705703

706704
case let .regexLiteral(l):
707-
try emitNode(l.ast.dslTreeNode)
705+
return try emitNode(l.ast.dslTreeNode)
708706

709707
case let .convertedRegexLiteral(n, _):
710-
try emitNode(n)
708+
return try emitNode(n)
711709

712710
case .absentFunction:
713711
throw Unsupported("absent function")
714712
case .consumer:
715713
throw Unsupported("consumer")
716714

717715
case let .matcher(_, f):
718-
emitMatcher(f)
719-
720-
case .transform:
721-
throw Unreachable(
722-
"Transforms only directly inside captures")
716+
return emitMatcher(f)
723717

724718
case .characterPredicate:
725719
throw Unsupported("character predicates")
726720

727721
case .trivia, .empty:
728-
return
722+
break
729723
}
724+
return nil
730725
}
731726
}
732-

0 commit comments

Comments
 (0)