Skip to content

Commit a4d7be0

Browse files
authored
Keep track of initial options in compiled program (#412)
The initial options are stored in the lowered program and include all options that are set before the first attempted match. Note that not all initial options are global: a leading option-setting group is included in the initial options even though it applies only to a portion of the overall regex. The initial options are used to decide how a search advances through the input. Previously, searching via firstMatch or matches(of:) would only _start_ match attempts at character (grapheme cluster) boundaries, even when the regex had Unicode scalar semantics; now, when the initial semantic level is Unicode scalar, the search advances one Unicode scalar at a time.
1 parent 4f8f67a commit a4d7be0

File tree

10 files changed

+102
-16
lines changed

10 files changed

+102
-16
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ extension Compiler.ByteCodeGen {
3535
builder.buildUnresolvedReference(id: id)
3636

3737
case let .changeMatchingOptions(optionSequence):
38+
if !builder.hasReceivedInstructions {
39+
builder.initialOptions.apply(optionSequence.ast)
40+
}
3841
options.apply(optionSequence.ast)
3942

4043
case let .unconverted(astAtom):
@@ -379,6 +382,9 @@ extension Compiler.ByteCodeGen {
379382
throw Unreachable("These should produce a capture node")
380383

381384
case .changeMatchingOptions(let optionSequence):
385+
if !builder.hasReceivedInstructions {
386+
builder.initialOptions.apply(optionSequence)
387+
}
382388
options.apply(optionSequence)
383389
try emitNode(child)
384390

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ extension MEProgram where Input.Element: Hashable {
3939
var failAddressToken: AddressToken? = nil
4040

4141
var captureList = CaptureList()
42+
var initialOptions = MatchingOptions()
4243

4344
// Symbolic reference resolution
4445
var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:]
@@ -77,6 +78,11 @@ extension MEProgram.Builder {
7778
var lastInstructionAddress: InstructionAddress {
7879
.init(instructions.endIndex - 1)
7980
}
81+
82+
/// `true` if the builder has received any instructions.
83+
var hasReceivedInstructions: Bool {
84+
!instructions.isEmpty
85+
}
8086

8187
mutating func buildNop(_ r: StringRegister? = nil) {
8288
instructions.append(.init(.nop, .init(optionalString: r)))
@@ -353,7 +359,8 @@ extension MEProgram.Builder {
353359
registerInfo: regInfo,
354360
captureList: captureList,
355361
referencedCaptureOffsets: referencedCaptureOffsets,
356-
namedCaptureOffsets: namedCaptureOffsets)
362+
namedCaptureOffsets: namedCaptureOffsets,
363+
initialOptions: initialOptions)
357364
}
358365

359366
mutating func reset() { self = Self() }

Sources/_StringProcessing/Engine/MEProgram.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ struct MEProgram<Input: Collection> where Input.Element: Equatable {
3737
let captureList: CaptureList
3838
let referencedCaptureOffsets: [ReferenceID: Int]
3939
let namedCaptureOffsets: [String: Int]
40+
41+
var initialOptions: MatchingOptions
4042
}
4143

4244
extension MEProgram: CustomStringConvertible {

Sources/_StringProcessing/MatchingOptions.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ extension MatchingOptions {
5454
stack[stack.count - 1].apply(sequence)
5555
_invariantCheck()
5656
}
57+
58+
// @testable
59+
/// Returns true if the options at the top of `stack` are equal to those
60+
/// for `other`.
61+
func _equal(to other: MatchingOptions) -> Bool {
62+
stack.last == other.stack.last
63+
}
5764
}
5865

5966
// MARK: Matching behavior API
@@ -127,6 +134,7 @@ extension MatchingOptions {
127134
}
128135
}
129136

137+
// MARK: - Implementation
130138
extension MatchingOptions {
131139
/// An option that changes the behavior of a regular expression.
132140
fileprivate enum Option: Int {

Sources/_StringProcessing/Regex/ASTConversion.swift

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,7 @@
1313

1414
extension AST {
1515
var dslTree: DSLTree {
16-
return DSLTree(
17-
root.dslTreeNode, options: globalOptions?.dslTreeOptions)
18-
}
19-
}
20-
21-
extension AST.GlobalMatchingOptionSequence {
22-
var dslTreeOptions: DSLTree.Options {
23-
// TODO: map options
24-
return .init()
16+
return DSLTree(root.dslTreeNode)
2517
}
2618
}
2719

Sources/_StringProcessing/Regex/Core.swift

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,18 @@ extension Regex {
9191
self.tree = tree
9292
}
9393
}
94+
95+
/// The set of matching options that applies to the start of this regex.
96+
///
97+
/// Note that the initial options may not apply to the entire regex. For
98+
/// example, in this regex, only case insensitivity (`i`) and Unicode scalar
99+
/// semantics (set by API) apply to the entire regex, while ASCII character
100+
/// classes (`P`) is part of `initialOptions` but not global:
101+
///
102+
///     let regex = /(?i)(?P:\d+\s*)abc/.matchingSemantics(.unicodeScalar)
103+
var initialOptions: MatchingOptions {
104+
program.loweredProgram.initialOptions
105+
}
94106
}
95107

96108
@available(SwiftStdlib 5.7, *)
@@ -102,6 +114,6 @@ extension Regex {
102114

103115
@_spi(RegexBuilder)
104116
public init(node: DSLTree.Node) {
105-
self.program = Program(tree: .init(node, options: nil))
117+
self.program = Program(tree: .init(node))
106118
}
107119
}

Sources/_StringProcessing/Regex/DSLTree.swift

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,9 @@
1414
@_spi(RegexBuilder)
1515
public struct DSLTree {
1616
var root: Node
17-
var options: Options?
1817

19-
init(_ r: Node, options: Options?) {
18+
init(_ r: Node) {
2019
self.root = r
21-
self.options = options
2220
}
2321
}
2422

Sources/_StringProcessing/Regex/Match.swift

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,12 @@ extension Regex {
158158
if let m = try _match(input, in: low..<high, mode: .partialFromFront) {
159159
return m
160160
}
161-
if low == high { return nil }
162-
input.formIndex(after: &low)
161+
if low >= high { return nil }
162+
if regex.initialOptions.semanticLevel == .graphemeCluster {
163+
input.formIndex(after: &low)
164+
} else {
165+
input.unicodeScalars.formIndex(after: &low)
166+
}
163167
}
164168
}
165169
}

Tests/RegexTests/CompileTests.swift

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,51 @@ extension RegexTests {
8888
try testCompilationEquivalence(row)
8989
}
9090
}
91+
92+
func testCompileInitialOptions() throws {
93+
func expectInitialOptions<T>(
94+
_ regex: Regex<T>,
95+
_ optionSequence: AST.MatchingOptionSequence,
96+
file: StaticString = #file,
97+
line: UInt = #line
98+
) throws {
99+
var options = MatchingOptions()
100+
options.apply(optionSequence)
101+
102+
XCTAssertTrue(
103+
regex.program.loweredProgram.initialOptions._equal(to: options),
104+
file: file, line: line)
105+
}
106+
107+
func expectInitialOptions(
108+
_ pattern: String,
109+
_ optionSequence: AST.MatchingOptionSequence,
110+
file: StaticString = #file,
111+
line: UInt = #line
112+
) throws {
113+
let regex = try Regex(pattern)
114+
try expectInitialOptions(regex, optionSequence, file: file, line: line)
115+
}
116+
117+
try expectInitialOptions(".", matchingOptions())
118+
try expectInitialOptions("(?i)(?-i).", matchingOptions())
119+
120+
try expectInitialOptions("(?i).", matchingOptions(adding: [.caseInsensitive]))
121+
try expectInitialOptions("(?i).(?-i)", matchingOptions(adding: [.caseInsensitive]))
122+
123+
try expectInitialOptions(
124+
"(?im)(?s).",
125+
matchingOptions(adding: [.caseInsensitive, .multiline, .singleLine]))
126+
try expectInitialOptions(".", matchingOptions())
127+
try expectInitialOptions(
128+
"(?im)(?s).(?u)",
129+
matchingOptions(adding: [.caseInsensitive, .multiline, .singleLine]))
130+
131+
try expectInitialOptions(
132+
"(?i:.)",
133+
matchingOptions(adding: [.caseInsensitive]))
134+
try expectInitialOptions(
135+
"(?i:.)(?m:.)",
136+
matchingOptions(adding: [.caseInsensitive]))
137+
}
91138
}

Tests/RegexTests/MatchTests.swift

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1618,5 +1618,15 @@ extension RegexTests {
16181618

16191619
// TODO: Add test for grapheme boundaries at start/end of match
16201620

1621+
func testCase() {
1622+
let regex = try! Regex(#".\N{SPARKLING HEART}."#)
1623+
let input = "🧟‍♀️💖🧠 or 🧠💖☕️"
1624+
let characterMatches = input.matches(of: regex)
1625+
XCTAssertEqual(characterMatches.map { $0.0 }, ["🧟‍♀️💖🧠", "🧠💖☕️"])
1626+
1627+
let scalarMatches = input.matches(of: regex.matchingSemantics(.unicodeScalar))
1628+
let scalarExpected: [Substring] = ["\u{FE0F}💖🧠", "🧠💖☕"]
1629+
XCTAssertEqual(scalarMatches.map { $0.0 }, scalarExpected)
1630+
}
16211631
}
16221632

0 commit comments

Comments
 (0)