Skip to content

Commit 7f5bffd

Browse files
natecook1000 authored and milseman committed
Disentangle disparate 'bounds' ideas in processor (#496)
This separates the two different ideas for boundaries in the base input:

- subjectBounds: These represent the actual subject in the input string. For a `String` callee, this will cover the entire bounds, while for a `Substring` these will represent the bounds of the substring in the base.
- searchBounds: These represent the current search range in the subject. These bounds can be the same as `subjectBounds` or a subrange when searching for subsequent matches or replacing only in a subrange of a string.

* firstMatch shouldn't update searchBounds on iteration

When we move forward while searching for the first match, the search bounds should stay the same. Only the currentPosition needs to move forward. This will allow us to implement the \G start of match anchor, with which /\Gab/ matches "abab" twice, compared with /^ab/, which only matches once.

* Make matches(of:) and ranges(of:) boundary-aware

With this change, RegexMatchesCollection keeps the subject bounds and search bounds separately, modifying the search bounds with each iteration. In addition, the replace methods that only operate on a subrange can specify that specifically, getting the correct anchor behavior while only matching within a portion of a string.
1 parent b3ea513 commit 7f5bffd

File tree

13 files changed

+292
-152
lines changed

13 files changed

+292
-152
lines changed

Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,6 @@ extension BidirectionalCollection where SubSequence == Substring {
7474
@_disfavoredOverload
7575
@available(SwiftStdlib 5.7, *)
7676
public func contains(_ regex: some RegexComponent) -> Bool {
77-
_contains(RegexConsumer(regex))
77+
(try? regex.regex.firstMatch(in: self[...])) != nil
7878
}
7979
}

Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,18 @@ extension BidirectionalCollection where Element: Comparable {
229229
@available(SwiftStdlib 5.7, *)
230230
struct RegexRangesCollection<Output> {
231231
let base: RegexMatchesCollection<Output>
232-
233-
init(string: Substring, regex: Regex<Output>) {
234-
self.base = RegexMatchesCollection(base: string, regex: regex)
232+
233+
init(
234+
input: String,
235+
subjectBounds: Range<String.Index>,
236+
searchBounds: Range<String.Index>,
237+
regex: Regex<Output>
238+
) {
239+
self.base = .init(
240+
input: input,
241+
subjectBounds: subjectBounds,
242+
searchBounds: searchBounds,
243+
regex: regex)
235244
}
236245
}
237246

@@ -263,12 +272,29 @@ extension RegexRangesCollection: Collection {
263272
// MARK: Regex algorithms
264273

265274
extension Collection where SubSequence == Substring {
275+
@available(SwiftStdlib 5.7, *)
276+
@_disfavoredOverload
277+
func _ranges<R: RegexComponent>(
278+
of regex: R,
279+
subjectBounds: Range<String.Index>,
280+
searchBounds: Range<String.Index>
281+
) -> RegexRangesCollection<R.RegexOutput> {
282+
RegexRangesCollection(
283+
input: self[...].base,
284+
subjectBounds: subjectBounds,
285+
searchBounds: searchBounds,
286+
regex: regex.regex)
287+
}
288+
266289
@available(SwiftStdlib 5.7, *)
267290
@_disfavoredOverload
268291
func _ranges<R: RegexComponent>(
269292
of regex: R
270293
) -> RegexRangesCollection<R.RegexOutput> {
271-
RegexRangesCollection(string: self[...], regex: regex.regex)
294+
_ranges(
295+
of: regex,
296+
subjectBounds: startIndex..<endIndex,
297+
searchBounds: startIndex..<endIndex)
272298
}
273299
}
274300

Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,10 @@ extension RangeReplaceableCollection where SubSequence == Substring {
179179
maxReplacements: Int = .max
180180
) -> Self where Replacement.Element == Element {
181181
_replacing(
182-
self[subrange]._ranges(of: regex),
182+
self._ranges(
183+
of: regex,
184+
subjectBounds: startIndex..<endIndex,
185+
searchBounds: subrange),
183186
with: replacement,
184187
maxReplacements: maxReplacements)
185188
}

Sources/_StringProcessing/Algorithms/Matching/Matches.swift

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -185,14 +185,26 @@ extension BidirectionalCollection {
185185

186186
@available(SwiftStdlib 5.7, *)
187187
struct RegexMatchesCollection<Output> {
188-
let input: Substring
188+
let input: String
189+
let subjectBounds: Range<String.Index>
190+
let searchBounds: Range<String.Index>
189191
let regex: Regex<Output>
190192
let startIndex: Index
191193

192-
init(base: Substring, regex: Regex<Output>) {
193-
self.input = base
194+
init(
195+
input: String,
196+
subjectBounds: Range<String.Index>,
197+
searchBounds: Range<String.Index>,
198+
regex: Regex<Output>
199+
) {
200+
self.input = input
201+
self.subjectBounds = subjectBounds
202+
self.searchBounds = searchBounds
194203
self.regex = regex
195-
self.startIndex = base.firstMatch(of: regex).map(Index.match) ?? .end
204+
self.startIndex = (try? regex._firstMatch(
205+
input,
206+
subjectBounds: subjectBounds,
207+
searchBounds: searchBounds)).map(Index.match) ?? .end
196208
}
197209
}
198210

@@ -241,12 +253,15 @@ extension RegexMatchesCollection: Sequence {
241253
}
242254

243255
// `nextStart` is `nil` when iteration has completed
244-
guard let start = nextStart else {
256+
guard let start = nextStart, start <= base.searchBounds.upperBound else {
245257
return nil
246258
}
247259

248260
// Otherwise, find the next match (if any) and compute `nextStart`
249-
let match = try? base.regex.firstMatch(in: base.input[start...])
261+
let match = try? base.regex._firstMatch(
262+
base.input,
263+
subjectBounds: base.subjectBounds,
264+
searchBounds: start..<base.searchBounds.upperBound)
250265
nextStart = match.flatMap(base.searchIndex(after:))
251266
return match
252267
}
@@ -310,7 +325,11 @@ extension RegexMatchesCollection: Collection {
310325

311326
guard
312327
let start = searchIndex(after: currentMatch),
313-
let nextMatch = try? regex.firstMatch(in: input[start...])
328+
start <= searchBounds.upperBound,
329+
let nextMatch = try? regex._firstMatch(
330+
input,
331+
subjectBounds: subjectBounds,
332+
searchBounds: start..<searchBounds.upperBound)
314333
else {
315334
return .end
316335
}
@@ -331,7 +350,11 @@ extension BidirectionalCollection where SubSequence == Substring {
331350
func _matches<R: RegexComponent>(
332351
of regex: R
333352
) -> RegexMatchesCollection<R.RegexOutput> {
334-
RegexMatchesCollection(base: self[...], regex: regex.regex)
353+
RegexMatchesCollection(
354+
input: self[...].base,
355+
subjectBounds: startIndex..<endIndex,
356+
searchBounds: startIndex..<endIndex,
357+
regex: regex.regex)
335358
}
336359

337360
@available(SwiftStdlib 5.7, *)

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -96,26 +96,26 @@ fileprivate extension Compiler.ByteCodeGen {
9696
// need to supply both a slice bounds and a per-search bounds.
9797
switch kind {
9898
case .startOfSubject:
99-
builder.buildAssert { (input, pos, bounds) in
100-
pos == input.startIndex
99+
builder.buildAssert { (input, pos, subjectBounds) in
100+
pos == subjectBounds.lowerBound
101101
}
102102

103103
case .endOfSubjectBeforeNewline:
104-
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
105-
if pos == input.endIndex { return true }
104+
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
105+
if pos == subjectBounds.upperBound { return true }
106106
switch semanticLevel {
107107
case .graphemeCluster:
108-
return input.index(after: pos) == input.endIndex
108+
return input.index(after: pos) == subjectBounds.upperBound
109109
&& input[pos].isNewline
110110
case .unicodeScalar:
111-
return input.unicodeScalars.index(after: pos) == input.endIndex
111+
return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound
112112
&& input.unicodeScalars[pos].isNewline
113113
}
114114
}
115115

116116
case .endOfSubject:
117-
builder.buildAssert { (input, pos, bounds) in
118-
pos == input.endIndex
117+
builder.buildAssert { (input, pos, subjectBounds) in
118+
pos == subjectBounds.upperBound
119119
}
120120

121121
case .resetStartOfMatch:
@@ -124,9 +124,10 @@ fileprivate extension Compiler.ByteCodeGen {
124124

125125
case .firstMatchingPositionInSubject:
126126
// TODO: We can probably build a nice model with API here
127-
builder.buildAssert { (input, pos, bounds) in
128-
pos == bounds.lowerBound
129-
}
127+
128+
// FIXME: This needs to be based on `searchBounds`,
129+
// not the `subjectBounds` given as an argument here
130+
builder.buildAssert { (input, pos, subjectBounds) in false }
130131

131132
case .textSegment:
132133
builder.buildAssert { (input, pos, _) in
@@ -141,9 +142,13 @@ fileprivate extension Compiler.ByteCodeGen {
141142
}
142143

143144
case .startOfLine:
145+
// FIXME: Anchor.startOfLine must always use this first branch
146+
// The behavior of `^` should depend on `anchorsMatchNewlines`, but
147+
// the DSL-based `.startOfLine` anchor should always match the start
148+
// of a line. Right now we don't distinguish between those anchors.
144149
if options.anchorsMatchNewlines {
145-
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
146-
if pos == input.startIndex { return true }
150+
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
151+
if pos == subjectBounds.lowerBound { return true }
147152
switch semanticLevel {
148153
case .graphemeCluster:
149154
return input[input.index(before: pos)].isNewline
@@ -152,15 +157,19 @@ fileprivate extension Compiler.ByteCodeGen {
152157
}
153158
}
154159
} else {
155-
builder.buildAssert { (input, pos, bounds) in
156-
pos == input.startIndex
160+
builder.buildAssert { (input, pos, subjectBounds) in
161+
pos == subjectBounds.lowerBound
157162
}
158163
}
159164

160165
case .endOfLine:
166+
// FIXME: Anchor.endOfLine must always use this first branch
167+
// The behavior of `$` should depend on `anchorsMatchNewlines`, but
168+
// the DSL-based `.endOfLine` anchor should always match the end
169+
// of a line. Right now we don't distinguish between those anchors.
161170
if options.anchorsMatchNewlines {
162-
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
163-
if pos == input.endIndex { return true }
171+
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
172+
if pos == subjectBounds.upperBound { return true }
164173
switch semanticLevel {
165174
case .graphemeCluster:
166175
return input[pos].isNewline
@@ -169,25 +178,25 @@ fileprivate extension Compiler.ByteCodeGen {
169178
}
170179
}
171180
} else {
172-
builder.buildAssert { (input, pos, bounds) in
173-
pos == input.endIndex
181+
builder.buildAssert { (input, pos, subjectBounds) in
182+
pos == subjectBounds.upperBound
174183
}
175184
}
176185

177186
case .wordBoundary:
178187
// TODO: May want to consider Unicode level
179-
builder.buildAssert { [options] (input, pos, bounds) in
188+
builder.buildAssert { [options] (input, pos, subjectBounds) in
180189
// TODO: How should we handle bounds?
181190
_CharacterClassModel.word.isBoundary(
182-
input, at: pos, bounds: bounds, with: options)
191+
input, at: pos, bounds: subjectBounds, with: options)
183192
}
184193

185194
case .notWordBoundary:
186195
// TODO: May want to consider Unicode level
187-
builder.buildAssert { [options] (input, pos, bounds) in
196+
builder.buildAssert { [options] (input, pos, subjectBounds) in
188197
// TODO: How should we handle bounds?
189198
!_CharacterClassModel.word.isBoundary(
190-
input, at: pos, bounds: bounds, with: options)
199+
input, at: pos, bounds: subjectBounds, with: options)
191200
}
192201
}
193202
}

Sources/_StringProcessing/Engine/Consume.swift

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,25 @@ extension Engine {
1818
Processor(
1919
program: program,
2020
input: input,
21-
bounds: bounds,
21+
subjectBounds: bounds,
22+
searchBounds: bounds,
2223
matchMode: matchMode,
2324
isTracingEnabled: enableTracing)
2425
}
26+
27+
func makeFirstMatchProcessor(
28+
input: String,
29+
subjectBounds: Range<String.Index>,
30+
searchBounds: Range<String.Index>
31+
) -> Processor {
32+
Processor(
33+
program: program,
34+
input: input,
35+
subjectBounds: subjectBounds,
36+
searchBounds: searchBounds,
37+
matchMode: .partialFromFront,
38+
isTracingEnabled: enableTracing)
39+
}
2540
}
2641

2742
extension Processor {

0 commit comments

Comments
 (0)