Skip to content

Commit 613fa62

Browse files
natecook1000 authored and milseman committed
Add regex-specific Matches and Ranges collections (swiftlang#460)
This prepares for adopting an opaque result type for matches(of:) and ranges(of:). The old, CollectionConsumer-based model moves index-by-index, and isn't aware of the regex's semantic level, which results in inaccurate results for regexes that match at a mid-character index.
1 parent 0e5315b commit 613fa62

File tree

5 files changed

+252
-61
lines changed

5 files changed

+252
-61
lines changed

Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,17 +226,53 @@ extension BidirectionalCollection where Element: Comparable {
226226
// }
227227
}
228228

229+
/// A collection of the ranges matched by a regex within a substring.
///
/// This type is a thin wrapper around `RegexMatchesCollection`, which
/// performs the actual searching.
@available(SwiftStdlib 5.7, *)
230+
struct RegexRangesCollection<Output> {
231+
/// The underlying collection of matches whose ranges are vended.
let base: RegexMatchesCollection<Output>
232+
233+
/// Creates a ranges collection that searches `string` for matches of `regex`.
///
/// Note: `RegexMatchesCollection.init` looks up the first match eagerly, so
/// constructing this collection already runs one search.
init(string: Substring, regex: Regex<Output>) {
234+
self.base = RegexMatchesCollection(base: string, regex: regex)
235+
}
236+
}
237+
238+
// Sequence conformance: iterates the underlying matches and yields only the
// range of each one.
@available(SwiftStdlib 5.7, *)
239+
extension RegexRangesCollection: Sequence {
240+
/// An iterator that adapts the matches iterator to produce ranges.
struct Iterator: IteratorProtocol {
241+
/// The wrapped iterator over `Regex<Output>.Match` values.
var matchesBase: RegexMatchesCollection<Output>.Iterator
242+
243+
/// Returns the range of the next match, or `nil` once the wrapped iterator
/// is exhausted.
mutating func next() -> Range<String.Index>? {
244+
matchesBase.next().map(\.range)
245+
}
246+
}
247+
248+
/// Returns an iterator backed by a fresh iterator over `base`.
func makeIterator() -> Iterator {
249+
Iterator(matchesBase: base.makeIterator())
250+
}
251+
}
252+
253+
// Collection conformance: every requirement forwards to `base`; only the
// subscript differs, projecting the match at a position to its range.
@available(SwiftStdlib 5.7, *)
254+
extension RegexRangesCollection: Collection {
255+
/// Indices are shared with the underlying matches collection.
typealias Index = RegexMatchesCollection<Output>.Index
256+
257+
var startIndex: Index { base.startIndex }
258+
var endIndex: Index { base.endIndex }
259+
func index(after i: Index) -> Index { base.index(after: i) }
260+
/// The range of the match at `position`.
subscript(position: Index) -> Range<String.Index> { base[position].range }
261+
}
262+
229263
// MARK: Regex algorithms
230264

231-
extension BidirectionalCollection where SubSequence == Substring {
265+
extension Collection where SubSequence == Substring {
232266
/// Returns a collection of the ranges in `self` that match `regex`.
///
/// The full slice `self[...]` and `regex.regex` are handed to the
/// collection type named in the return clause.
@available(SwiftStdlib 5.7, *)
233267
@_disfavoredOverload
234268
func _ranges<R: RegexComponent>(
235269
of regex: R
236-
) -> RangesCollection<RegexConsumer<R, Self>> {
237-
_ranges(of: RegexConsumer(regex))
270+
) -> RegexRangesCollection<R.RegexOutput> {
271+
RegexRangesCollection(string: self[...], regex: regex.regex)
238272
}
273+
}
239274

275+
extension BidirectionalCollection where SubSequence == Substring {
240276
@available(SwiftStdlib 5.7, *)
241277
func _rangesFromBack<R: RegexComponent>(
242278
of regex: R

Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift

Lines changed: 14 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,21 @@
1212
// MARK: `CollectionSearcher` algorithms
1313

1414
extension RangeReplaceableCollection {
15-
func _replacing<Searcher: CollectionSearcher, Replacement: Collection>(
16-
_ searcher: Searcher,
15+
func _replacing<Ranges: Collection, Replacement: Collection>(
16+
_ ranges: Ranges,
1717
with replacement: Replacement,
18-
subrange: Range<Index>,
1918
maxReplacements: Int = .max
20-
) -> Self where Searcher.Searched == SubSequence,
19+
) -> Self where Ranges.Element == Range<Index>,
2120
Replacement.Element == Element
2221
{
2322
precondition(maxReplacements >= 0)
2423

25-
var index = subrange.lowerBound
2624
var result = Self()
27-
result.append(contentsOf: self[..<index])
25+
var index = startIndex
2826

29-
for range in self[subrange]._ranges(of: searcher).prefix(maxReplacements) {
27+
// `maxRanges` is a workaround for https://github.com/apple/swift/issues/59522
28+
let maxRanges = ranges.prefix(maxReplacements)
29+
for range in maxRanges {
3030
result.append(contentsOf: self[index..<range.lowerBound])
3131
result.append(contentsOf: replacement)
3232
index = range.upperBound
@@ -36,29 +36,15 @@ extension RangeReplaceableCollection {
3636
return result
3737
}
3838

39-
func _replacing<Searcher: CollectionSearcher, Replacement: Collection>(
40-
_ searcher: Searcher,
41-
with replacement: Replacement,
42-
maxReplacements: Int = .max
43-
) -> Self where Searcher.Searched == SubSequence,
44-
Replacement.Element == Element
45-
{
46-
_replacing(
47-
searcher,
48-
with: replacement,
49-
subrange: startIndex..<endIndex,
50-
maxReplacements: maxReplacements)
51-
}
52-
5339
/// In-place variant of `_replacing`: forwards its arguments unchanged to
/// `_replacing` and assigns the returned collection back to `self`.
///
/// - Parameters:
///   - replacement: The elements substituted for each replaced region.
///   - maxReplacements: Upper bound on the number of replacements; defaults
///     to `.max`.
mutating func _replace<
54-
Searcher: CollectionSearcher, Replacement: Collection
40+
Ranges: Collection, Replacement: Collection
5541
>(
56-
_ searcher: Searcher,
42+
_ ranges: Ranges,
5743
with replacement: Replacement,
5844
maxReplacements: Int = .max
59-
) where Searcher.Searched == SubSequence, Replacement.Element == Element {
45+
) where Ranges.Element == Range<Index>, Replacement.Element == Element {
6046
self = _replacing(
61-
searcher,
47+
ranges,
6248
with: replacement,
6349
maxReplacements: maxReplacements)
6450
}
@@ -85,9 +71,8 @@ extension RangeReplaceableCollection where Element: Equatable {
8571
maxReplacements: Int = .max
8672
) -> Self where C.Element == Element, Replacement.Element == Element {
8773
_replacing(
88-
ZSearcher(pattern: Array(other), by: ==),
74+
self[subrange]._ranges(of: other),
8975
with: replacement,
90-
subrange: subrange,
9176
maxReplacements: maxReplacements)
9277
}
9378

@@ -143,9 +128,8 @@ extension RangeReplaceableCollection
143128
maxReplacements: Int = .max
144129
) -> Self where C.Element == Element, Replacement.Element == Element {
145130
_replacing(
146-
PatternOrEmpty(searcher: TwoWaySearcher(pattern: Array(other))),
131+
self[subrange]._ranges(of: other),
147132
with: replacement,
148-
subrange: subrange,
149133
maxReplacements: maxReplacements)
150134
}
151135

@@ -195,9 +179,8 @@ extension RangeReplaceableCollection where SubSequence == Substring {
195179
maxReplacements: Int = .max
196180
) -> Self where Replacement.Element == Element {
197181
_replacing(
198-
RegexConsumer(regex),
182+
self[subrange]._ranges(of: regex),
199183
with: replacement,
200-
subrange: subrange,
201184
maxReplacements: maxReplacements)
202185
}
203186

Sources/_StringProcessing/Algorithms/Matching/Matches.swift

Lines changed: 145 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -183,13 +183,155 @@ extension BidirectionalCollection {
183183

184184
// MARK: Regex algorithms
185185

186+
/// A collection of the matches of a regex within a substring.
///
/// The first match is located when the collection is created and stored in
/// `startIndex`; later matches are found on demand by searching again from
/// the end of the previous match.
@available(SwiftStdlib 5.7, *)
187+
struct RegexMatchesCollection<Output> {
188+
/// The text being searched.
let input: Substring
189+
/// The regex whose matches make up the collection.
let regex: Regex<Output>
190+
/// The index holding the first match, or `.end` when there is no match.
let startIndex: Index
191+
192+
/// Creates a matches collection over `base`, eagerly searching for the
/// first match of `regex`.
init(base: Substring, regex: Regex<Output>) {
193+
self.input = base
194+
self.regex = regex
195+
// No first match means the collection is empty: `startIndex == endIndex`.
self.startIndex = base.firstMatch(of: regex).map(Index.match) ?? .end
196+
}
197+
}
198+
199+
// Sequence conformance, plus the shared helper that decides where the search
// for the next match resumes.
@available(SwiftStdlib 5.7, *)
200+
extension RegexMatchesCollection: Sequence {
201+
/// Returns the index to start searching for the next match after `match`.
///
/// - Returns: The upper bound of a non-empty match; for an empty match, the
///   position one step past it (one `Character` or one Unicode scalar,
///   depending on the regex's semantic level); or `nil` when an empty match
///   sits at the end of `input`, meaning there is nothing left to search.
202+
fileprivate func searchIndex(after match: Regex<Output>.Match) -> String.Index? {
203+
if !match.range.isEmpty {
204+
return match.range.upperBound
205+
}
206+
207+
// If the last match was an empty match, advance by one position and
208+
// run again, unless at the end of `input`.
209+
if match.range.lowerBound == input.endIndex {
210+
return nil
211+
}
212+
213+
// Step in the view that corresponds to the regex's semantic level, so the
// next search starts on a boundary the regex itself can match at.
switch regex.initialOptions.semanticLevel {
214+
case .graphemeCluster:
215+
return input.index(after: match.range.upperBound)
216+
case .unicodeScalar:
217+
return input.unicodeScalars.index(after: match.range.upperBound)
218+
}
219+
}
220+
221+
/// An iterator over the matches of `base`.
struct Iterator: IteratorProtocol {
222+
let base: RegexMatchesCollection
223+
224+
// Because `RegexMatchesCollection` eagerly computes the first match for
225+
// its `startIndex`, the iterator can use that match for its initial
226+
// iteration. For subsequent calls to `next()`, this value is `false`, and
227+
// `nextStart` is used to search for the next match.
228+
var initialIteration = true
229+
// Where the next search begins; `nil` once there is nothing more to find.
var nextStart: String.Index?
230+
231+
/// Creates an iterator positioned before the pre-computed first match.
init(_ matches: RegexMatchesCollection) {
232+
self.base = matches
233+
// `nextStart` stays `nil` when the collection has no first match.
self.nextStart = base.startIndex.match.flatMap(base.searchIndex(after:))
234+
}
235+
236+
/// Returns the next match, or `nil` when no further match is found.
mutating func next() -> Regex<Output>.Match? {
237+
// Initial case with pre-computed first match
238+
if initialIteration {
239+
initialIteration = false
240+
return base.startIndex.match
241+
}
242+
243+
// `nextStart` is `nil` when iteration has completed
244+
guard let start = nextStart else {
245+
return nil
246+
}
247+
248+
// Otherwise, find the next match (if any) and compute `nextStart`
249+
// Note: `try?` turns a thrown matching error into `nil`, so an error ends
// iteration in the same way as "no more matches".
let match = try? base.regex.firstMatch(in: base.input[start...])
250+
nextStart = match.flatMap(base.searchIndex(after:))
251+
return match
252+
}
253+
}
254+
255+
/// Returns an iterator that starts from the pre-computed first match.
func makeIterator() -> Iterator {
256+
Iterator(self)
257+
}
258+
}
259+
260+
// Collection conformance. An index carries the match it refers to, so
// subscripting does no searching; `index(after:)` runs a new search each time
// it is called.
@available(SwiftStdlib 5.7, *)
261+
extension RegexMatchesCollection: Collection {
262+
/// A position in the collection: either a concrete match or the
/// past-the-end sentinel.
enum Index: Comparable {
263+
case match(Regex<Output>.Match)
264+
case end
265+
266+
/// The match stored in this index, or `nil` for `.end`.
var match: Regex<Output>.Match? {
267+
switch self {
268+
case .match(let match): return match
269+
case .end: return nil
270+
}
271+
}
272+
273+
/// Two `.match` indices are equal when their ranges are equal; the match
/// output is not compared.
static func == (lhs: Self, rhs: Self) -> Bool {
274+
switch (lhs, rhs) {
275+
case (.match(let lhs), .match(let rhs)):
276+
return lhs.range == rhs.range
277+
case (.end, .end):
278+
return true
279+
case (.end, .match), (.match, .end):
280+
return false
281+
}
282+
}
283+
284+
/// Orders `.match` indices by their ranges and places `.end` after every
/// `.match`.
static func < (lhs: Self, rhs: Self) -> Bool {
285+
switch (lhs, rhs) {
286+
case (.match(let lhs), .match(let rhs)):
287+
// This implementation uses a tuple comparison so that an empty
288+
// range `i..<i` will be ordered before a non-empty range at that
289+
// same starting point `i..<j`. As of 2022-05-30, `Regex` does not
290+
// return matches of this kind, but that is one behavior under
291+
// discussion for regexes like /a*|b/ when matched against "b".
292+
return (lhs.range.lowerBound, lhs.range.upperBound)
293+
< (rhs.range.lowerBound, rhs.range.upperBound)
294+
case (.match, .end):
295+
return true
296+
case (.end, .match), (.end, .end):
297+
return false
298+
}
299+
}
300+
}
301+
302+
/// The past-the-end position, always `Index.end`.
var endIndex: Index {
303+
Index.end
304+
}
305+
306+
/// Returns the index of the match following the one at `i`, or `.end` when
/// no further match is found.
///
/// Traps if `i` is `endIndex`.
func index(after i: Index) -> Index {
307+
guard let currentMatch = i.match else {
308+
fatalError("Can't advance past the 'endIndex' of a match collection.")
309+
}
310+
311+
// Both "nowhere left to search" and "search found nothing" (including a
// thrown error swallowed by `try?`) lead to `.end`.
guard
312+
let start = searchIndex(after: currentMatch),
313+
let nextMatch = try? regex.firstMatch(in: input[start...])
314+
else {
315+
return .end
316+
}
317+
return Index.match(nextMatch)
318+
}
319+
320+
/// Returns the match stored in `position`.
///
/// Traps if `position` is `endIndex`.
subscript(position: Index) -> Regex<Output>.Match {
321+
guard let match = position.match else {
322+
fatalError("Can't subscript the 'endIndex' of a match collection.")
323+
}
324+
return match
325+
}
326+
}
327+
186328
extension BidirectionalCollection where SubSequence == Substring {
187329
/// Returns a collection of the matches of `regex` in `self`.
///
/// The full slice `self[...]` and `regex.regex` are handed to the
/// collection type named in the return clause.
@available(SwiftStdlib 5.7, *)
188330
@_disfavoredOverload
189331
func _matches<R: RegexComponent>(
190332
of regex: R
191-
) -> MatchesCollection<RegexConsumer<R, Self>> {
192-
_matches(of: RegexConsumer(regex))
333+
) -> RegexMatchesCollection<R.RegexOutput> {
334+
RegexMatchesCollection(base: self[...], regex: regex.regex)
193335
}
194336

195337
@available(SwiftStdlib 5.7, *)
@@ -207,30 +349,6 @@ extension BidirectionalCollection where SubSequence == Substring {
207349
public func matches<Output>(
208350
of r: some RegexComponent<Output>
209351
) -> [Regex<Output>.Match] {
210-
let slice = self[...]
211-
var start = self.startIndex
212-
let end = self.endIndex
213-
let regex = r.regex
214-
215-
var result = [Regex<Output>.Match]()
216-
while start <= end {
217-
guard let match = try? regex._firstMatch(
218-
slice.base, in: start..<end
219-
) else {
220-
break
221-
}
222-
result.append(match)
223-
if match.range.isEmpty {
224-
if match.range.upperBound == end {
225-
break
226-
}
227-
// FIXME: semantic level
228-
start = slice.index(after: match.range.upperBound)
229-
} else {
230-
start = match.range.upperBound
231-
}
232-
}
233-
return result
352+
Array(_matches(of: r))
234353
}
235-
236354
}

Tests/RegexTests/AlgorithmsInternalsTests.swift

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,39 @@ extension AlgorithmTests {
4444
XCTAssertEqual("x", "axb"._trimming(r))
4545
XCTAssertEqual("x", "axbb"._trimming(r))
4646
}
47+
48+
/// Exercises `RegexMatchesCollection` through `_matches(of:)`: iteration
/// (including empty matches), index offsetting, and manual index traversal.
func testMatchesCollection() {
49+
// `c*` can match the empty string, so this regex produces empty matches at
// positions where neither `a` nor `b+` applies.
let r = try! Regex("a|b+|c*", as: Substring.self)
50+
51+
let str = "zaabbbbbbcde"
52+
let matches = str._matches(of: r)
53+
let expected: [Substring] = [
54+
"", // before 'z'
55+
"a",
56+
"a",
57+
"bbbbbb",
58+
"c",
59+
"", // after 'c'
60+
"", // after 'd'
61+
"", // after 'e'
62+
]
63+
64+
// Make sure we're getting the right collection type
65+
let _: RegexMatchesCollection<Substring> = matches
66+
67+
// Sequence-style iteration yields every expected output in order.
XCTAssertEqual(matches.map(\.output), expected)
68+
69+
// Offsetting: position 3 is "bbbbbb"; 5 more steps covers the remaining
// elements of the 8-element `expected` list and lands on `endIndex`.
let i = matches.index(matches.startIndex, offsetBy: 3)
70+
XCTAssertEqual(matches[i].output, expected[3])
71+
let j = matches.index(i, offsetBy: 5)
72+
XCTAssertEqual(j, matches.endIndex)
73+
74+
// Manual traversal: each index's distance from `startIndex` must line up
// with its position in `expected`.
var index = matches.startIndex
75+
while index < matches.endIndex {
76+
XCTAssertEqual(
77+
matches[index].output,
78+
expected[matches.distance(from: matches.startIndex, to: index)])
79+
matches.formIndex(after: &index)
80+
}
81+
}
4782
}

0 commit comments

Comments
 (0)