Skip to content

Commit 85245ff

Browse files
committed
Add a string-specific search algorithm
This adds a Boyer-Moore substring search algorithm, and updates the `firstRange(of:)` and `ranges(of:)` methods to use that when both pieces of the search are strings/substrings. Still need to look at availability and switch the "replacing" methods to use this new search algorithm.
1 parent c3ee2fa commit 85245ff

File tree

3 files changed

+147
-6
lines changed

3 files changed

+147
-6
lines changed

Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,21 +21,49 @@ extension Collection {
2121
}
2222

2323
// MARK: Fixed pattern algorithms
24+
extension Substring {
  /// Finds the first occurrence of `other` in this substring using the
  /// string-specific `SubstringSearcher`.
  ///
  /// - Parameter other: The pattern to search for.
  /// - Returns: The first range produced by a `SubstringSearcher` over this
  ///   substring, or `nil` if the searcher produces none.
  @usableFromInline
  func _firstRangeSubstring(
    of other: Substring
  ) -> Range<String.Index>? {
    // The searcher is its own iterator, so it has to be mutable to advance.
    var matches = SubstringSearcher(text: self, pattern: other)
    let firstMatch = matches.next()
    return firstMatch
  }
}
2433

2534
extension Collection where Element: Equatable {
  /// Searches this collection for `other` with the element-generic
  /// `ZSearcher`, comparing elements with `==`.
  ///
  /// - Parameter other: The collection to search for.
  /// - Returns: The result of `ZSearcher.search` over the whole collection.
  @usableFromInline
  func _firstRangeGeneric<C: Collection>(
    of other: C
  ) -> Range<Index>? where C.Element == Element {
    let pattern = Array(other)
    let wholeRange = startIndex..<endIndex
    return ZSearcher<SubSequence>(pattern: pattern, by: ==)
      .search(self[...], in: wholeRange)
  }

  /// Finds and returns the range of the first occurrence of a given collection
  /// within this collection.
  ///
  /// - Parameter other: The collection to search for.
  /// - Returns: A range in the collection of the first occurrence of `other`.
  ///   Returns `nil` if `other` is not found.
  @available(SwiftStdlib 5.7, *)
  @inline(__always)
  public func firstRange<C: Collection>(
    of other: C
  ) -> Range<Index>? where C.Element == Element {
    // Normalise the receiver to a `Substring`; anything that is neither a
    // `String` nor a `Substring` takes the generic path.
    let text: Substring
    switch self {
    case let string as String:
      text = string[...]
    case let substring as Substring:
      text = substring
    default:
      return _firstRangeGeneric(of: other)
    }

    // Same normalisation for the pattern.
    let pattern: Substring
    switch other {
    case let string as String:
      pattern = string[...]
    case let substring as Substring:
      pattern = substring
    default:
      return _firstRangeGeneric(of: other)
    }

    // `Index` is `String.Index` whenever `self` is a `String` or a
    // `Substring`, so the cast only restores the generic spelling.
    return text._firstRangeSubstring(of: pattern) as! Range<Index>?
  }
}
4169

@@ -47,11 +75,23 @@ extension BidirectionalCollection where Element: Comparable {
4775
/// - Returns: A range in the collection of the first occurrence of `other`.
4876
/// Returns `nil` if `other` is not found.
4977
@available(SwiftStdlib 5.7, *)
78+
@inline(__always)
5079
public func firstRange<C: Collection>(
  of other: C
) -> Range<Index>? where C.Element == Element {
  // Normalise the receiver to a `Substring`; anything that is neither a
  // `String` nor a `Substring` takes the generic path.
  let text: Substring
  switch self {
  case let string as String:
    text = string[...]
  case let substring as Substring:
    text = substring
  default:
    return _firstRangeGeneric(of: other)
  }

  // Same normalisation for the pattern.
  let pattern: Substring
  switch other {
  case let string as String:
    pattern = string[...]
  case let substring as Substring:
    pattern = substring
  default:
    return _firstRangeGeneric(of: other)
  }

  // `Index` is `String.Index` whenever `self` is a `String` or a
  // `Substring`, so the cast only restores the generic spelling.
  return text._firstRangeSubstring(of: pattern) as! Range<Index>?
}
5696
}
5797

Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,13 @@ extension Collection where Element: Equatable {
135135
) -> RangesCollection<ZSearcher<Self>> where C.Element == Element {
136136
_ranges(of: ZSearcher(pattern: Array(other), by: ==))
137137
}
138+
139+
@usableFromInline
140+
func _rangesGeneric<C: Collection>(
  of other: C
) -> [Range<Index>] where C.Element == Element {
  // Collect every range reported by the collection-based `_ranges(of:)`
  // into an array.
  let matches = _ranges(of: other)
  return Array(matches)
}
138145

139146
// FIXME: Return `some Collection<Range<Index>>` for SE-0346
140147
/// Finds and returns the ranges of the all occurrences of a given sequence
@@ -143,10 +150,23 @@ extension Collection where Element: Equatable {
143150
/// - Returns: A collection of ranges of all occurrences of `other`. Returns
144151
/// an empty collection if `other` is not found.
145152
@available(SwiftStdlib 5.7, *)
153+
@inline(__always)
146154
public func ranges<C: Collection>(
  of other: C
) -> [Range<Index>] where C.Element == Element {
  // Normalise the receiver to a `Substring`; anything that is neither a
  // `String` nor a `Substring` takes the generic path.
  let text: Substring
  switch self {
  case let string as String:
    text = string[...]
  case let substring as Substring:
    text = substring
  default:
    return _rangesGeneric(of: other)
  }

  // Same normalisation for the pattern.
  let pattern: Substring
  switch other {
  case let string as String:
    pattern = string[...]
  case let substring as Substring:
    pattern = substring
  default:
    return _rangesGeneric(of: other)
  }

  // `Index` is `String.Index` whenever `self` is a `String` or a
  // `Substring`, so the cast only restores the generic spelling.
  let matches = SubstringSearcher(text: text, pattern: pattern)
  return Array(matches) as! [Range<Index>]
}
151171
}
152172

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
/// An implementation of the Boyer-Moore algorithm, for string-specific
/// searching.
///
/// Only the bad-character rule is used. The type is both a sequence and its
/// own iterator: every call to `next()` returns the range of the next
/// occurrence of `pattern` in `text`, and the search resumes after the end of
/// the previous match, so the reported ranges never overlap.
@usableFromInline
struct SubstringSearcher: Sequence, IteratorProtocol {
  /// The text that is searched.
  @usableFromInline
  let text: Substring

  /// The pattern to look for in `text`.
  @usableFromInline
  let pattern: Substring

  /// Maps every character of `pattern` to the offset of its last occurrence
  /// in `pattern`.
  @usableFromInline
  let badCharacterOffsets: [Character: Int]

  /// The number of characters in `pattern`.
  @usableFromInline
  let patternCount: Int

  /// The end index of the next candidate alignment of `pattern` in `text`,
  /// or `nil` once no further candidate fits inside `text`.
  @usableFromInline
  var endOfSearch: String.Index?

  /// Creates a searcher positioned at the first candidate alignment.
  ///
  /// - Parameters:
  ///   - text: The text to search.
  ///   - pattern: The pattern to search for. An empty pattern matches at
  ///     every position of `text`, including `text.endIndex`.
  @usableFromInline
  init(text: Substring, pattern: Substring) {
    self.text = text
    self.pattern = pattern
    self.patternCount = pattern.count
    // `nil` when `text` is shorter than `pattern`: nothing to search.
    self.endOfSearch = text.index(
      text.startIndex, offsetBy: patternCount, limitedBy: text.endIndex)
    // For repeated characters the last (right-most) offset wins.
    self.badCharacterOffsets = Dictionary(
      zip(pattern, 0...), uniquingKeysWith: { _, last in last })
  }

  /// Returns the range of the next occurrence of `pattern` in `text`, or
  /// `nil` when there are no more occurrences.
  @inlinable
  mutating func next() -> Range<String.Index>? {
    while let end = endOfSearch {
      // Empty pattern matches at every position.
      if patternCount == 0 {
        endOfSearch = end == text.endIndex ? nil : text.index(after: end)
        return end..<end
      }

      var patternOffset = patternCount - 1
      var patternCursor = pattern.index(before: pattern.endIndex)
      var textCursor = text.index(before: end)

      // Compare backwards from `end` towards the start of the pattern.
      while pattern[patternCursor] == text[textCursor] {
        // Success!
        if patternCursor == pattern.startIndex {
          // Resume after this match so that results do not overlap.
          endOfSearch = text.index(
            end, offsetBy: patternCount, limitedBy: text.endIndex)
          return textCursor..<end
        }

        precondition(textCursor > text.startIndex)
        patternOffset -= 1
        text.formIndex(before: &textCursor)
        pattern.formIndex(before: &patternCursor)
      }

      // Match failed at pattern offset `patternOffset`. Shift so that the
      // last occurrence of the mismatching text character in the pattern
      // lines up with it. A character that does not occur in the pattern at
      // all counts as offset -1, which moves the whole pattern past it.
      let lastOccurrence = badCharacterOffsets[text[textCursor]] ?? -1
      let shiftOffset = Swift.max(1, patternOffset - lastOccurrence)
      endOfSearch = text.index(
        end, offsetBy: shiftOffset, limitedBy: text.endIndex)
    }
    return nil
  }
}
81+

0 commit comments

Comments
 (0)