Skip to content

Disentangle disparate 'bounds' ideas in processor #496

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Jun 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,6 @@ extension BidirectionalCollection where SubSequence == Substring {
@_disfavoredOverload
@available(SwiftStdlib 5.7, *)
public func contains(_ regex: some RegexComponent) -> Bool {
_contains(RegexConsumer(regex))
(try? regex.regex.firstMatch(in: self[...])) != nil
}
}
34 changes: 30 additions & 4 deletions Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,18 @@ extension BidirectionalCollection where Element: Comparable {
@available(SwiftStdlib 5.7, *)
struct RegexRangesCollection<Output> {
let base: RegexMatchesCollection<Output>

init(string: Substring, regex: Regex<Output>) {
self.base = RegexMatchesCollection(base: string, regex: regex)

init(
input: String,
subjectBounds: Range<String.Index>,
searchBounds: Range<String.Index>,
regex: Regex<Output>
) {
self.base = .init(
input: input,
subjectBounds: subjectBounds,
searchBounds: searchBounds,
regex: regex)
}
}

Expand Down Expand Up @@ -263,12 +272,29 @@ extension RegexRangesCollection: Collection {
// MARK: Regex algorithms

extension Collection where SubSequence == Substring {
/// Returns the collection of ranges matched by `regex`, built over the
/// full base string of this collection rather than over a slice of it.
///
/// - Parameters:
///   - regex: The regex component whose underlying `regex` is searched for.
///   - subjectBounds: Forwarded unchanged to `RegexRangesCollection`.
///   - searchBounds: Forwarded unchanged to `RegexRangesCollection`.
/// - Returns: A `RegexRangesCollection` created from the base string,
///   the two bounds, and the component's regex.
@available(SwiftStdlib 5.7, *)
@_disfavoredOverload
func _ranges<R: RegexComponent>(
  of regex: R,
  subjectBounds: Range<String.Index>,
  searchBounds: Range<String.Index>
) -> RegexRangesCollection<R.RegexOutput> {
  // Slice with `[...]` to obtain a `Substring`, then recover the whole
  // underlying `String` so that the bounds index into the full storage.
  let wholeInput = self[...].base
  let underlyingRegex = regex.regex
  return RegexRangesCollection(
    input: wholeInput,
    subjectBounds: subjectBounds,
    searchBounds: searchBounds,
    regex: underlyingRegex)
}

@available(SwiftStdlib 5.7, *)
@_disfavoredOverload
func _ranges<R: RegexComponent>(
of regex: R
) -> RegexRangesCollection<R.RegexOutput> {
RegexRangesCollection(string: self[...], regex: regex.regex)
_ranges(
of: regex,
subjectBounds: startIndex..<endIndex,
searchBounds: startIndex..<endIndex)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,10 @@ extension RangeReplaceableCollection where SubSequence == Substring {
maxReplacements: Int = .max
) -> Self where Replacement.Element == Element {
_replacing(
self[subrange]._ranges(of: regex),
self._ranges(
of: regex,
subjectBounds: startIndex..<endIndex,
searchBounds: subrange),
with: replacement,
maxReplacements: maxReplacements)
}
Expand Down
39 changes: 31 additions & 8 deletions Sources/_StringProcessing/Algorithms/Matching/Matches.swift
Original file line number Diff line number Diff line change
Expand Up @@ -185,14 +185,26 @@ extension BidirectionalCollection {

@available(SwiftStdlib 5.7, *)
struct RegexMatchesCollection<Output> {
let input: Substring
let input: String
let subjectBounds: Range<String.Index>
let searchBounds: Range<String.Index>
let regex: Regex<Output>
let startIndex: Index

init(base: Substring, regex: Regex<Output>) {
self.input = base
init(
input: String,
subjectBounds: Range<String.Index>,
searchBounds: Range<String.Index>,
regex: Regex<Output>
) {
self.input = input
self.subjectBounds = subjectBounds
self.searchBounds = searchBounds
self.regex = regex
self.startIndex = base.firstMatch(of: regex).map(Index.match) ?? .end
self.startIndex = (try? regex._firstMatch(
input,
subjectBounds: subjectBounds,
searchBounds: searchBounds)).map(Index.match) ?? .end
}
}

Expand Down Expand Up @@ -241,12 +253,15 @@ extension RegexMatchesCollection: Sequence {
}

// `nextStart` is `nil` when iteration has completed
guard let start = nextStart else {
guard let start = nextStart, start <= base.searchBounds.upperBound else {
return nil
}

// Otherwise, find the next match (if any) and compute `nextStart`
let match = try? base.regex.firstMatch(in: base.input[start...])
let match = try? base.regex._firstMatch(
base.input,
subjectBounds: base.subjectBounds,
searchBounds: start..<base.searchBounds.upperBound)
nextStart = match.flatMap(base.searchIndex(after:))
return match
}
Expand Down Expand Up @@ -310,7 +325,11 @@ extension RegexMatchesCollection: Collection {

guard
let start = searchIndex(after: currentMatch),
let nextMatch = try? regex.firstMatch(in: input[start...])
start <= searchBounds.upperBound,
let nextMatch = try? regex._firstMatch(
input,
subjectBounds: subjectBounds,
searchBounds: start..<searchBounds.upperBound)
else {
return .end
}
Expand All @@ -331,7 +350,11 @@ extension BidirectionalCollection where SubSequence == Substring {
func _matches<R: RegexComponent>(
of regex: R
) -> RegexMatchesCollection<R.RegexOutput> {
RegexMatchesCollection(base: self[...], regex: regex.regex)
RegexMatchesCollection(
input: self[...].base,
subjectBounds: startIndex..<endIndex,
searchBounds: startIndex..<endIndex,
regex: regex.regex)
}

@available(SwiftStdlib 5.7, *)
Expand Down
55 changes: 32 additions & 23 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -96,26 +96,26 @@ fileprivate extension Compiler.ByteCodeGen {
// need to supply both a slice bounds and a per-search bounds.
switch kind {
case .startOfSubject:
builder.buildAssert { (input, pos, bounds) in
pos == input.startIndex
builder.buildAssert { (input, pos, subjectBounds) in
pos == subjectBounds.lowerBound
}

case .endOfSubjectBeforeNewline:
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
if pos == input.endIndex { return true }
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
if pos == subjectBounds.upperBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input.index(after: pos) == input.endIndex
return input.index(after: pos) == subjectBounds.upperBound
&& input[pos].isNewline
case .unicodeScalar:
return input.unicodeScalars.index(after: pos) == input.endIndex
return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound
&& input.unicodeScalars[pos].isNewline
}
}

case .endOfSubject:
builder.buildAssert { (input, pos, bounds) in
pos == input.endIndex
builder.buildAssert { (input, pos, subjectBounds) in
pos == subjectBounds.upperBound
}

case .resetStartOfMatch:
Expand All @@ -124,9 +124,10 @@ fileprivate extension Compiler.ByteCodeGen {

case .firstMatchingPositionInSubject:
// TODO: We can probably build a nice model with API here
builder.buildAssert { (input, pos, bounds) in
pos == bounds.lowerBound
}

// FIXME: This needs to be based on `searchBounds`,
// not the `subjectBounds` given as an argument here
builder.buildAssert { (input, pos, subjectBounds) in false }

case .textSegment:
builder.buildAssert { (input, pos, _) in
Expand All @@ -141,9 +142,13 @@ fileprivate extension Compiler.ByteCodeGen {
}

case .startOfLine:
// FIXME: Anchor.startOfLine must always use this first branch
// The behavior of `^` should depend on `anchorsMatchNewlines`, but
// the DSL-based `.startOfLine` anchor should always match the start
// of a line. Right now we don't distinguish between those anchors.
if options.anchorsMatchNewlines {
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
if pos == input.startIndex { return true }
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
if pos == subjectBounds.lowerBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input[input.index(before: pos)].isNewline
Expand All @@ -152,15 +157,19 @@ fileprivate extension Compiler.ByteCodeGen {
}
}
} else {
builder.buildAssert { (input, pos, bounds) in
pos == input.startIndex
builder.buildAssert { (input, pos, subjectBounds) in
pos == subjectBounds.lowerBound
}
}

case .endOfLine:
// FIXME: Anchor.endOfLine must always use this first branch
// The behavior of `$` should depend on `anchorsMatchNewlines`, but
// the DSL-based `.endOfLine` anchor should always match the end
// of a line. Right now we don't distinguish between those anchors.
if options.anchorsMatchNewlines {
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
if pos == input.endIndex { return true }
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
if pos == subjectBounds.upperBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input[pos].isNewline
Expand All @@ -169,25 +178,25 @@ fileprivate extension Compiler.ByteCodeGen {
}
}
} else {
builder.buildAssert { (input, pos, bounds) in
pos == input.endIndex
builder.buildAssert { (input, pos, subjectBounds) in
pos == subjectBounds.upperBound
}
}

case .wordBoundary:
// TODO: May want to consider Unicode level
builder.buildAssert { [options] (input, pos, bounds) in
builder.buildAssert { [options] (input, pos, subjectBounds) in
// TODO: How should we handle bounds?
_CharacterClassModel.word.isBoundary(
input, at: pos, bounds: bounds, with: options)
input, at: pos, bounds: subjectBounds, with: options)
}

case .notWordBoundary:
// TODO: May want to consider Unicode level
builder.buildAssert { [options] (input, pos, bounds) in
builder.buildAssert { [options] (input, pos, subjectBounds) in
// TODO: How should we handle bounds?
!_CharacterClassModel.word.isBoundary(
input, at: pos, bounds: bounds, with: options)
input, at: pos, bounds: subjectBounds, with: options)
}
}
}
Expand Down
17 changes: 16 additions & 1 deletion Sources/_StringProcessing/Engine/Consume.swift
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,25 @@ extension Engine {
Processor(
program: program,
input: input,
bounds: bounds,
subjectBounds: bounds,
searchBounds: bounds,
matchMode: matchMode,
isTracingEnabled: enableTracing)
}

/// Creates a `Processor` configured for a first-match search.
///
/// The processor is always created with the `.partialFromFront` match
/// mode; `program` and `enableTracing` are taken from the enclosing scope.
///
/// - Parameters:
///   - input: The string handed to the processor as its input.
///   - subjectBounds: Forwarded unchanged as the processor's subject bounds.
///   - searchBounds: Forwarded unchanged as the processor's search bounds.
/// - Returns: The newly constructed `Processor`.
func makeFirstMatchProcessor(
  input: String,
  subjectBounds: Range<String.Index>,
  searchBounds: Range<String.Index>
) -> Processor {
  let firstMatchProcessor = Processor(
    program: program,
    input: input,
    subjectBounds: subjectBounds,
    searchBounds: searchBounds,
    matchMode: .partialFromFront,
    isTracingEnabled: enableTracing)
  return firstMatchProcessor
}
}

extension Processor {
Expand Down
Loading