Skip to content

Disentangle disparate 'bounds' ideas in processor #496

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Jun 22, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,6 @@ extension BidirectionalCollection where SubSequence == Substring {
@_disfavoredOverload
@available(SwiftStdlib 5.7, *)
public func contains(_ regex: some RegexComponent) -> Bool {
_contains(RegexConsumer(regex))
(try? regex.regex.firstMatch(in: self[...])) != nil
}
}
27 changes: 15 additions & 12 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -97,25 +97,25 @@ fileprivate extension Compiler.ByteCodeGen {
switch kind {
case .startOfSubject:
builder.buildAssert { (input, pos, bounds) in
pos == input.startIndex
pos == bounds.lowerBound
}

case .endOfSubjectBeforeNewline:
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
if pos == input.endIndex { return true }
if pos == bounds.upperBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input.index(after: pos) == input.endIndex
return input.index(after: pos) == bounds.upperBound
&& input[pos].isNewline
case .unicodeScalar:
return input.unicodeScalars.index(after: pos) == input.endIndex
return input.unicodeScalars.index(after: pos) == bounds.upperBound
&& input.unicodeScalars[pos].isNewline
}
}

case .endOfSubject:
builder.buildAssert { (input, pos, bounds) in
pos == input.endIndex
pos == bounds.upperBound
}

case .resetStartOfMatch:
Expand All @@ -124,9 +124,10 @@ fileprivate extension Compiler.ByteCodeGen {

case .firstMatchingPositionInSubject:
// TODO: We can probably build a nice model with API here
builder.buildAssert { (input, pos, bounds) in
pos == bounds.lowerBound
}

// FIXME: This needs to be based on `searchBounds`,
// not the `subjectBounds` given as an argument here
builder.buildAssert { (input, pos, bounds) in false }

case .textSegment:
builder.buildAssert { (input, pos, _) in
Expand All @@ -141,9 +142,10 @@ fileprivate extension Compiler.ByteCodeGen {
}

case .startOfLine:
// FIXME: Anchor.startOfLine must always use this first branch
if options.anchorsMatchNewlines {
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
if pos == input.startIndex { return true }
if pos == bounds.lowerBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input[input.index(before: pos)].isNewline
Expand All @@ -153,14 +155,15 @@ fileprivate extension Compiler.ByteCodeGen {
}
} else {
builder.buildAssert { (input, pos, bounds) in
pos == input.startIndex
pos == bounds.lowerBound
}
}

case .endOfLine:
// FIXME: Anchor.endOfLine must always use this first branch
if options.anchorsMatchNewlines {
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
if pos == input.endIndex { return true }
if pos == bounds.upperBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input[pos].isNewline
Expand All @@ -170,7 +173,7 @@ fileprivate extension Compiler.ByteCodeGen {
}
} else {
builder.buildAssert { (input, pos, bounds) in
pos == input.endIndex
pos == bounds.upperBound
}
}

Expand Down
17 changes: 16 additions & 1 deletion Sources/_StringProcessing/Engine/Consume.swift
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,25 @@ extension Engine {
Processor(
program: program,
input: input,
bounds: bounds,
subjectBounds: bounds,
searchBounds: bounds,
matchMode: matchMode,
isTracingEnabled: enableTracing)
}

/// Creates a processor for first-match searching over `input`.
///
/// Unlike a processor built from a single range, this one receives the
/// subject bounds and the search bounds as separate values and forwards
/// each to the `Processor` unchanged. The match mode is always
/// `.partialFromFront`.
///
/// - Parameters:
///   - input: The collection handed to the processor as its input.
///   - subjectBounds: The range forwarded as the processor's subject bounds.
///   - searchBounds: The range forwarded as the processor's search bounds.
/// - Returns: A `Processor` built from this engine's `program`, with tracing
///   set from `enableTracing`.
func makeFirstMatchProcessor(
input: Input,
subjectBounds: Range<Input.Index>,
searchBounds: Range<Input.Index>
) -> Processor<Input> {
Processor(
program: program,
input: input,
subjectBounds: subjectBounds,
searchBounds: searchBounds,
matchMode: .partialFromFront,
isTracingEnabled: enableTracing)
}
}

extension Processor where Input == String {
Expand Down
72 changes: 44 additions & 28 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,36 @@ struct Processor<
> where Input.Element: Equatable { // maybe Hashable?
typealias Element = Input.Element

/// The base collection of the subject to search.
///
/// Taken together, `input` and `subjectBounds` define the actual subject
/// of the search. `input` can be a "supersequence" of the subject, while
/// `input[subjectBounds]` is the logical entity that is being searched.
let input: Input

/// The bounds of the logical subject in `input`.
///
/// `subjectBounds` is equal to or a subrange of
/// `input.startIndex..<input.endIndex`.
let subjectBounds: Range<Position>

let matchMode: MatchMode
let instructions: InstructionList<Instruction>

// MARK: Resettable state

// The subject bounds.
//
// FIXME: This also conflates search bounds too!
var bounds: Range<Position>

// The current position in the subject
/// The bounds within the subject for an individual search.
///
/// `searchBounds` is equal to `subjectBounds` in some cases, but can be a
/// subrange when performing operations like searching for matches iteratively
/// or calling `str.replacing(_:with:subrange:)`.
var searchBounds: Range<Position>

/// The current search position while processing.
///
/// `currentPosition` must always be in the range `subjectBounds` or equal
/// to `subjectBounds.upperBound`.
var currentPosition: Position

var controller: Controller
Expand All @@ -59,53 +77,51 @@ struct Processor<

var failureReason: Error? = nil


// MARK: Metrics, debugging, etc.
var cycleCount = 0
var isTracingEnabled: Bool

}

extension Processor {
typealias Position = Input.Index

var start: Position { bounds.lowerBound }
var end: Position { bounds.upperBound }
var start: Position { subjectBounds.lowerBound }
var end: Position { subjectBounds.upperBound }
}

extension Processor {
init(
program: MEProgram<Input>,
input: Input,
bounds: Range<Position>,
subjectBounds: Range<Position>,
searchBounds: Range<Position>,
matchMode: MatchMode,
isTracingEnabled: Bool
) {
self.controller = Controller(pc: 0)
self.instructions = program.instructions
self.input = input
self.bounds = bounds
self.subjectBounds = subjectBounds
self.searchBounds = searchBounds
self.matchMode = matchMode
self.isTracingEnabled = isTracingEnabled
self.currentPosition = bounds.lowerBound
self.currentPosition = searchBounds.lowerBound

self.registers = Registers(program, bounds.upperBound)
// Initialize registers with end of search bounds
self.registers = Registers(program, searchBounds.upperBound)
self.storedCaptures = Array(
repeating: .init(), count: program.registerInfo.captures)

_checkInvariants()
}


mutating func reset(searchBounds: Range<Position>) {
// FIXME: We currently conflate both subject bounds and search bounds
// This should just reset search bounds
self.bounds = searchBounds
self.currentPosition = self.bounds.lowerBound
self.searchBounds = searchBounds
self.currentPosition = self.searchBounds.lowerBound

self.controller = Controller(pc: 0)

self.registers.reset(sentinel: bounds.upperBound)
self.registers.reset(sentinel: searchBounds.upperBound)

self.savePoints.removeAll(keepingCapacity: true)
self.callStack.removeAll(keepingCapacity: true)
Expand All @@ -132,7 +148,7 @@ extension Processor {
var slice: Input.SubSequence {
// TODO: Should we wholesale switch to slices, or
// does that depend on options for some anchors?
input[bounds]
input[subjectBounds]
}

// Advance in our input, without any checks or failure signalling
Expand Down Expand Up @@ -161,8 +177,8 @@ extension Processor {
/// - Precondition: `subjectBounds.contains(index) || index == subjectBounds.upperBound`
/// - Precondition: `index >= currentPosition`
mutating func resume(at index: Input.Index) {
assert(index >= bounds.lowerBound)
assert(index <= bounds.upperBound)
assert(index >= subjectBounds.lowerBound)
assert(index <= subjectBounds.upperBound)
assert(index >= currentPosition)
currentPosition = index
}
Expand Down Expand Up @@ -233,7 +249,7 @@ extension Processor {
switch (currentPosition, matchMode) {
// When reaching the end of the match bounds or when we are only doing a
// prefix match, transition to accept.
case (bounds.upperBound, _), (_, .partialFromFront):
case (subjectBounds.upperBound, _), (_, .partialFromFront):
state = .accept

// When we are doing a full match but did not reach the end of the match
Expand Down Expand Up @@ -411,9 +427,9 @@ extension Processor {

case .consumeBy:
let reg = payload.consumer
guard currentPosition < bounds.upperBound,
guard currentPosition < subjectBounds.upperBound,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Subject or search bounds?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do we, generally, know which one to choose?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Generally, we want to be moving and matching with searchBounds; subjectBounds should really only be used to match anchors, not for general purpose index movements.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(i.e. this was incorrect)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any way we could use access control or some better coding convention here? E.g. if we always want search bounds except for anchors, can we isolate the code that refers to subject bounds inside the engine?

let nextIndex = registers[reg](
input, currentPosition..<bounds.upperBound)
input, currentPosition..<subjectBounds.upperBound)
else {
signalFailure()
return
Expand All @@ -425,7 +441,7 @@ extension Processor {
let reg = payload.assertion
let assertion = registers[reg]
do {
guard try assertion(input, currentPosition, bounds) else {
guard try assertion(input, currentPosition, subjectBounds) else {
signalFailure()
return
}
Expand All @@ -440,7 +456,7 @@ extension Processor {
let matcher = registers[matcherReg]
do {
guard let (nextIdx, val) = try matcher(
input, currentPosition, bounds
input, currentPosition, subjectBounds
) else {
signalFailure()
return
Expand Down
27 changes: 15 additions & 12 deletions Sources/_StringProcessing/Executor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,17 @@ struct Executor {
@available(SwiftStdlib 5.7, *)
func firstMatch<Output>(
_ input: String,
in inputRange: Range<String.Index>,
subjectBounds: Range<String.Index>,
searchBounds: Range<String.Index>,
graphemeSemantic: Bool
) throws -> Regex<Output>.Match? {
var cpu = engine.makeProcessor(
input: input, bounds: inputRange, matchMode: .partialFromFront)
var cpu = engine.makeFirstMatchProcessor(
input: input,
subjectBounds: subjectBounds,
searchBounds: searchBounds)

var low = inputRange.lowerBound
let high = inputRange.upperBound
var low = searchBounds.lowerBound
let high = searchBounds.upperBound
while true {
if let m: Regex<Output>.Match = try _match(
input, in: low..<high, using: &cpu
Expand All @@ -49,18 +52,18 @@ struct Executor {
@available(SwiftStdlib 5.7, *)
func match<Output>(
_ input: String,
in inputRange: Range<String.Index>,
in subjectBounds: Range<String.Index>,
_ mode: MatchMode
) throws -> Regex<Output>.Match? {
var cpu = engine.makeProcessor(
input: input, bounds: inputRange, matchMode: mode)
return try _match(input, in: inputRange, using: &cpu)
input: input, bounds: subjectBounds, matchMode: mode)
return try _match(input, in: subjectBounds, using: &cpu)
}

@available(SwiftStdlib 5.7, *)
func _match<Output>(
_ input: String,
in inputRange: Range<String.Index>,
in subjectBounds: Range<String.Index>,
using cpu: inout Processor<String>
) throws -> Regex<Output>.Match? {
guard let endIdx = cpu.consume() else {
Expand All @@ -74,7 +77,7 @@ struct Executor {
values: cpu.storedCaptures,
referencedCaptureOffsets: engine.program.referencedCaptureOffsets)

let range = inputRange.lowerBound..<endIdx
let range = subjectBounds.lowerBound..<endIdx
let caps = engine.program.captureList.createElements(capList)

let anyRegexOutput = AnyRegexOutput(input: input, elements: caps)
Expand All @@ -84,9 +87,9 @@ struct Executor {
@available(SwiftStdlib 5.7, *)
func dynamicMatch(
_ input: String,
in inputRange: Range<String.Index>,
in subjectBounds: Range<String.Index>,
_ mode: MatchMode
) throws -> Regex<AnyRegexOutput>.Match? {
try match(input, in: inputRange, mode)
try match(input, in: subjectBounds, mode)
}
}
19 changes: 15 additions & 4 deletions Sources/_StringProcessing/Regex/Match.swift
Original file line number Diff line number Diff line change
Expand Up @@ -126,21 +126,32 @@ extension Regex {

func _match(
_ input: String,
in inputRange: Range<String.Index>,
in subjectBounds: Range<String.Index>,
mode: MatchMode = .wholeString
) throws -> Regex<Output>.Match? {
let executor = Executor(program: regex.program.loweredProgram)
return try executor.match(input, in: inputRange, mode)
return try executor.match(input, in: subjectBounds, mode)
}

func _firstMatch(
_ input: String,
in inputRange: Range<String.Index>
in subjectBounds: Range<String.Index>
) throws -> Regex<Output>.Match? {
try _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds)
}

func _firstMatch(
_ input: String,
subjectBounds: Range<String.Index>,
searchBounds: Range<String.Index>
) throws -> Regex<Output>.Match? {
let executor = Executor(program: regex.program.loweredProgram)
let graphemeSemantic = regex.initialOptions.semanticLevel == .graphemeCluster
return try executor.firstMatch(
input, in: inputRange, graphemeSemantic: graphemeSemantic)
input,
subjectBounds: subjectBounds,
searchBounds: searchBounds,
graphemeSemantic: graphemeSemantic)
}
}

Expand Down
Loading