-
Notifications
You must be signed in to change notification settings - Fork 49
Optimize search for start-anchored regexes #682
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -711,6 +711,105 @@ extension DSLTree.Node { | |
} | ||
} | ||
|
||
extension DSLTree.Node { | ||
/// Implementation for `canOnlyMatchAtStart`, which maintains the option | ||
/// state. | ||
/// | ||
/// For a given specific node, this method can return one of three values: | ||
/// | ||
/// - `true`: This node is guaranteed to match only at the start of a subject. | ||
/// - `false`: This node can match anywhere in the subject. | ||
/// - `nil`: This node is inconclusive about where it can match. | ||
/// | ||
/// In particular, non-required groups and option-setting groups are | ||
/// inconclusive about where they can match. | ||
private func _canOnlyMatchAtStartImpl(_ options: inout MatchingOptions) -> Bool? { | ||
switch self { | ||
// Defining cases | ||
case .atom(.assertion(.startOfSubject)): | ||
return true | ||
case .atom(.assertion(.caretAnchor)): | ||
return !options.anchorsMatchNewlines | ||
|
||
// Changing options doesn't determine `true`/`false`. | ||
case .atom(.changeMatchingOptions(let sequence)): | ||
options.apply(sequence.ast) | ||
return nil | ||
|
||
// Any other atom or consuming node returns `false`. | ||
case .atom, .customCharacterClass, .quotedLiteral: | ||
return false | ||
|
||
// Trivia/empty have no effect. | ||
case .trivia, .empty: | ||
return nil | ||
|
||
// In an alternation, all of its children must match only at start. | ||
case .orderedChoice(let children): | ||
return children.allSatisfy { $0._canOnlyMatchAtStartImpl(&options) == true } | ||
|
||
// In a concatenation, the first definitive child provides the answer. | ||
case .concatenation(let children): | ||
for child in children { | ||
if let result = child._canOnlyMatchAtStartImpl(&options) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Instead of a loop, would this be something like There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Unfortunately, no — a regex like |
||
return result | ||
} | ||
} | ||
return false | ||
|
||
// Groups (and other parent nodes) defer to the child. | ||
case .nonCapturingGroup(let kind, let child): | ||
options.beginScope() | ||
defer { options.endScope() } | ||
if case .changeMatchingOptions(let sequence) = kind.ast { | ||
options.apply(sequence) | ||
} | ||
return child._canOnlyMatchAtStartImpl(&options) | ||
case .capture(_, _, let child, _): | ||
options.beginScope() | ||
defer { options.endScope() } | ||
return child._canOnlyMatchAtStartImpl(&options) | ||
case .ignoreCapturesInTypedOutput(let child), | ||
.convertedRegexLiteral(let child, _): | ||
return child._canOnlyMatchAtStartImpl(&options) | ||
|
||
// A quantification that doesn't require its child to exist can still | ||
// allow a start-only match. (e.g. `/(foo)?^bar/`) | ||
case .quantification(let amount, _, let child): | ||
return amount.requiresAtLeastOne | ||
? child._canOnlyMatchAtStartImpl(&options) | ||
: nil | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the logic for this? If a child is quantified There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If a child is quantified at least once, then whether it can match only at the start depends on the child. This isn't making a determination about whether it's possible for it to match at all, just that if it can match, it only matches at the start. The key part is that a quantification that can be skipped doesn't actually impact whether or not the pattern can only match at the start, since we can skip an anchor inside the quantification (e.g. `/(^foo)?bar/`) or jump to one after it (e.g. `/(foo)?^bar/`).
|
||
|
||
// For conditional nodes, both sides must require matching at start. | ||
case .conditional(_, let child1, let child2): | ||
return child1._canOnlyMatchAtStartImpl(&options) == true | ||
&& child2._canOnlyMatchAtStartImpl(&options) == true | ||
|
||
// Extended behavior isn't known, so we return `false` for safety. | ||
case .consumer, .matcher, .characterPredicate, .absentFunction: | ||
return false | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the difference between returning false and nil in practice? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Returning There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Awesome, thanks for clarifying! Could you update the comments for why |
||
} | ||
} | ||
|
||
/// Returns a Boolean value indicating whether the regex with this node as | ||
/// the root can _only_ match at the start of a subject. | ||
/// | ||
/// For example, these regexes can only match at the start of a subject: | ||
/// | ||
/// - `/^foo/` | ||
/// - `/(^foo|^bar)/` (both sides of the alternation start with `^`) | ||
/// | ||
/// These can match other places in a subject: | ||
/// | ||
/// - `/(^foo)?bar/` (`^` is in an optional group) | ||
/// - `/(^foo|bar)/` (only one side of the alternation starts with `^`) | ||
/// - `/(?m)^foo/` (`^` means "the start of a line" due to `(?m)`) | ||
/// | ||
/// - Returns: `true` only when the traversal conclusively determines that | ||
///   a match must begin at the start of the subject; `false` when the node | ||
///   can match elsewhere or when the traversal is inconclusive. | ||
internal func canOnlyMatchAtStart() -> Bool { | ||
// Start from freshly initialized options. The implementation takes them | ||
// `inout` because option-changing nodes update the state seen by the | ||
// nodes visited after them. | ||
var options = MatchingOptions() | ||
// An inconclusive (`nil`) result is conservatively reported as `false`. | ||
return _canOnlyMatchAtStartImpl(&options) ?? false | ||
} | ||
} | ||
|
||
// MARK: AST wrapper types | ||
// | ||
// These wrapper types are required because even @_spi-marked public APIs can't | ||
|
@@ -818,6 +917,17 @@ extension DSLTree { | |
public static func range(_ lower: Int, _ upper: Int) -> Self { | ||
.init(ast: .range(.init(lower, at: .fake), .init(upper, at: .fake))) | ||
} | ||
|
||
/// A Boolean value indicating whether this quantification amount requires | ||
/// the quantified element to be matched at least once. | ||
/// | ||
/// `_canOnlyMatchAtStartImpl` uses this to decide whether a quantified | ||
/// child is required, or can be skipped and is therefore inconclusive. | ||
internal var requiresAtLeastOne: Bool { | ||
switch ast { | ||
// Amounts that do not require the quantified element to be present. | ||
case .zeroOrOne, .zeroOrMore, .upToN: | ||
return false | ||
case .oneOrMore: | ||
return true | ||
// For these amounts the first (lower-bound) number decides: the element | ||
// is required only when that number is greater than zero. | ||
case .exactly(let num), .nOrMore(let num), .range(let num, _): | ||
// `num.value` is optional; a missing value is treated as "not required". | ||
return num.value.map { $0 > 0 } ?? false | ||
} | ||
} | ||
} | ||
|
||
@_spi(RegexBuilder) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are we mutating options in a query API?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is how the options type is built — we can't really use the traversal stack because this kind of options change affects siblings, not just descendants.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, so that's the recursion base-case for a member of a concatenation which will affect following siblings. Could you mention that in a comment?