Skip to content

Commit c2b2e36

Browse files
committed
Optimize search for start-anchored regexes (swiftlang#682)
When a regex is anchored to the start of a subject, there's no need to scan the whole string when looking for the first match: a prefix match is sufficient. This adds a check at regex compilation time for whether a match can only be found at the start of a subject, and then uses the result to choose whether `firstMatch` defers to `prefixMatch`.
1 parent 2aababb commit c2b2e36

File tree

8 files changed

+209
-4
lines changed

8 files changed

+209
-4
lines changed

Sources/RegexBenchmark/Suite/NotFound.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ extension BenchmarkRunner {
1313
baseName: "AnchoredNotFound",
1414
regex: "^ +a",
1515
input: input,
16-
isWhole: true)
16+
includeFirst: true)
1717
anchoredNotFound.register(&self)
1818
}
1919
}

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ extension Compiler.ByteCodeGen {
4646
// The whole match (`.0` element of output) is equivalent to an implicit
4747
// capture over the entire regex.
4848
try emitNode(.capture(name: nil, reference: nil, root))
49+
builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart()
4950
builder.buildAccept()
5051
return try builder.assemble()
5152
}

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ extension MEProgram {
4343
var captureList = CaptureList()
4444
var initialOptions = MatchingOptions()
4545

46+
// Starting constraint
47+
var canOnlyMatchAtStart = false
48+
4649
// Symbolic reference resolution
4750
var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:]
4851
var referencedCaptureOffsets: [ReferenceID: Int] = [:]
@@ -404,7 +407,8 @@ extension MEProgram.Builder {
404407
enableMetrics: enableMetrics,
405408
captureList: captureList,
406409
referencedCaptureOffsets: referencedCaptureOffsets,
407-
initialOptions: initialOptions)
410+
initialOptions: initialOptions,
411+
canOnlyMatchAtStart: canOnlyMatchAtStart)
408412
}
409413

410414
mutating func reset() { self = Self() }

Sources/_StringProcessing/Engine/MEProgram.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ struct MEProgram {
3838
let referencedCaptureOffsets: [ReferenceID: Int]
3939

4040
var initialOptions: MatchingOptions
41+
var canOnlyMatchAtStart: Bool
4142
}
4243

4344
extension MEProgram: CustomStringConvertible {

Sources/_StringProcessing/Regex/DSLTree.swift

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,105 @@ extension DSLTree.Node {
711711
}
712712
}
713713

714+
extension DSLTree.Node {
  /// Implementation for `canOnlyMatchAtStart`, which maintains the option
  /// state.
  ///
  /// For a given specific node, this method can return one of three values:
  ///
  /// - `true`: This node is guaranteed to match only at the start of a subject.
  /// - `false`: This node can match anywhere in the subject.
  /// - `nil`: This node is inconclusive about where it can match.
  ///
  /// In particular, non-required groups, option-setting atoms, and negative
  /// lookarounds are inconclusive about where they can match.
  ///
  /// - Parameter options: The matching options in effect when this node is
  ///   reached. Option-changing atoms and groups update it so that later `^`
  ///   anchors are interpreted with the correct `anchorsMatchNewlines` value.
  /// - Returns: `true`, `false`, or `nil` as described above.
  private func _canOnlyMatchAtStartImpl(_ options: inout MatchingOptions) -> Bool? {
    switch self {
    // Defining cases
    case .atom(.assertion(.startOfSubject)):
      return true
    case .atom(.assertion(.caretAnchor)):
      // `^` only pins the match to the subject start when it does not also
      // match after newlines.
      return !options.anchorsMatchNewlines

    // Changing options doesn't determine `true`/`false`.
    case .atom(.changeMatchingOptions(let sequence)):
      options.apply(sequence.ast)
      return nil

    // Any other atom or consuming node returns `false`.
    case .atom, .customCharacterClass, .quotedLiteral:
      return false

    // Trivia/empty have no effect.
    case .trivia, .empty:
      return nil

    // In an alternation, all of its children must match only at start.
    case .orderedChoice(let children):
      return children.allSatisfy { $0._canOnlyMatchAtStartImpl(&options) == true }

    // In a concatenation, the first definitive child provides the answer.
    case .concatenation(let children):
      for child in children {
        if let result = child._canOnlyMatchAtStartImpl(&options) {
          return result
        }
      }
      return false

    // Groups (and other parent nodes) defer to the child.
    case .nonCapturingGroup(let kind, let child):
      // A negative lookaround succeeds exactly when its child *fails* to
      // match, so a start anchor inside it proves nothing about where the
      // regex can match (e.g. `/(?!^)foo/` can never match at the start).
      // Treat it as inconclusive so that the following sibling decides.
      if case .negativeLookahead = kind.ast { return nil }
      if case .negativeLookbehind = kind.ast { return nil }

      options.beginScope()
      defer { options.endScope() }
      if case .changeMatchingOptions(let sequence) = kind.ast {
        options.apply(sequence)
      }
      return child._canOnlyMatchAtStartImpl(&options)
    case .capture(_, _, let child, _):
      options.beginScope()
      defer { options.endScope() }
      return child._canOnlyMatchAtStartImpl(&options)
    case .ignoreCapturesInTypedOutput(let child),
        .convertedRegexLiteral(let child, _):
      return child._canOnlyMatchAtStartImpl(&options)

    // A quantification that doesn't require its child to exist can still
    // allow a start-only match. (e.g. `/(foo)?^bar/`)
    case .quantification(let amount, _, let child):
      return amount.requiresAtLeastOne
        ? child._canOnlyMatchAtStartImpl(&options)
        : nil

    // For conditional nodes, both sides must require matching at start.
    case .conditional(_, let child1, let child2):
      return child1._canOnlyMatchAtStartImpl(&options) == true
        && child2._canOnlyMatchAtStartImpl(&options) == true

    // Extended behavior isn't known, so we return `false` for safety.
    case .consumer, .matcher, .characterPredicate, .absentFunction:
      return false
    }
  }

  /// Returns a Boolean value indicating whether the regex with this node as
  /// the root can _only_ match at the start of a subject.
  ///
  /// For example, these regexes can only match at the start of a subject:
  ///
  /// - `/^foo/`
  /// - `/(^foo|^bar)/` (both sides of the alternation start with `^`)
  ///
  /// These can match other places in a subject:
  ///
  /// - `/(^foo)?bar/` (`^` is in an optional group)
  /// - `/(^foo|bar)/` (only one side of the alternation starts with `^`)
  /// - `/(?m)^foo/` (`^` means "the start of a line" due to `(?m)`)
  /// - `/(?!^)foo/` (`^` is negated by the lookahead)
  ///
  /// An inconclusive analysis is reported as `false`.
  internal func canOnlyMatchAtStart() -> Bool {
    var options = MatchingOptions()
    return _canOnlyMatchAtStartImpl(&options) ?? false
  }
}
812+
714813
// MARK: AST wrapper types
715814
//
716815
// These wrapper types are required because even @_spi-marked public APIs can't
@@ -818,6 +917,17 @@ extension DSLTree {
818917
public static func range(_ lower: Int, _ upper: Int) -> Self {
819918
.init(ast: .range(.init(lower, at: .fake), .init(upper, at: .fake)))
820919
}
920+
921+
/// Whether this quantification amount demands at least one repetition of
/// the quantified node.
///
/// Amounts with an explicit lower bound report `true` only when that bound
/// has a value greater than zero; a bound without a value reports `false`.
internal var requiresAtLeastOne: Bool {
  switch ast {
  case .oneOrMore:
    return true
  case .zeroOrOne, .zeroOrMore, .upToN:
    return false
  case .exactly(let lowerBound),
      .nOrMore(let lowerBound),
      .range(let lowerBound, _):
    // The bound's `value` is optional; a missing value is treated as
    // "not required".
    guard let minimum = lowerBound.value else { return false }
    return minimum > 0
  }
}
821931
}
822932

823933
@_spi(RegexBuilder)

Sources/_StringProcessing/Regex/Match.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,9 @@ extension Regex {
273273
_ input: String,
274274
in subjectBounds: Range<String.Index>
275275
) throws -> Regex<Output>.Match? {
276-
try _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds)
276+
try regex.program.loweredProgram.canOnlyMatchAtStart
277+
? _match(input, in: subjectBounds, mode: .partialFromFront)
278+
: _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds)
277279
}
278280

279281
func _firstMatch(

Tests/RegexBuilderTests/RegexDSLTests.swift

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
//===----------------------------------------------------------------------===//
1111

1212
import XCTest
13-
import _StringProcessing
13+
@testable import _StringProcessing
1414
import RegexBuilder
1515
import TestSupport
1616

@@ -973,6 +973,55 @@ class RegexDSLTests: XCTestCase {
973973
}
974974
}
975975

976+
func testCanOnlyMatchAtStart() throws {
  /// Builds a regex from `content` and checks that the lowered program's
  /// `canOnlyMatchAtStart` flag equals `expected`.
  func assertStartOnly(
    _ expected: Bool,
    file: StaticString = #file, line: UInt = #line,
    @RegexComponentBuilder _ content: () -> some RegexComponent
  ) {
    let program = content().regex.program.loweredProgram
    XCTAssertEqual(
      program.canOnlyMatchAtStart, expected,
      file: file, line: line)
  }

  // A leading start-of-subject anchor pins the match to the start.
  assertStartOnly(true) {
    Anchor.startOfSubject
    "foo"
  }
  // No anchor at all.
  assertStartOnly(false) {
    "foo"
  }
  // An optional component ahead of the anchor doesn't change the answer.
  assertStartOnly(true) {
    Optionally { "foo" }
    Anchor.startOfSubject
    "bar"
  }

  // Every alternative is anchored to the start of the subject.
  assertStartOnly(true) {
    ChoiceOf {
      Regex {
        Anchor.startOfSubject
        "foo"
      }
      Regex {
        Anchor.startOfSubject
        "bar"
      }
    }
  }
  // One alternative uses a start-of-line anchor instead.
  assertStartOnly(false) {
    ChoiceOf {
      Regex {
        Anchor.startOfSubject
        "foo"
      }
      Regex {
        Anchor.startOfLine
        "bar"
      }
    }
  }
}
1024+
9761025
func testNestedGroups() throws {
9771026
return;
9781027

Tests/RegexTests/CompileTests.swift

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,4 +484,42 @@ extension RegexTests {
484484
expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
485485
expectProgram(for: #"(a{1,})*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
486486
}
487+
488+
func testCanOnlyMatchAtStart() throws {
  // Each entry records its own source line so that a failure is reported
  // against the individual pattern rather than the loop below.
  let cases: [(pattern: String, expected: Bool, line: UInt)] = [
    ("^foo", true, #line),           // anchor
    ("\\Afoo", true, #line),         // more specific anchor
    ("foo", false, #line),           // no anchor

    ("(?i)^foo", true, #line),       // unrelated option
    ("(?m)^foo", false, #line),      // anchors match newlines
    ("(?i:^foo)", true, #line),      // unrelated option
    ("(?m:^foo)", false, #line),     // anchors match newlines

    ("(^foo|bar)", false, #line),    // one side of alternation
    ("(foo|^bar)", false, #line),    // other side of alternation
    ("(^foo|^bar)", true, #line),    // both sides of alternation

    // Quantifiers that include the anchor
    ("(^foo)?bar", false, #line),
    ("(^foo)*bar", false, #line),
    ("(^foo)+bar", true, #line),
    ("(?:^foo)+bar", true, #line),

    // Quantifiers before the anchor
    ("(foo)?^bar", true, #line),     // The initial group must match ""
    ("(?:foo)?^bar", true, #line),
    ("(foo)+^bar", false, #line),    // This can't actually match anywhere
  ]

  for testCase in cases {
    // Compile the pattern and inspect the flag on the lowered program.
    let regex = try Regex(testCase.pattern)
    XCTAssertEqual(
      regex.program.loweredProgram.canOnlyMatchAtStart, testCase.expected,
      file: #file, line: testCase.line)
  }
}
487525
}

0 commit comments

Comments
 (0)