swiftlang · natecook1000 · Jun 22, 2022 · Jun 17, 2022 · Jun 17, 2022 · Jun 17, 2022
diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift
@@ -74,6 +74,6 @@ extension BidirectionalCollection where SubSequence == Substring {
   @_disfavoredOverload
   @available(SwiftStdlib 5.7, *)
   public func contains(_ regex: some RegexComponent) -> Bool {
-    _contains(RegexConsumer(regex))
+    (try? regex.regex.firstMatch(in: self[...])) != nil
   }
 }
diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift
@@ -229,9 +229,18 @@ extension BidirectionalCollection where Element: Comparable {
 @available(SwiftStdlib 5.7, *)
 struct RegexRangesCollection<Output> {
   let base: RegexMatchesCollection<Output>
-
-  init(string: Substring, regex: Regex<Output>) {
-    self.base = RegexMatchesCollection(base: string, regex: regex)
+
+  init(
+    input: String,
+    subjectBounds: Range<String.Index>,
+    searchBounds: Range<String.Index>,
+    regex: Regex<Output>
+  ) {
+    self.base = .init(
+      input: input,
+      subjectBounds: subjectBounds,
+      searchBounds: searchBounds,
+      regex: regex)
   }
 }
 
@@ -263,12 +272,29 @@ extension RegexRangesCollection: Collection {
 // MARK: Regex algorithms
 
 extension Collection where SubSequence == Substring {
+  @available(SwiftStdlib 5.7, *)
+  @_disfavoredOverload
+  func _ranges<R: RegexComponent>(
+    of regex: R,
+    subjectBounds: Range<String.Index>,
+    searchBounds: Range<String.Index>
+  ) -> RegexRangesCollection<R.RegexOutput> {
+    RegexRangesCollection(
+      input: self[...].base,
+      subjectBounds: subjectBounds,
+      searchBounds: searchBounds,
+      regex: regex.regex)
+  }
+
   @available(SwiftStdlib 5.7, *)
   @_disfavoredOverload
   func _ranges<R: RegexComponent>(
     of regex: R
   ) -> RegexRangesCollection<R.RegexOutput> {
-    RegexRangesCollection(string: self[...], regex: regex.regex)
+    _ranges(
+      of: regex,
+      subjectBounds: startIndex..<endIndex,
+      searchBounds: startIndex..<endIndex)
   }
 }
 

diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift
@@ -179,7 +179,10 @@ extension RangeReplaceableCollection where SubSequence == Substring {
     maxReplacements: Int = .max
   ) -> Self where Replacement.Element == Element {
     _replacing(
-      self[subrange]._ranges(of: regex),
+      self._ranges(
+        of: regex,
+        subjectBounds: startIndex..<endIndex,
+        searchBounds: subrange),
       with: replacement,
       maxReplacements: maxReplacements)
   }

diff --git a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift
@@ -185,14 +185,26 @@ extension BidirectionalCollection {
 
 @available(SwiftStdlib 5.7, *)
 struct RegexMatchesCollection<Output> {
-  let input: Substring
+  let input: String
+  let subjectBounds: Range<String.Index>
+  let searchBounds: Range<String.Index>
   let regex: Regex<Output>
   let startIndex: Index
 
-  init(base: Substring, regex: Regex<Output>) {
-    self.input = base
+  init(
+    input: String,
+    subjectBounds: Range<String.Index>,
+    searchBounds: Range<String.Index>,
+    regex: Regex<Output>
+  ) {
+    self.input = input
+    self.subjectBounds = subjectBounds
+    self.searchBounds = searchBounds
     self.regex = regex
-    self.startIndex = base.firstMatch(of: regex).map(Index.match) ?? .end
+    self.startIndex = (try? regex._firstMatch(
+      input,
+      subjectBounds: subjectBounds,
+      searchBounds: searchBounds)).map(Index.match) ?? .end
   }
 }
 
@@ -241,12 +253,15 @@ extension RegexMatchesCollection: Sequence {
       }
 
       // `nextStart` is `nil` when iteration has completed
-      guard let start = nextStart else {
+      guard let start = nextStart, start <= base.searchBounds.upperBound else {
         return nil
       }
 
       // Otherwise, find the next match (if any) and compute `nextStart`
-      let match = try? base.regex.firstMatch(in: base.input[start...])
+      let match = try? base.regex._firstMatch(
+        base.input,
+        subjectBounds: base.subjectBounds,
+        searchBounds: start..<base.searchBounds.upperBound)
       nextStart = match.flatMap(base.searchIndex(after:))
       return match
     }
@@ -310,7 +325,11 @@ extension RegexMatchesCollection: Collection {
 
     guard
       let start = searchIndex(after: currentMatch),
-      let nextMatch = try? regex.firstMatch(in: input[start...])
+      start <= searchBounds.upperBound,
+      let nextMatch = try? regex._firstMatch(
+        input,
+        subjectBounds: subjectBounds,
+        searchBounds: start..<searchBounds.upperBound)
     else {
       return .end
     }
@@ -331,7 +350,11 @@ extension BidirectionalCollection where SubSequence == Substring {
   func _matches<R: RegexComponent>(
     of regex: R
   ) -> RegexMatchesCollection<R.RegexOutput> {
-    RegexMatchesCollection(base: self[...], regex: regex.regex)
+    RegexMatchesCollection(
+      input: self[...].base,
+      subjectBounds: startIndex..<endIndex,
+      searchBounds: startIndex..<endIndex,
+      regex: regex.regex)
   }
 
   @available(SwiftStdlib 5.7, *)

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -96,26 +96,26 @@ fileprivate extension Compiler.ByteCodeGen {
     // need to supply both a slice bounds and a per-search bounds.
     switch kind {
     case .startOfSubject:
-      builder.buildAssert { (input, pos, bounds) in
-        pos == input.startIndex
+      builder.buildAssert { (input, pos, subjectBounds) in
+        pos == subjectBounds.lowerBound
       }
 
     case .endOfSubjectBeforeNewline:
-      builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
-        if pos == input.endIndex { return true }
+      builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
+        if pos == subjectBounds.upperBound { return true }
         switch semanticLevel {
         case .graphemeCluster:
-          return input.index(after: pos) == input.endIndex
+          return input.index(after: pos) == subjectBounds.upperBound
            && input[pos].isNewline
         case .unicodeScalar:
-          return input.unicodeScalars.index(after: pos) == input.endIndex
+          return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound
            && input.unicodeScalars[pos].isNewline
         }
       }
 
     case .endOfSubject:
-      builder.buildAssert { (input, pos, bounds) in
-        pos == input.endIndex
+      builder.buildAssert { (input, pos, subjectBounds) in
+        pos == subjectBounds.upperBound
       }
 
     case .resetStartOfMatch:
@@ -124,9 +124,10 @@ fileprivate extension Compiler.ByteCodeGen {
 
     case .firstMatchingPositionInSubject:
       // TODO: We can probably build a nice model with API here
-      builder.buildAssert { (input, pos, bounds) in
-        pos == bounds.lowerBound
-      }
+
+      // FIXME: This needs to be based on `searchBounds`,
+      // not the `subjectBounds` given as an argument here
+      builder.buildAssert { (input, pos, subjectBounds) in false }
 
     case .textSegment:
       builder.buildAssert { (input, pos, _) in
@@ -141,9 +142,13 @@ fileprivate extension Compiler.ByteCodeGen {
       }
 
     case .startOfLine:
+      // FIXME: `Anchor.startOfLine` must always use this first branch.
+      // The behavior of `^` should depend on `anchorsMatchNewlines`, but
+      // the DSL-based `.startOfLine` anchor should always match the start
+      // of a line. Right now we don't distinguish between those anchors.
       if options.anchorsMatchNewlines {
-        builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
-          if pos == input.startIndex { return true }
+        builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
+          if pos == subjectBounds.lowerBound { return true }
           switch semanticLevel {
           case .graphemeCluster:
             return input[input.index(before: pos)].isNewline
@@ -152,15 +157,19 @@ fileprivate extension Compiler.ByteCodeGen {
           }
         }
       } else {
-        builder.buildAssert { (input, pos, bounds) in
-          pos == input.startIndex
+        builder.buildAssert { (input, pos, subjectBounds) in
+          pos == subjectBounds.lowerBound
         }
       }
 
     case .endOfLine:
+      // FIXME: `Anchor.endOfLine` must always use this first branch.
+      // The behavior of `$` should depend on `anchorsMatchNewlines`, but
+      // the DSL-based `.endOfLine` anchor should always match the end
+      // of a line. Right now we don't distinguish between those anchors.
       if options.anchorsMatchNewlines {
-        builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
-          if pos == input.endIndex { return true }
+        builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
+          if pos == subjectBounds.upperBound { return true }
           switch semanticLevel {
           case .graphemeCluster:
             return input[pos].isNewline
@@ -169,25 +178,25 @@ fileprivate extension Compiler.ByteCodeGen {
           }
         }
       } else {
-        builder.buildAssert { (input, pos, bounds) in
-          pos == input.endIndex
+        builder.buildAssert { (input, pos, subjectBounds) in
+          pos == subjectBounds.upperBound
         }
       }
 
     case .wordBoundary:
       // TODO: May want to consider Unicode level
-      builder.buildAssert { [options] (input, pos, bounds) in
+      builder.buildAssert { [options] (input, pos, subjectBounds) in
         // TODO: How should we handle bounds?
         _CharacterClassModel.word.isBoundary(
-          input, at: pos, bounds: bounds, with: options)
+          input, at: pos, bounds: subjectBounds, with: options)
       }
 
     case .notWordBoundary:
       // TODO: May want to consider Unicode level
-      builder.buildAssert { [options] (input, pos, bounds) in
+      builder.buildAssert { [options] (input, pos, subjectBounds) in
         // TODO: How should we handle bounds?
         !_CharacterClassModel.word.isBoundary(
-          input, at: pos, bounds: bounds, with: options)
+          input, at: pos, bounds: subjectBounds, with: options)
       }
     }
   }

diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift
@@ -18,10 +18,25 @@ extension Engine {
     Processor(
       program: program,
       input: input,
-      bounds: bounds,
+      subjectBounds: bounds,
+      searchBounds: bounds,
       matchMode: matchMode,
       isTracingEnabled: enableTracing)
   }
+
+  func makeFirstMatchProcessor(
+    input: String,
+    subjectBounds: Range<String.Index>,
+    searchBounds: Range<String.Index>
+  ) -> Processor {
+    Processor(
+      program: program,
+      input: input,
+      subjectBounds: subjectBounds,
+      searchBounds: searchBounds,
+      matchMode: .partialFromFront,
+      isTracingEnabled: enableTracing)
+  }
 }
 
 extension Processor {