swiftlang · milseman · Aug 11, 2023 · May 22, 2023 · May 25, 2023 · May 26, 2023
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -68,14 +68,16 @@ extension Character {
     switch opts.semanticLevel {
     case .graphemeCluster:
       return { input, bounds in
-        let low = bounds.lowerBound
+        guard let (char, next) = input.characterAndEnd(
+          at: bounds.lowerBound, limitedBy: bounds.upperBound)
+        else { return nil }
         if isCaseInsensitive && isCased {
-          return input[low].lowercased() == lowercased()
-            ? input.index(after: low)
+          return char.lowercased() == self.lowercased()
+            ? next
             : nil
         } else {
-          return input[low] == self
-            ? input.index(after: low)
+          return char == self
+            ? next
             : nil
         }
       }
@@ -188,7 +190,7 @@ extension DSLTree.Atom.CharacterClass {
   func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction {
     let model = asRuntimeModel(opts)
     return { input, bounds in
-      model.matches(in: input, at: bounds.lowerBound)
+      model.matches(in: input, at: bounds.lowerBound, limitedBy: bounds.upperBound)
     }
   }
 }
@@ -517,7 +519,7 @@ extension DSLTree.CustomCharacterClass {
       }
       if isInverted {
         return opts.semanticLevel == .graphemeCluster
-          ? input.index(after: bounds.lowerBound)
+          ? Swift.min(input.index(after: bounds.lowerBound), bounds.upperBound)
           : input.unicodeScalars.index(after: bounds.lowerBound)
       }
       return nil

diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -15,9 +15,10 @@ extension Processor {
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> Bool {
-    guard let next = input.matchBuiltinCC(
+    guard currentPosition < end, let next = input.matchBuiltinCC(
       cc,
       at: currentPosition,
+      limitedBy: end,
       isInverted: isInverted,
       isStrictASCII: isStrictASCII,
       isScalarSemantics: isScalarSemantics
@@ -102,56 +103,96 @@ extension Processor {
 
     case .wordBoundary:
       if payload.usesSimpleUnicodeBoundaries {
-        // TODO: How should we handle bounds?
         return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel)
       } else {
-        return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
+        return input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex)
       }
 
     case .notWordBoundary:
       if payload.usesSimpleUnicodeBoundaries {
-        // TODO: How should we handle bounds?
         return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel)
       } else {
-        return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
+        return !input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex)
       }
     }
   }
 }
 
 // MARK: Matching `.`
 extension String {
-  // TODO: Should the below have a `limitedBy` parameter?
+  /// Returns the character at `pos`, bounded by `end`, as well as the upper
+  /// boundary of the returned character.
+  ///
+  /// This function handles loading a character from a string while respecting
+  /// an end boundary, even if that end boundary is sub-character or sub-scalar.
+  ///
+  ///   - If `pos` is at or past `end`, this function returns `nil`.
+  ///   - If `end` is between `pos` and the next grapheme cluster boundary (i.e.,
+  ///     `end` is before `self.index(after: pos)`, then the returned character
+  ///     is smaller than the one that would be produced by `self[pos]` and the
+  ///     returned index is at the end of that character.
+  ///   - If `end` is between `pos` and the next grapheme cluster boundary, and
+  ///     is not on a Unicode scalar boundary, the partial scalar is dropped. This
+  ///     can result in a `nil` return or a character that includes only part of
+  ///     the `self[pos]` character.
+  ///
+  /// - Parameters:
+  ///   - pos: The position to load a character from.
+  ///   - end: The limit for the character at `pos`.
+  /// - Returns: The character at `pos`, bounded by `end`, if it exists, along
+  ///   with the upper bound of that character. The upper bound is always
+  ///   scalar-aligned.
+  func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? {
+    // FIXME: Sink into the stdlib to avoid multiple boundary calculations
+    guard pos < end else { return nil }
+    let next = index(after: pos)
+    if next <= end {
+      return (self[pos], next)
+    }
 
+    // `end` must be a sub-character position that is between `pos` and the
+    // next grapheme boundary. This is okay if `end` is on a Unicode scalar
+    // boundary, but if it's in the middle of a scalar's code units, there
+    // may not be a character to return at all after rounding down. Use
+    // `Substring`'s rounding to determine what we can return.
+    let substr = self[pos..<end]
+    return substr.isEmpty
+      ? nil
+      : (substr.first!, substr.endIndex)
+  }
+
   func matchAnyNonNewline(
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    guard currentPosition < endIndex else {
-      return nil
-    }
+    guard currentPosition < end else { return nil }
     if case .definite(let result) = _quickMatchAnyNonNewline(
       at: currentPosition,
+      limitedBy: end,
       isScalarSemantics: isScalarSemantics
     ) {
       assert(result == _thoroughMatchAnyNonNewline(
         at: currentPosition,
+        limitedBy: end,
         isScalarSemantics: isScalarSemantics))
       return result
     }
     return _thoroughMatchAnyNonNewline(
       at: currentPosition,
+      limitedBy: end,
       isScalarSemantics: isScalarSemantics)
   }
 
   @inline(__always)
   private func _quickMatchAnyNonNewline(
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isScalarSemantics: Bool
   ) -> QuickResult<String.Index?> {
-    assert(currentPosition < endIndex)
+    assert(currentPosition < end)
     guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
-      at: currentPosition
+      at: currentPosition, limitedBy: end
     ) else {
       return .unknown
     }
@@ -167,46 +208,47 @@ extension String {
   @inline(never)
   private func _thoroughMatchAnyNonNewline(
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    assert(currentPosition < endIndex)
     if isScalarSemantics {
+      guard currentPosition < end else { return nil }
       let scalar = unicodeScalars[currentPosition]
       guard !scalar.isNewline else { return nil }
       return unicodeScalars.index(after: currentPosition)
     }
 
-    let char = self[currentPosition]
-    guard !char.isNewline else { return nil }
-    return index(after: currentPosition)
+    guard let (char, next) = characterAndEnd(at: currentPosition, limitedBy: end),
+          !char.isNewline
+    else { return nil }
+    return next
   }
 }
 
 // MARK: - Built-in character class matching
 extension String {
-  // TODO: Should the below have a `limitedBy` parameter?
-
   // Mentioned in ProgrammersManual.md, update docs if redesigned
   func matchBuiltinCC(
     _ cc: _CharacterClassModel.Representation,
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isInverted: Bool,
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    guard currentPosition < endIndex else {
-      return nil
-    }
+    guard currentPosition < end else { return nil }
     if case .definite(let result) = _quickMatchBuiltinCC(
       cc,
       at: currentPosition,
+      limitedBy: end,
       isInverted: isInverted,
       isStrictASCII: isStrictASCII,
       isScalarSemantics: isScalarSemantics
     ) {
       assert(result == _thoroughMatchBuiltinCC(
         cc,
         at: currentPosition,
+        limitedBy: end,
         isInverted: isInverted,
         isStrictASCII: isStrictASCII,
         isScalarSemantics: isScalarSemantics))
@@ -215,6 +257,7 @@ extension String {
     return _thoroughMatchBuiltinCC(
       cc,
       at: currentPosition,
+      limitedBy: end,
       isInverted: isInverted,
       isStrictASCII: isStrictASCII,
       isScalarSemantics: isScalarSemantics)
@@ -225,13 +268,17 @@ extension String {
   private func _quickMatchBuiltinCC(
     _ cc: _CharacterClassModel.Representation,
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isInverted: Bool,
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> QuickResult<String.Index?> {
-    assert(currentPosition < endIndex)
+    assert(currentPosition < end)
     guard let (next, result) = _quickMatch(
-      cc, at: currentPosition, isScalarSemantics: isScalarSemantics
+      cc,
+      at: currentPosition,
+      limitedBy: end,
+      isScalarSemantics: isScalarSemantics
     ) else {
       return .unknown
     }
@@ -243,27 +290,25 @@ extension String {
   private func _thoroughMatchBuiltinCC(
     _ cc: _CharacterClassModel.Representation,
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isInverted: Bool,
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    assert(currentPosition < endIndex)
-    let char = self[currentPosition]
+    // TODO: Branch here on scalar semantics
+    // Don't want to pay character cost if unnecessary
+    guard var (char, next) =
+            characterAndEnd(at: currentPosition, limitedBy: end)
+    else { return nil }
     let scalar = unicodeScalars[currentPosition]
 
     let asciiCheck = !isStrictASCII
     || (scalar.isASCII && isScalarSemantics)
     || char.isASCII
 
     var matched: Bool
-    var next: String.Index
-    switch (isScalarSemantics, cc) {
-    case (_, .anyGrapheme):
-      next = index(after: currentPosition)
-    case (true, _):
+    if isScalarSemantics && cc != .anyGrapheme {
       next = unicodeScalars.index(after: currentPosition)
-    case (false, _):
-      next = index(after: currentPosition)
     }
 
     switch cc {
@@ -291,7 +336,7 @@ extension String {
       if isScalarSemantics {
         matched = scalar.isNewline && asciiCheck
         if matched && scalar == "\r"
-            && next != endIndex && unicodeScalars[next] == "\n" {
+            && next < end && unicodeScalars[next] == "\n" {
           // Match a full CR-LF sequence even in scalar semantics
           unicodeScalars.formIndex(after: &next)
         }

diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift
@@ -17,18 +17,18 @@ extension Processor {
         boundaryCheck: !isScalarSemantics,
         isCaseInsensitive: false)
     case .builtin:
-      // FIXME: bounds check? endIndex or end?
+      guard currentPosition < end else { return nil }
 
       // We only emit .quantify if it consumes a single character
       return input.matchBuiltinCC(
         payload.builtin,
         at: currentPosition,
+        limitedBy: end,
         isInverted: payload.builtinIsInverted,
         isStrictASCII: payload.builtinIsStrict,
         isScalarSemantics: isScalarSemantics)
     case .any:
-      // FIXME: endIndex or end?
-      guard currentPosition < input.endIndex else { return nil }
+      guard currentPosition < end else { return nil }
 
       if payload.anyMatchesNewline {
         if isScalarSemantics {
@@ -38,7 +38,9 @@ extension Processor {
       }
 
       return input.matchAnyNonNewline(
-        at: currentPosition, isScalarSemantics: isScalarSemantics)
+        at: currentPosition,
+        limitedBy: end,
+        isScalarSemantics: isScalarSemantics)
     }
   }