swiftlang · rctcwyvrn · Aug 3, 2022 · Jul 5, 2022 · Jul 5, 2022 · Jul 5, 2022
diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift
@@ -15,27 +15,39 @@
 @available(SwiftStdlib 5.7, *)
 public struct CharacterClass {
   internal var ccc: DSLTree.CustomCharacterClass
+  /// The built-in character class equivalent to this `CharacterClass`, or `nil` if none is recorded.
+  internal var builtin: DSLTree.Atom.CharacterClass?
 
   init(_ ccc: DSLTree.CustomCharacterClass) {
     self.ccc = ccc
+    self.builtin = nil // created from a custom class only, so no built-in form is recorded
   }
 
-  init(unconverted atom: DSLTree._AST.Atom) {
-    self.ccc = .init(members: [.atom(.unconverted(atom))])
+  init(builtin: DSLTree.Atom.CharacterClass) {
+    self.ccc = .init(members: [.atom(.characterClass(builtin))])
+    self.builtin = builtin // `ccc` wraps the same class as its only member
   }
 }
 
 @available(SwiftStdlib 5.7, *)
 extension CharacterClass: RegexComponent {
   public var regex: Regex<Substring> {
-    _RegexFactory().customCharacterClass(ccc)
+    if let cc = builtin { // use the built-in form whenever one is recorded
+      return _RegexFactory().characterClass(cc)
+    } else {
+      return _RegexFactory().customCharacterClass(ccc) // otherwise build from the custom class
+    }
   }
 }
 
 @available(SwiftStdlib 5.7, *)
 extension CharacterClass {
   public var inverted: CharacterClass {
-    CharacterClass(ccc.inverted)
+    if let inv = builtin?.inverted { // keeps the result representable as a built-in class
+      return CharacterClass(builtin: inv)
+    } else {
+      return CharacterClass(ccc.inverted) // no built-in inverse available: invert the custom class
+    }
   }
 }
 
@@ -50,15 +62,15 @@ extension RegexComponent where Self == CharacterClass {
   }
 
   public static var anyGraphemeCluster: CharacterClass {
-    .init(unconverted: ._anyGrapheme)
+    .init(builtin: .anyGrapheme) // built-in class; also recorded in `builtin`
   }
 
   public static var whitespace: CharacterClass {
-    .init(unconverted: ._whitespace)
+    .init(builtin: .whitespace) // built-in class; also recorded in `builtin`
   }
 
   public static var digit: CharacterClass {
-    .init(unconverted: ._digit)
+    .init(builtin: .digit) // built-in class; also recorded in `builtin`
   }
 
   public static var hexDigit: CharacterClass {
@@ -70,19 +82,19 @@ extension RegexComponent where Self == CharacterClass {
   }
 
   public static var horizontalWhitespace: CharacterClass {
-    .init(unconverted: ._horizontalWhitespace)
+    .init(builtin: .horizontalWhitespace) // built-in class; also recorded in `builtin`
   }
 
   public static var newlineSequence: CharacterClass {
-    .init(unconverted: ._newlineSequence)
+    .init(builtin: .newlineSequence) // built-in class; also recorded in `builtin`
   }
 
   public static var verticalWhitespace: CharacterClass {
-    .init(unconverted: ._verticalWhitespace)
+    .init(builtin: .verticalWhitespace) // built-in class; also recorded in `builtin`
   }
 
   public static var word: CharacterClass {
-    .init(unconverted: ._word)
+    .init(builtin: .word) // built-in class; also recorded in `builtin`
   }
 }
 

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -74,6 +74,9 @@ fileprivate extension Compiler.ByteCodeGen {
         emitMatchScalar(s)
       }
 
+    case let .characterClass(cc):
+      emitCharacterClass(cc)
+
     case let .assertion(kind):
       try emitAssertion(kind)
 
@@ -148,147 +151,24 @@ fileprivate extension Compiler.ByteCodeGen {
     }
   }
 
-  mutating func emitStartOfLine() {
-    builder.buildAssert { [semanticLevel = options.semanticLevel]
-        (_, _, input, pos, subjectBounds) in
-      if pos == subjectBounds.lowerBound { return true }
-      switch semanticLevel {
-      case .graphemeCluster:
-        return input[input.index(before: pos)].isNewline
-      case .unicodeScalar:
-        return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline
-      }
-    }
-  }
-
-  mutating func emitEndOfLine() {
-    builder.buildAssert { [semanticLevel = options.semanticLevel]
-      (_, _, input, pos, subjectBounds) in
-      if pos == subjectBounds.upperBound { return true }
-      switch semanticLevel {
-      case .graphemeCluster:
-        return input[pos].isNewline
-      case .unicodeScalar:
-        return input.unicodeScalars[pos].isNewline
-      }
-    }
-  }
-
   mutating func emitAssertion(
     _ kind: DSLTree.Atom.Assertion
   ) throws {
-    // FIXME: Depends on API model we have... We may want to
-    // think through some of these with API interactions in mind
-    //
-    // This might break how we use `bounds` for both slicing
-    // and things like `firstIndex`, that is `firstIndex` may
-    // need to supply both a slice bounds and a per-search bounds.
-    switch kind {
-    case .startOfSubject:
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in
-        pos == subjectBounds.lowerBound
-      }
-
-    case .endOfSubjectBeforeNewline:
-      builder.buildAssert { [semanticLevel = options.semanticLevel]
-          (_, _, input, pos, subjectBounds) in
-        if pos == subjectBounds.upperBound { return true }
-        switch semanticLevel {
-        case .graphemeCluster:
-          return input.index(after: pos) == subjectBounds.upperBound
-           && input[pos].isNewline
-        case .unicodeScalar:
-          return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound
-           && input.unicodeScalars[pos].isNewline
-        }
-      }
-
-    case .endOfSubject:
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in
-        pos == subjectBounds.upperBound
-      }
-
-    case .resetStartOfMatch:
-      // FIXME: Figure out how to communicate this out
+    if kind == .resetStartOfMatch { // the only assertion kind rejected here
       throw Unsupported(#"\K (reset/keep assertion)"#)
-
-    case .firstMatchingPositionInSubject:
-      // TODO: We can probably build a nice model with API here
-
-      // FIXME: This needs to be based on `searchBounds`,
-      // not the `subjectBounds` given as an argument here
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in false }
-
-    case .textSegment:
-      builder.buildAssert { (_, _, input, pos, _) in
-        // FIXME: Grapheme or word based on options
-        input.isOnGraphemeClusterBoundary(pos)
-      }
-
-    case .notTextSegment:
-      builder.buildAssert { (_, _, input, pos, _) in
-        // FIXME: Grapheme or word based on options
-        !input.isOnGraphemeClusterBoundary(pos)
-      }
-
-    case .startOfLine:
-      emitStartOfLine()
-
-    case .endOfLine:
-      emitEndOfLine()
-
-    case .caretAnchor:
-      if options.anchorsMatchNewlines {
-        emitStartOfLine()
-      } else {
-        builder.buildAssert { (_, _, input, pos, subjectBounds) in
-          pos == subjectBounds.lowerBound
-        }
-      }
-
-    case .dollarAnchor:
-      if options.anchorsMatchNewlines {
-        emitEndOfLine()
-      } else {
-        builder.buildAssert { (_, _, input, pos, subjectBounds) in
-          pos == subjectBounds.upperBound
-        }
-      }
-
-    case .wordBoundary:
-      builder.buildAssert { [options]
-          (cache, maxIndex, input, pos, subjectBounds) in
-        if options.usesSimpleUnicodeBoundaries {
-          // TODO: How should we handle bounds?
-          return _CharacterClassModel.word.isBoundary(
-            input,
-            at: pos,
-            bounds: subjectBounds,
-            with: options
-          )
-        } else {
-          return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
-        }
-      }
-
-    case .notWordBoundary:
-      builder.buildAssert { [options]
-          (cache, maxIndex, input, pos, subjectBounds) in
-        if options.usesSimpleUnicodeBoundaries {
-          // TODO: How should we handle bounds?
-          return !_CharacterClassModel.word.isBoundary(
-            input,
-            at: pos,
-            bounds: subjectBounds,
-            with: options
-          )
-        } else {
-          return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
-        }
-      }
     }
+    builder.buildAssert( // every other kind is handed to the builder with these option values
+      by: kind,
+      options.anchorsMatchNewlines,
+      options.usesSimpleUnicodeBoundaries,
+      options.usesASCIIWord,
+      options.semanticLevel)
   }
-
+
+  mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) {
+    builder.buildMatchBuiltin(model: cc.asRuntimeModel(options)) // convert to a runtime model using the current options
+  }
+
   mutating func emitMatchScalar(_ s: UnicodeScalar) {
     assert(options.semanticLevel == .unicodeScalar)
     if options.isCaseInsensitive && s.properties.isCased {
@@ -907,10 +787,10 @@ fileprivate extension Compiler.ByteCodeGen {
       } else {
         builder.buildMatchAsciiBitset(asciiBitset)
       }
-    } else {
-      let consumer = try ccc.generateConsumer(options)
-      builder.buildConsume(by: consumer)
+      return
     }
+    let consumer = try ccc.generateConsumer(options)
+    builder.buildConsume(by: consumer)
   }
 
   mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -162,6 +162,8 @@ extension DSLTree.Atom {
     case .assertion:
       // TODO: We could handle, should this be total?
       return nil
+    case .characterClass(let cc):
+      return cc.generateConsumer(opts)
 
     case .backreference:
       // TODO: Should we handle?
@@ -182,6 +184,15 @@ extension DSLTree.Atom {
   }
 }
 
+extension DSLTree.Atom.CharacterClass {
+  func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction {
+    let model = asRuntimeModel(opts) // computed once here and captured by the closure below
+    return { input, bounds in
+      model.matches(in: input, at: bounds.lowerBound) // only the lower bound of `bounds` is consulted
+    }
+  }
+}
+
 extension String {
   /// Compares this string to `other` using the loose matching rule UAX44-LM2,
   /// which ignores case, whitespace, underscores, and nearly all medial
@@ -269,16 +280,6 @@ extension AST.Atom {
   func generateConsumer(
     _ opts: MatchingOptions
   ) throws -> MEProgram.ConsumeFunction? {
-    // TODO: Wean ourselves off of this type...
-    if let cc = self.characterClass?.withMatchLevel(
-      opts.matchLevel
-    ) {
-      return { input, bounds in
-        // FIXME: should we worry about out of bounds?
-        cc.matches(in: input, at: bounds.lowerBound, with: opts)
-      }
-    }
-
     switch kind {
     case let .scalar(s):
       assertionFailure(
@@ -312,8 +313,11 @@ extension AST.Atom {
     case .caretAnchor, .dollarAnchor:
       // handled in emitAssertion
       return nil
+    case .escaped:
+      // handled in emitAssertion and emitCharacterClass
+      return nil
 
-    case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta,
+    case .scalarSequence, .keyboardControl, .keyboardMeta,
         .keyboardMetaControl, .backreference, .subpattern, .callout,
         .backtrackingDirective, .changeMatchingOptions, .invalid:
       // FIXME: implement