swiftlang · rctcwyvrn · Aug 3, 2022 · Jul 5, 2022 · Jul 5, 2022 · Jul 5, 2022
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -80,6 +80,14 @@ fileprivate extension Compiler.ByteCodeGen {
       options.apply(optionSequence.ast)
 
     case let .unconverted(astAtom):
+      if optimizationsEnabled,
+         let cc = astAtom.ast.characterClass?.builtinCC {
+        builder.buildMatchBuiltin(
+          cc,
+          cc.isStrict(options: options),
+          isScalar: options.semanticLevel == .unicodeScalar)
+        return
+      }
       if let consumer = try astAtom.ast.generateConsumer(options) {
         builder.buildConsume(by: consumer)
       } else {
@@ -113,136 +121,12 @@ fileprivate extension Compiler.ByteCodeGen {
   mutating func emitAssertion(
     _ kind: AST.Atom.AssertionKind
   ) throws {
-    // FIXME: Depends on API model we have... We may want to
-    // think through some of these with API interactions in mind
-    //
-    // This might break how we use `bounds` for both slicing
-    // and things like `firstIndex`, that is `firstIndex` may
-    // need to supply both a slice bounds and a per-search bounds.
-    switch kind {
-    case .startOfSubject:
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in
-        pos == subjectBounds.lowerBound
-      }
-
-    case .endOfSubjectBeforeNewline:
-      builder.buildAssert { [semanticLevel = options.semanticLevel]
-          (_, _, input, pos, subjectBounds) in
-        if pos == subjectBounds.upperBound { return true }
-        switch semanticLevel {
-        case .graphemeCluster:
-          return input.index(after: pos) == subjectBounds.upperBound
-           && input[pos].isNewline
-        case .unicodeScalar:
-          return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound
-           && input.unicodeScalars[pos].isNewline
-        }
-      }
-
-    case .endOfSubject:
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in
-        pos == subjectBounds.upperBound
-      }
-
-    case .resetStartOfMatch:
-      // FIXME: Figure out how to communicate this out
-      throw Unsupported(#"\K (reset/keep assertion)"#)
-
-    case .firstMatchingPositionInSubject:
-      // TODO: We can probably build a nice model with API here
-
-      // FIXME: This needs to be based on `searchBounds`,
-      // not the `subjectBounds` given as an argument here
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in false }
-
-    case .textSegment:
-      builder.buildAssert { (_, _, input, pos, _) in
-        // FIXME: Grapheme or word based on options
-        input.isOnGraphemeClusterBoundary(pos)
-      }
-
-    case .notTextSegment:
-      builder.buildAssert { (_, _, input, pos, _) in
-        // FIXME: Grapheme or word based on options
-        !input.isOnGraphemeClusterBoundary(pos)
-      }
-
-    case .startOfLine:
-      // FIXME: Anchor.startOfLine must always use this first branch
-      // The behavior of `^` should depend on `anchorsMatchNewlines`, but
-      // the DSL-based `.startOfLine` anchor should always match the start
-      // of a line. Right now we don't distinguish between those anchors.
-      if options.anchorsMatchNewlines {
-        builder.buildAssert { [semanticLevel = options.semanticLevel]
-            (_, _, input, pos, subjectBounds) in
-          if pos == subjectBounds.lowerBound { return true }
-          switch semanticLevel {
-          case .graphemeCluster:
-            return input[input.index(before: pos)].isNewline
-          case .unicodeScalar:
-            return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline
-          }
-        }
-      } else {
-        builder.buildAssert { (_, _, input, pos, subjectBounds) in
-          pos == subjectBounds.lowerBound
-        }
-      }
-
-    case .endOfLine:
-      // FIXME: Anchor.endOfLine must always use this first branch
-      // The behavior of `$` should depend on `anchorsMatchNewlines`, but
-      // the DSL-based `.endOfLine` anchor should always match the end
-      // of a line. Right now we don't distinguish between those anchors.
-      if options.anchorsMatchNewlines {
-        builder.buildAssert { [semanticLevel = options.semanticLevel]
-            (_, _, input, pos, subjectBounds) in
-          if pos == subjectBounds.upperBound { return true }
-          switch semanticLevel {
-          case .graphemeCluster:
-            return input[pos].isNewline
-          case .unicodeScalar:
-            return input.unicodeScalars[pos].isNewline
-          }
-        }
-      } else {
-        builder.buildAssert { (_, _, input, pos, subjectBounds) in
-          pos == subjectBounds.upperBound
-        }
-      }
-
-    case .wordBoundary:
-      builder.buildAssert { [options]
-          (cache, maxIndex, input, pos, subjectBounds) in
-        if options.usesSimpleUnicodeBoundaries {
-          // TODO: How should we handle bounds?
-          return _CharacterClassModel.word.isBoundary(
-            input,
-            at: pos,
-            bounds: subjectBounds,
-            with: options
-          )
-        } else {
-          return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
-        }
-      }
-
-    case .notWordBoundary:
-      builder.buildAssert { [options]
-          (cache, maxIndex, input, pos, subjectBounds) in
-        if options.usesSimpleUnicodeBoundaries {
-          // TODO: How should we handle bounds?
-          return !_CharacterClassModel.word.isBoundary(
-            input,
-            at: pos,
-            bounds: subjectBounds,
-            with: options
-          )
-        } else {
-          return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
-        }
-      }
-    }
+    builder.buildAssert(
+      by: kind,
+      options.anchorsMatchNewlines,
+      options.usesSimpleUnicodeBoundaries,
+      options.usesASCIIWord,
+      options.semanticLevel)
   }
 
   mutating func emitScalar(_ s: UnicodeScalar) throws {

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -11,6 +11,13 @@
 
 @_implementationOnly import _RegexParser
 
+extension Character {
+  var _singleScalarAsciiValue: UInt8? {
+    guard self != "\r\n" else { return nil }
+    return asciiValue
+  }
+}
+
 extension DSLTree.Node {
   /// Attempt to generate a consumer from this AST node
   ///

diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift
@@ -8,7 +8,7 @@
 // See https://swift.org/LICENSE.txt for license information
 //
 //===----------------------------------------------------------------------===//
-
+@_implementationOnly import _RegexParser // For AssertionKind
 
 extension Instruction {
   /// An instruction's payload packs operands and destination
@@ -51,7 +51,6 @@ extension Instruction.Payload {
     case element(ElementRegister)
     case consumer(ConsumeFunctionRegister)
     case bitset(AsciiBitsetRegister)
-    case assertion(AssertionFunctionRegister)
     case addr(InstructionAddress)
     case capture(CaptureRegister)
 
@@ -203,6 +202,22 @@ extension Instruction.Payload {
   var bitset: AsciiBitsetRegister {
     interpret()
   }
+
+  init(_ cc: BuiltinCC, _ isStrict: Bool, _ isScalar: Bool) {
+    let strictBit = isStrict ? 1 << 15 : 0
+    let scalarBit = isScalar ? 1 << 14 : 0
+    // val must fit in 16 bits: the low 14 bits hold the character class, bit 15 records isStrict (strict ASCII) and bit 14 records isScalar
+    assert(cc.rawValue <= 0x3F_FF)
+    let val = cc.rawValue + UInt64(strictBit) + UInt64(scalarBit)
+    self.init(val)
+  }
+  var builtinCCPayload: (cc: BuiltinCC, isStrict: Bool, isScalar: Bool) {
+    let val = self.rawValue
+    let cc = BuiltinCC(rawValue: val & 0x3F_FF)!
+    let isStrict = (val >> 15) & 1 == 1
+    let isScalar = (val >> 14) & 1 == 1
+    return (cc, isStrict, isScalar)
+  }
 
   init(consumer: ConsumeFunctionRegister) {
     self.init(consumer)
@@ -211,11 +226,64 @@ extension Instruction.Payload {
     interpret()
   }
 
-  init(assertion: AssertionFunctionRegister) {
-    self.init(assertion)
-  }
-  var assertion: AssertionFunctionRegister {
-    interpret()
+  var _assertionKindMask: UInt64 { ~0xFFF0_0000_0000_0000 }
+  init(assertion: AST.Atom.AssertionKind,
+       _ anchorsMatchNewlines: Bool,
+       _ usesSimpleUnicodeBoundaries: Bool,
+       _ usesASCIIWord: Bool,
+       _ semanticLevel: MatchingOptions.SemanticLevel
+  ) {
+    // 4 bits of options, packed into bits 52-55 of the raw value
+    let anchorBit: UInt64 = anchorsMatchNewlines ? (1 << 55) : 0
+    let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? (1 << 54) : 0
+    let strictBit: UInt64 = usesASCIIWord ? (1 << 53) : 0
+    let semanticLevelBit: UInt64 = semanticLevel == .unicodeScalar ? (1 << 52) : 0
+    let optionsBits: UInt64 = anchorBit + boundaryBit + strictBit + semanticLevelBit
+
+    // 4 bits for the assertion kind (values 0-10), stored unshifted in the lowest bits
+    // Future work: Optimize this layout
+    let kind: UInt64
+    switch assertion {
+    case .endOfLine: kind = 0
+    case .endOfSubject: kind = 1
+    case .endOfSubjectBeforeNewline: kind = 2
+    case .firstMatchingPositionInSubject: kind = 3
+    case .notTextSegment: kind = 4
+    case .notWordBoundary: kind = 5
+    case .resetStartOfMatch: kind = 6
+    case .startOfLine: kind = 7
+    case .startOfSubject: kind = 8
+    case .textSegment: kind = 9
+    case .wordBoundary: kind = 10
+    }
+    self.init(rawValue: kind + optionsBits)
+  }
+  var assertion: (AST.Atom.AssertionKind, Bool, Bool, Bool, MatchingOptions.SemanticLevel) {
+    let anchorsMatchNewlines = (self.rawValue >> 55) & 1 == 1
+    let usesSimpleUnicodeBoundaries = (self.rawValue >> 54) & 1 == 1
+    let usesASCIIWord = (self.rawValue >> 53) & 1 == 1
+    let semanticLevel: MatchingOptions.SemanticLevel
+    if (self.rawValue >> 52) & 1 == 1 {
+      semanticLevel = .unicodeScalar
+    } else {
+      semanticLevel = .graphemeCluster
+    }
+    let kind: AST.Atom.AssertionKind
+    switch self.rawValue & _assertionKindMask {
+    case 0: kind = .endOfLine
+    case 1: kind = .endOfSubject
+    case 2: kind = .endOfSubjectBeforeNewline
+    case 3: kind = .firstMatchingPositionInSubject
+    case 4: kind = .notTextSegment
+    case 5: kind = .notWordBoundary
+    case 6: kind = .resetStartOfMatch
+    case 7: kind = .startOfLine
+    case 8: kind = .startOfSubject
+    case 9: kind = .textSegment
+    case 10: kind = .wordBoundary
+    default: fatalError("Unreachable")
+    }
+    return (kind, anchorsMatchNewlines, usesSimpleUnicodeBoundaries, usesASCIIWord, semanticLevel)
   }
 
   init(addr: InstructionAddress) {

diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift
@@ -106,11 +106,7 @@ extension Instruction {
     /// Operand: Ascii bitset register containing the bitset
     case matchBitset
 
-    /// TODO: builtin assertions and anchors
-    case builtinAssertion
-
-    /// TODO: builtin character classes
-    case builtinCharacterClass
+    case matchBuiltin
 
     // MARK: Extension points
 

diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift
@@ -20,7 +20,6 @@ extension MEProgram {
 
     var asciiBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] = []
     var consumeFunctions: [ConsumeFunction] = []
-    var assertionFunctions: [AssertionFunction] = []
     var transformFunctions: [TransformFunction] = []
     var matcherFunctions: [MatcherFunction] = []
 
@@ -163,6 +162,15 @@ extension MEProgram.Builder {
     instructions.append(.init(
       .matchBitset, .init(bitset: makeAsciiBitset(b))))
   }
+
+  mutating func buildMatchBuiltin(
+    _ cc: BuiltinCC,
+    _ isStrict: Bool,
+    isScalar: Bool
+  ) {
+    instructions.append(.init(
+      .matchBuiltin, .init(cc, isStrict, isScalar)))
+  }
 
   mutating func buildConsume(
     by p: @escaping MEProgram.ConsumeFunction
@@ -172,10 +180,20 @@ extension MEProgram.Builder {
   }
 
   mutating func buildAssert(
-    by p: @escaping MEProgram.AssertionFunction
+    by kind: AST.Atom.AssertionKind,
+    _ anchorsMatchNewlines: Bool,
+    _ usesSimpleUnicodeBoundaries: Bool,
+    _ usesASCIIWord: Bool,
+    _ semanticLevel: MatchingOptions.SemanticLevel
   ) {
     instructions.append(.init(
-      .assertBy, .init(assertion: makeAssertionFunction(p))))
+      .assertBy,
+      .init(
+        assertion: kind,
+        anchorsMatchNewlines,
+        usesSimpleUnicodeBoundaries,
+        usesASCIIWord,
+        semanticLevel)))
   }
 
   mutating func buildAccept() {
@@ -298,7 +316,6 @@ extension MEProgram.Builder {
     regInfo.positions = nextPositionRegister.rawValue
     regInfo.bitsets = asciiBitsets.count
     regInfo.consumeFunctions = consumeFunctions.count
-    regInfo.assertionFunctions = assertionFunctions.count
     regInfo.transformFunctions = transformFunctions.count
     regInfo.matcherFunctions = matcherFunctions.count
     regInfo.captures = nextCaptureRegister.rawValue
@@ -309,7 +326,6 @@ extension MEProgram.Builder {
       staticSequences: sequences.stored,
       staticBitsets: asciiBitsets,
       staticConsumeFunctions: consumeFunctions,
-      staticAssertionFunctions: assertionFunctions,
       staticTransformFunctions: transformFunctions,
       staticMatcherFunctions: matcherFunctions,
       registerInfo: regInfo,
@@ -458,12 +474,6 @@ extension MEProgram.Builder {
     defer { consumeFunctions.append(f) }
     return ConsumeFunctionRegister(consumeFunctions.count)
   }
-  mutating func makeAssertionFunction(
-    _ f: @escaping MEProgram.AssertionFunction
-  ) -> AssertionFunctionRegister {
-    defer { assertionFunctions.append(f) }
-    return AssertionFunctionRegister(assertionFunctions.count)
-  }
   mutating func makeTransformFunction(
     _ f: @escaping MEProgram.TransformFunction
   ) -> TransformRegister {

diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift