swiftlang · rctcwyvrn · Jun 29, 2022 · Jun 19, 2022 · Jun 19, 2022 · Jun 19, 2022
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -3,13 +3,19 @@
 extension Compiler {
   struct ByteCodeGen {
     var options: MatchingOptions
+    private let compileOptions: CompileOptions  // e.g. `.unoptimized`; consulted by emitCustomCharacterClass
     var builder = MEProgram.Builder()
     /// A Boolean indicating whether the first matchable atom has been emitted.
     /// This is used to determine whether to apply initial options.
     var hasEmittedFirstMatchableAtom = false
 
-    init(options: MatchingOptions, captureList: CaptureList) {
+    init(
+      options: MatchingOptions,
+      compileOptions: CompileOptions,
+      captureList: CaptureList
+    ) {
       self.options = options
+      self.compileOptions = compileOptions
       self.builder.captureList = captureList
     }
   }
@@ -643,8 +649,16 @@ fileprivate extension Compiler.ByteCodeGen {
   mutating func emitCustomCharacterClass(
     _ ccc: DSLTree.CustomCharacterClass
   ) throws {
-    let consumer = try ccc.generateConsumer(options)
-    builder.buildConsume(by: consumer)
+    // Check the cheap flags first so the bitset is only built when usable.
+    // future work: add a bit to .matchBitset to consume either a character
+    // or a scalar so we can have this optimization in scalar mode
+    if options.semanticLevel == .graphemeCluster,
+        !compileOptions.contains(.unoptimized),
+        let asciiBitset = ccc.asAsciiBitset(options) {
+      builder.buildMatchAsciiBitset(asciiBitset)
+    } else {
+      try builder.buildConsume(by: ccc.generateConsumer(options))
+    }
   }
 
   @discardableResult

diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift
@@ -16,6 +16,7 @@ class Compiler {
 
   // TODO: Or are these stored on the tree?
   var options = MatchingOptions()
+  private var compileOptions: CompileOptions = .default
 
   init(ast: AST) {
     self.tree = ast.dslTree
@@ -25,23 +26,22 @@ class Compiler {
     self.tree = tree
   }
 
+  init(tree: DSLTree, compileOptions: CompileOptions) {
+    self.tree = tree
+    self.compileOptions = compileOptions  // handed to ByteCodeGen by emit()
+  }
+
   __consuming func emit() throws -> MEProgram {
     // TODO: Handle global options
     var codegen = ByteCodeGen(
-      options: options, captureList: tree.captureList
-    )
+      options: options,
+      compileOptions: compileOptions,
+      captureList: tree.captureList
+    )
     return try codegen.emitRoot(tree.root)
   }
 }
 
-func _compileRegex(
-  _ regex: String, _ syntax: SyntaxOptions = .traditional
-) throws -> Executor {
-  let ast = try parse(regex, .semantic, syntax)
-  let program = try Compiler(ast: ast).emit()
-  return Executor(program: program)
-}
-
 // An error produced when compiling a regular expression.
 enum RegexCompilationError: Error, CustomStringConvertible {
   // TODO: Source location?
@@ -54,3 +54,35 @@ enum RegexCompilationError: Error, CustomStringConvertible {
     }
   }
 }
+
+// Testing support
+@available(SwiftStdlib 5.7, *)
+func _compileRegex(
+  _ regex: String,
+  _ syntax: SyntaxOptions = .traditional,
+  _ semanticLevel: RegexSemanticLevel? = nil
+) throws -> Executor {
+  let ast = try parse(regex, .semantic, syntax)
+  // Wraps the parsed tree in a group that applies `sequence` to the whole regex.
+  func applying(_ sequence: AST.MatchingOptionSequence) -> DSLTree {
+    DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root))
+  }
+  let dsl: DSLTree
+  switch semanticLevel?.base {
+  case .graphemeCluster:
+    dsl = applying(.init(adding: [.init(.graphemeClusterSemantics, location: .fake)]))
+  case .unicodeScalar:
+    dsl = applying(.init(adding: [.init(.unicodeScalarSemantics, location: .fake)]))
+  case .none:
+    dsl = ast.dslTree
+  }
+  return try Executor(program: Compiler(tree: dsl).emit())
+}
+
+extension Compiler {
+  struct CompileOptions: OptionSet {  // flags passed from Compiler to ByteCodeGen
+    let rawValue: Int
+    static let unoptimized = CompileOptions(rawValue: 1)  // disables the ASCII-bitset path in emitCustomCharacterClass
+    static let `default`: CompileOptions = []  // empty set: no flags
+  }
+}
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -51,7 +51,26 @@ extension DSLTree.Node {
   }
 }
 
+extension DSLTree._AST.Atom {
+  /// The ASCII value reported by the wrapped AST atom's
+  /// `singleScalarASCIIValue`, or `nil` if it reports none.
+  var singleScalarASCIIValue: UInt8? { ast.singleScalarASCIIValue }
+}
+
 extension DSLTree.Atom {
+  /// The atom's value as a single ASCII byte, or `nil` if it has none.
+  ///
+  /// `"\r\n"` is excluded explicitly: it is one `Character` whose
+  /// `asciiValue` is non-nil even though it spans two scalars.
+  var singleScalarASCIIValue: UInt8? {
+    switch self {
+    case let .char(c) where c != "\r\n": return c.asciiValue
+    case let .scalar(s) where s.isASCII: return UInt8(ascii: s)
+    case let .unconverted(atom): return atom.singleScalarASCIIValue
+    default: return nil
+    }
+  }
+
   // TODO: If ByteCodeGen switches first, then this is unnecessary for
   // top-level nodes, but it's also invoked for `.atom` members of a custom CC
   func generateConsumer(
@@ -61,17 +80,32 @@ extension DSLTree.Atom {
 
     switch self {
     case let .char(c):
-      // TODO: Match level?
-      return { input, bounds in
-        let low = bounds.lowerBound
-        if isCaseInsensitive && c.isCased {
-          return input[low].lowercased() == c.lowercased()
-            ? input.index(after: low)
-            : nil
-        } else {
-          return input[low] == c
-            ? input.index(after: low)
-            : nil
+      if opts.semanticLevel == .graphemeCluster {
+        return { input, bounds in
+          let low = bounds.lowerBound
+          if isCaseInsensitive && c.isCased {
+            return input[low].lowercased() == c.lowercased()
+              ? input.index(after: low)
+              : nil
+          } else {
+            return input[low] == c
+              ? input.index(after: low)
+              : nil
+          }
+        }
+      } else {
+        let consumers = c.unicodeScalars.map { s in consumeScalar {
+          isCaseInsensitive
+            ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
+            : $0 == s
+        }}
+        return { input, bounds in
+          for fn in consumers {
+            if let idx = fn(input, bounds) {
+              return idx
+            }
+          }
+          return nil
         }
       }
     case let .scalar(s):
@@ -177,7 +211,18 @@ extension AST.Atom {
     default: return nil
     }
   }
-
+
+  /// The atom's value as a single ASCII byte, or `nil` otherwise.
+  /// `"\r\n"` is rejected because its `asciiValue` is non-nil despite
+  /// it being two scalars.
+  var singleScalarASCIIValue: UInt8? {
+    switch kind {
+    case let .char(c) where c != "\r\n": return c.asciiValue
+    case let .scalar(s) where s.value.isASCII: return UInt8(ascii: s.value)
+    default: return nil
+    }
+  }
+
   func generateConsumer(
     _ opts: MatchingOptions
   ) throws -> MEProgram.ConsumeFunction? {
@@ -235,6 +280,34 @@ extension AST.Atom {
 }
 
 extension DSLTree.CustomCharacterClass.Member {
+  /// Builds an ASCII bitset describing this member, if it has one.
+  ///
+  /// Only a single atom, or a range whose two endpoints each have a
+  /// `singleScalarASCIIValue`, can be represented; every other member
+  /// yields `nil`.
+  func asAsciiBitset(
+    _ opts: MatchingOptions,
+    _ isInverted: Bool
+  ) -> DSLTree.CustomCharacterClass.AsciiBitset? {
+    switch self {
+    case let .atom(a):
+      guard let value = a.singleScalarASCIIValue else { return nil }
+      return DSLTree.CustomCharacterClass.AsciiBitset(
+        value, isInverted, opts.isCaseInsensitive)
+    case let .range(low, high):
+      guard let lowValue = low.singleScalarASCIIValue,
+            let highValue = high.singleScalarASCIIValue
+      else { return nil }
+      return DSLTree.CustomCharacterClass.AsciiBitset(
+        low: lowValue,
+        high: highValue,
+        isInverted: isInverted,
+        isCaseInsensitive: opts.isCaseInsensitive)
+    default:
+      return nil
+    }
+  }
+
   func generateConsumer(
     _ opts: MatchingOptions
   ) throws -> MEProgram.ConsumeFunction {
@@ -342,6 +415,19 @@ extension DSLTree.CustomCharacterClass.Member {
 }
 
 extension DSLTree.CustomCharacterClass {
+  /// Unions the ASCII bitsets of all members, starting from an empty set with
+  /// this class's inversion; `nil` as soon as any member has no bitset.
+  func asAsciiBitset(_ opts: MatchingOptions) -> AsciiBitset? {
+    var bitset: AsciiBitset? = AsciiBitset(isInverted: isInverted)
+    for member in members {
+      guard let next = member.asAsciiBitset(opts, isInverted) else {
+        return nil
+      }
+      bitset = bitset?.union(next)
+    }
+    return bitset
+  }
+
   func generateConsumer(
     _ opts: MatchingOptions
   ) throws -> MEProgram.ConsumeFunction {

diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift
@@ -50,6 +50,7 @@ extension Instruction.Payload {
     case bool(BoolRegister)
     case element(ElementRegister)
     case consumer(ConsumeFunctionRegister)
+    case bitset(AsciiBitsetRegister)
     case assertion(AssertionFunctionRegister)
     case addr(InstructionAddress)
     case capture(CaptureRegister)
@@ -196,6 +197,13 @@ extension Instruction.Payload {
     interpret()
   }
 
+  init(bitset: AsciiBitsetRegister) {  // payload carrying an ASCII bitset register
+    self.init(bitset)
+  }
+  var bitset: AsciiBitsetRegister {  // payload read back as a bitset register via interpret()
+    interpret()
+  }
+
   init(consumer: ConsumeFunctionRegister) {
     self.init(consumer)
   }

diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift
@@ -84,6 +84,10 @@ extension Instruction {
     /// Operand: Sequence register to compare against.
     case matchSequence
 
+    /// Match against a set of valid ASCII values stored in a bitset
+    /// Operand: ASCII bitset register containing the bitset
+    case matchBitset
+
     /// TODO: builtin assertions and anchors
     case builtinAssertion
 

diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift
@@ -18,6 +18,7 @@ extension MEProgram {
     var elements = TypedSetVector<Input.Element, _ElementRegister>()
     var sequences = TypedSetVector<[Input.Element], _SequenceRegister>()
 
+    var asciiBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] = []
     var consumeFunctions: [ConsumeFunction] = []
     var assertionFunctions: [AssertionFunction] = []
     var transformFunctions: [TransformFunction] = []
@@ -147,6 +148,13 @@ extension MEProgram.Builder {
       .init(sequence: sequences.store(.init(s)))))
   }
 
+  /// Stores `b` as a static bitset and appends a `.matchBitset`
+  /// instruction whose payload is the register returned for it.
+  mutating func buildMatchAsciiBitset(_ b: DSLTree.CustomCharacterClass.AsciiBitset) {
+    let register = makeAsciiBitset(b)
+    instructions.append(.init(.matchBitset, .init(bitset: register)))
+  }
+
   mutating func buildConsume(
     by p: @escaping MEProgram.ConsumeFunction
   ) {
@@ -273,6 +281,7 @@ extension MEProgram.Builder {
     regInfo.sequences = sequences.count
     regInfo.ints = nextIntRegister.rawValue
     regInfo.values = nextValueRegister.rawValue
+    regInfo.bitsets = asciiBitsets.count
     regInfo.consumeFunctions = consumeFunctions.count
     regInfo.assertionFunctions = assertionFunctions.count
     regInfo.transformFunctions = transformFunctions.count
@@ -283,6 +292,7 @@ extension MEProgram.Builder {
       instructions: InstructionList(instructions),
       staticElements: elements.stored,
       staticSequences: sequences.stored,
+      staticBitsets: asciiBitsets,
       staticConsumeFunctions: consumeFunctions,
       staticAssertionFunctions: assertionFunctions,
       staticTransformFunctions: transformFunctions,
@@ -414,6 +424,13 @@ extension MEProgram.Builder {
   // TODO: A register-mapping helper struct, which could release
   // registers without monotonicity required
 
+  mutating func makeAsciiBitset(
+    _ b: DSLTree.CustomCharacterClass.AsciiBitset
+  ) -> AsciiBitsetRegister {
+    asciiBitsets.append(b)  // the new bitset occupies the last slot
+    return AsciiBitsetRegister(asciiBitsets.count - 1)
+  }
+
   mutating func makeConsumeFunction(
     _ f: @escaping MEProgram.ConsumeFunction
   ) -> ConsumeFunctionRegister {

diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift
@@ -26,6 +26,7 @@ struct MEProgram {
 
   var staticElements: [Input.Element]
   var staticSequences: [[Input.Element]]
+  var staticBitsets: [DSLTree.CustomCharacterClass.AsciiBitset]
   var staticConsumeFunctions: [ConsumeFunction]
   var staticAssertionFunctions: [AssertionFunction]
   var staticTransformFunctions: [TransformFunction]

diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
@@ -226,6 +226,20 @@ extension Processor {
     }
     return true
   }
+
+  // Tests the current input element against `bitset`. On a match, advances
+  // with `_uncheckedForcedConsumeOne()` and returns true; if `load()` yields
+  // nothing or the bitset rejects it, signals failure and returns false.
+  mutating func matchBitset(
+    _ bitset: DSLTree.CustomCharacterClass.AsciiBitset
+  ) -> Bool {
+    if let cur = load(), bitset.matches(char: cur) {
+      _uncheckedForcedConsumeOne()
+      return true
+    }
+    signalFailure()
+    return false
+  }
 
   mutating func signalFailure() {
     guard let (pc, pos, stackEnd, capEnds, intRegisters) =
@@ -364,6 +378,13 @@ extension Processor {
         controller.step()
       }
 
+    case .matchBitset:
+      let reg = payload.bitset
+      let bitset = registers[reg]
+      if matchBitset(bitset) {
+        controller.step()
+      }
+
     case .consumeBy:
       let reg = payload.consumer
       guard currentPosition < searchBounds.upperBound,