swiftlang · rctcwyvrn · Jul 12, 2022 · Jun 23, 2022 · Jun 23, 2022 · Jun 23, 2022
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -74,6 +74,77 @@ fileprivate extension Compiler.ByteCodeGen {
     }
   }
 
+  /// Emits matching code for the quoted literal `s`.
+  ///
+  /// In grapheme-cluster mode:
+  /// - case-insensitive: a consume closure that compares lowercased
+  ///   characters one by one;
+  /// - case-sensitive, optimizations enabled, and `s` is non-empty and
+  ///   all-ASCII: one scalar match per scalar, boundary checking only the
+  ///   final scalar;
+  /// - otherwise: a sequence match on `s`.
+  ///
+  /// In unicode-scalar mode:
+  /// - case-sensitive with optimizations enabled: one unchecked scalar match
+  ///   per scalar of `s`;
+  /// - otherwise: a consume closure that compares scalar by scalar.
+  ///
+  /// - Parameter s: The contents of the quoted literal. May be empty.
+  mutating func emitQuotedLiteral(_ s: String) {
+    if options.semanticLevel == .graphemeCluster {
+      if options.isCaseInsensitive {
+        // future work: if all ascii, emit matchBitset instructions with
+        // case insensitive bitsets
+        // TODO: buildCaseInsensitiveMatchSequence(c) or alternative
+        builder.buildConsume { input, bounds in
+          var currentIndex = bounds.lowerBound
+          for ch in s {
+            guard currentIndex < bounds.upperBound,
+                  ch.lowercased() == input[currentIndex].lowercased()
+            else { return nil }
+            input.formIndex(after: &currentIndex)
+          }
+          return currentIndex
+        }
+      } else if optimizationsEnabled, s.allSatisfy(\.isASCII),
+                let lastIndex = s.unicodeScalars.indices.last {
+        // All-ASCII fast path. Only the final scalar is boundary checked, to
+        // make sure that there isn't a combining scalar after the quoted
+        // literal. Binding `lastIndex` also excludes the empty literal, which
+        // vacuously passes `allSatisfy`: it takes the `buildMatchSequence`
+        // path below instead of force-unwrapping a missing last character.
+        // Note: only cr-lf is multiple scalars
+        for index in s.unicodeScalars.indices {
+          builder.buildMatchScalar(
+            s.unicodeScalars[index], boundaryCheck: index == lastIndex)
+        }
+      } else {
+        builder.buildMatchSequence(s)
+      }
+    } else if optimizationsEnabled && !options.isCaseInsensitive {
+      // Match all scalars exactly, never boundary check because we're in
+      // unicode scalars mode
+      for scalar in s.unicodeScalars {
+        builder.buildMatchScalar(scalar, boundaryCheck: false)
+      }
+    } else {
+      builder.buildConsume {
+        [caseInsensitive = options.isCaseInsensitive] input, bounds in
+        // TODO: Case folding
+        var currentIndex = bounds.lowerBound
+        for scalar in s.unicodeScalars {
+          guard currentIndex < bounds.upperBound else { return nil }
+          let inputScalar = input.unicodeScalars[currentIndex]
+          if caseInsensitive {
+            guard scalar.properties.lowercaseMapping
+                    == inputScalar.properties.lowercaseMapping
+            else { return nil }
+          } else {
+            guard scalar == inputScalar else { return nil }
+          }
+          input.unicodeScalars.formIndex(after: &currentIndex)
+        }
+        return currentIndex
+      }
+    }
+  }
+
   mutating func emitBackreference(
     _ ref: AST.Reference
   ) throws {
@@ -216,6 +287,11 @@ fileprivate extension Compiler.ByteCodeGen {
       builder.buildConsume(by: consumeScalar {
         $0.properties.lowercaseMapping == s.properties.lowercaseMapping
       })
+      return
+    }
+
+    if optimizationsEnabled { // should we just do this unconditionally?
+      builder.buildMatchScalar(s, boundaryCheck: false)
     } else {
       builder.buildConsume(by: consumeScalar {
         $0 == s
@@ -241,9 +317,17 @@ fileprivate extension Compiler.ByteCodeGen {
           ? input.index(after: bounds.lowerBound)
           : nil
       }
-    } else {
-      builder.buildMatch(c)
     }
+
+    if optimizationsEnabled && c.isASCII {
+      for scalar in c.unicodeScalars {
+        let boundaryCheck = scalar == c.unicodeScalars.last!
+        builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
+      }
+      return
+    }
+
+    builder.buildMatch(c)
   }
 
   mutating func emitAny() {
@@ -652,11 +736,12 @@ fileprivate extension Compiler.ByteCodeGen {
     _ ccc: DSLTree.CustomCharacterClass
   ) throws {
     if let asciiBitset = ccc.asAsciiBitset(options),
-        options.semanticLevel == .graphemeCluster,
         optimizationsEnabled {
-      // future work: add a bit to .matchBitset to consume either a character
-      // or a scalar so we can have this optimization in scalar mode
-      builder.buildMatchAsciiBitset(asciiBitset)
+      if options.semanticLevel == .unicodeScalar {
+        builder.buildScalarMatchAsciiBitset(asciiBitset)
+      } else {
+        builder.buildMatchAsciiBitset(asciiBitset)
+      }
     } else {
       let consumer = try ccc.generateConsumer(options)
       builder.buildConsume(by: consumer)
@@ -733,45 +818,7 @@ fileprivate extension Compiler.ByteCodeGen {
       try emitAtom(a)
 
     case let .quotedLiteral(s):
-      if options.semanticLevel == .graphemeCluster {
-        if options.isCaseInsensitive {
-          // TODO: buildCaseInsensitiveMatchSequence(c) or alternative
-          builder.buildConsume { input, bounds in
-            var iterator = s.makeIterator()
-            var currentIndex = bounds.lowerBound
-            while let ch = iterator.next() {
-              guard currentIndex < bounds.upperBound,
-                    ch.lowercased() == input[currentIndex].lowercased()
-              else { return nil }
-              input.formIndex(after: &currentIndex)
-            }
-            return currentIndex
-          }
-        } else {
-          builder.buildMatchSequence(s)
-        }
-      } else {
-        builder.buildConsume {
-          [caseInsensitive = options.isCaseInsensitive] input, bounds in
-          // TODO: Case folding
-          var iterator = s.unicodeScalars.makeIterator()
-          var currentIndex = bounds.lowerBound
-          while let scalar = iterator.next() {
-            guard currentIndex < bounds.upperBound else { return nil }
-            if caseInsensitive {
-              if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
-                return nil
-              }
-            } else {
-              if scalar != input.unicodeScalars[currentIndex] {
-                return nil
-              }
-            }
-            input.unicodeScalars.formIndex(after: &currentIndex)
-          }
-          return currentIndex
-        }
-      }
+      emitQuotedLiteral(s)
 
     case let .regexLiteral(l):
       return try emitNode(l.ast.dslTreeNode)

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -11,6 +11,13 @@
 
 @_implementationOnly import _RegexParser
 
+extension Character {
+  /// This character's `asciiValue`, except that the two-scalar "\r\n"
+  /// character always yields `nil`.
+  var singleScalarAsciiValue: UInt8? {
+    self == "\r\n" ? nil : asciiValue
+  }
+}
+
 extension DSLTree.Node {
   /// Attempt to generate a consumer from this AST node
   ///
@@ -60,8 +67,8 @@ extension DSLTree._AST.Atom {
 extension DSLTree.Atom {
   var singleScalarASCIIValue: UInt8? {
     switch self {
-    case let .char(c) where c != "\r\n":
-      return c.asciiValue
+    case let .char(c):
+      return c.singleScalarAsciiValue
     case let .scalar(s) where s.isASCII:
       return UInt8(ascii: s)
     case let .unconverted(atom):
@@ -214,8 +221,8 @@ extension AST.Atom {
 
   var singleScalarASCIIValue: UInt8? {
     switch kind {
-    case let .char(c) where c != "\r\n":
-      return c.asciiValue
+    case let .char(c):
+      return c.singleScalarAsciiValue
     case let .scalar(s) where s.value.isASCII:
       return UInt8(ascii: s.value)
     default:

diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift
@@ -147,6 +147,13 @@ extension Instruction.Payload {
   var string: StringRegister {
     interpret()
   }
+
+  /// Creates a payload that stores `scalar`'s numeric value widened to
+  /// 64 bits.
+  init(scalar: Unicode.Scalar) {
+    let bits = UInt64(scalar.value)
+    self.init(bits)
+  }
+  /// The payload's raw value narrowed to 32 bits and wrapped as a Unicode
+  /// scalar.
+  ///
+  /// NOTE(review): nothing here checks that the raw value is a valid scalar;
+  /// presumably this is only read from payloads built with `init(scalar:)` —
+  /// confirm against callers.
+  var scalar: Unicode.Scalar {
+    let bits = UInt32(rawValue)
+    return Unicode.Scalar(_value: bits)
+  }
 
   init(sequence: SequenceRegister) {
     self.init(sequence)

diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift
@@ -83,10 +83,15 @@ extension Instruction {
     ///
     /// Operand: Sequence register to compare against.
     case matchSequence
+
+    case matchScalar
+    case matchScalarUnchecked
 
     /// Match against a set of valid ascii values stored in a bitset
     /// Operand: Ascii bitset register containing the bitset
     case matchBitset
+    /// matchBitset but emitted in unicode scalar semantic mode, matches and advances a single scalar
+    case matchBitsetScalar
 
     /// TODO: builtin assertions and anchors
     case builtinAssertion

diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift
@@ -147,6 +147,14 @@ extension MEProgram.Builder {
       .matchSequence,
       .init(sequence: sequences.store(.init(s)))))
   }
+
+  /// Appends an instruction that matches the single scalar `s`.
+  ///
+  /// - Parameters:
+  ///   - s: The scalar stored in the instruction's payload.
+  ///   - boundaryCheck: When true the `.matchScalar` opcode is emitted,
+  ///     otherwise `.matchScalarUnchecked`.
+  mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) {
+    instructions.append(.init(
+      boundaryCheck ? .matchScalar : .matchScalarUnchecked,
+      .init(scalar: s)))
+  }
 
   mutating func buildMatchAsciiBitset(
     _ b: DSLTree.CustomCharacterClass.AsciiBitset
@@ -155,6 +163,13 @@ extension MEProgram.Builder {
       .matchBitset, .init(bitset: makeAsciiBitset(b))))
   }
 
+  /// Appends a `.matchBitsetScalar` instruction whose payload is the value
+  /// returned by `makeAsciiBitset(b)`.
+  mutating func buildScalarMatchAsciiBitset(
+    _ b: DSLTree.CustomCharacterClass.AsciiBitset
+  ) {
+    let bitsetRegister = makeAsciiBitset(b)
+    instructions.append(
+      .init(.matchBitsetScalar, .init(bitset: bitsetRegister)))
+  }
+
   mutating func buildConsume(
     by p: @escaping MEProgram.ConsumeFunction
   ) {

diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
@@ -227,6 +227,30 @@ extension Processor {
     return true
   }
 
+  /// Returns the Unicode scalar at `currentPosition`, or `nil` when
+  /// `currentPosition` is not before `end`.
+  func loadScalar() -> Unicode.Scalar? {
+    guard currentPosition < end else { return nil }
+    return input.unicodeScalars[currentPosition]
+  }
+
+  /// Returns the index `n` scalars after `currentPosition`, without moving
+  /// `currentPosition`.
+  ///
+  /// - Parameters:
+  ///   - n: How many scalars to advance. Previously this argument was ignored
+  ///     and the offset was always 1; it is now honored.
+  ///   - boundaryCheck: When true, the resulting index must also satisfy
+  ///     `input.isOnGraphemeClusterBoundary`.
+  /// - Returns: The advanced index, or `nil` if advancing would pass `end` or
+  ///   the requested boundary check fails.
+  func nextScalarIndex(offsetBy n: Int, boundaryCheck: Bool) -> Input.Index? {
+    guard let idx = input.unicodeScalars.index(
+            currentPosition, offsetBy: n, limitedBy: end),
+          !boundaryCheck || input.isOnGraphemeClusterBoundary(idx)
+    else { return nil }
+    return idx
+  }
+
+  /// Tries to match the scalar `s` at `currentPosition`.
+  ///
+  /// On success `currentPosition` moves to the index returned by
+  /// `nextScalarIndex` and `true` is returned. If there is no scalar to load,
+  /// the loaded scalar differs from `s`, or `nextScalarIndex` returns `nil`,
+  /// `signalFailure()` is called and `false` is returned.
+  mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool {
+    if loadScalar() == s,
+       let next = nextScalarIndex(offsetBy: 1, boundaryCheck: boundaryCheck) {
+      currentPosition = next
+      return true
+    }
+    signalFailure()
+    return false
+  }
+
   // If we have a bitset we know that the CharacterClass only matches against
   // ascii characters, so check if the current input element is ascii then
   // check if it is set in the bitset
@@ -241,6 +265,20 @@ extension Processor {
     return true
   }
 
+  // Equivalent of matchBitset but emitted when in unicode scalar semantic
+  // mode: tests the scalar at `currentPosition` with
+  // `bitset.matches(scalar:)` and, on success, advances by one scalar with no
+  // boundary check. On any failure calls `signalFailure()` and returns false.
+  mutating func matchBitsetScalar(
+    _ bitset: DSLTree.CustomCharacterClass.AsciiBitset
+  ) -> Bool {
+    if let scalar = loadScalar(),
+       bitset.matches(scalar: scalar),
+       let next = nextScalarIndex(offsetBy: 1, boundaryCheck: false) {
+      currentPosition = next
+      return true
+    }
+    signalFailure()
+    return false
+  }
+
   mutating func signalFailure() {
     guard let (pc, pos, stackEnd, capEnds, intRegisters) =
             savePoints.popLast()?.destructure
@@ -378,12 +416,29 @@ extension Processor {
         controller.step()
       }
 
+    case .matchScalar:
+      let scalar = payload.scalar
+      if matchScalar(scalar, boundaryCheck: true) {
+        controller.step()
+      }
+    case .matchScalarUnchecked:
+      let scalar = payload.scalar
+      if matchScalar(scalar, boundaryCheck: false) {
+        controller.step()
+      }
+
     case .matchBitset:
       let reg = payload.bitset
       let bitset = registers[reg]
       if matchBitset(bitset) {
         controller.step()
       }
+    case .matchBitsetScalar:
+      let reg = payload.bitset
+      let bitset = registers[reg]
+      if matchBitsetScalar(bitset) {
+        controller.step()
+      }
 
     case .consumeBy:
       let reg = payload.consumer