swiftlang · hamishknight · Jul 26, 2022 · Jul 20, 2022 · Jul 20, 2022 · Jul 20, 2022
diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -480,35 +480,37 @@ extension Parser {
   ///
   mutating func lexQuantifier(
   ) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
-    var trivia: [AST.Trivia] = []
+    tryEating { p in
+      var trivia: [AST.Trivia] = []
 
-    if let t = lexNonSemanticWhitespace() { trivia.append(t) }
+      if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
 
-    let amt: Located<Quant.Amount>? = recordLoc { p in
-      if p.tryEat("*") { return .zeroOrMore }
-      if p.tryEat("+") { return .oneOrMore }
-      if p.tryEat("?") { return .zeroOrOne }
+      let amt: Located<Quant.Amount>? = p.recordLoc { p in
+        if p.tryEat("*") { return .zeroOrMore }
+        if p.tryEat("+") { return .oneOrMore }
+        if p.tryEat("?") { return .zeroOrOne }
 
-      return p.tryEating { p in
-        guard p.tryEat("{"),
-              let range = p.lexRange(trivia: &trivia),
-              p.tryEat("}")
-        else { return nil }
-        return range.value
+        return p.tryEating { p in
+          guard p.tryEat("{"),
+                let range = p.lexRange(trivia: &trivia),
+                p.tryEat("}")
+          else { return nil }
+          return range.value
+        }
       }
-    }
-    guard let amt = amt else { return nil }
+      guard let amt = amt else { return nil }
 
-    // PCRE allows non-semantic whitespace here in extended syntax mode.
-    if let t = lexNonSemanticWhitespace() { trivia.append(t) }
+      // PCRE allows non-semantic whitespace here in extended syntax mode.
+      if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
 
-    let kind: Located<Quant.Kind> = recordLoc { p in
-      if p.tryEat("?") { return .reluctant  }
-      if p.tryEat("+") { return .possessive }
-      return .eager
-    }
+      let kind: Located<Quant.Kind> = p.recordLoc { p in
+        if p.tryEat("?") { return .reluctant  }
+        if p.tryEat("+") { return .possessive }
+        return .eager
+      }
 
-    return (amt, kind, trivia)
+      return (amt, kind, trivia)
+    }
   }
 
   /// Try to consume a range, returning `nil` if unsuccessful.

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -775,9 +775,131 @@ fileprivate extension Compiler.ByteCodeGen {
     builder.label(exit)
   }
 
+  /// Coalesce adjacent literal members of a custom character class, recursing
+  /// into nested classes first. Required for correct grapheme matching.
+  func coalescingCustomCharacterClassMembers(
+    _ members: [DSLTree.CustomCharacterClass.Member]
+  ) -> [DSLTree.CustomCharacterClass.Member] {
+    struct Accumulator {
+      /// A series of range operands. For example, in `[ab-cde-fg]`, this will
+      /// contain the strings `["ab", "cde", "fg"]`, from which `finish()`
+      /// forms the ranges `b-c` and `e-f` plus the literals `a`, `d` and `g`.
+      private var rangeOperands: [String] = [""]
+
+      /// The current range operand, i.e. the last element of `rangeOperands`.
+      private var current: String {
+        _read { yield rangeOperands[rangeOperands.count - 1] }
+        _modify { yield &rangeOperands[rangeOperands.count - 1] }
+      }
+
+      /// Try to accumulate a character class member, returning `true` if
+      /// successful, `false` otherwise.
+      mutating func tryAccumulate(
+        _ member: DSLTree.CustomCharacterClass.Member
+      ) -> Bool {
+        switch member {
+        case .atom(let a):
+          guard let c = a.literalCharacterValue else { return false }
+          current.append(c)
+          return true
+        case .quotedLiteral(let str):
+          current += str
+          return true
+        case let .range(lhs, rhs):
+          guard let lhs = lhs.literalCharacterValue,
+                let rhs = rhs.literalCharacterValue
+          else { return false }
+          current.append(lhs)
+          rangeOperands.append(String(rhs))
+          return true
+        case .trivia:
+          // Trivia can be completely ignored if we've already coalesced
+          // something.
+          return !current.isEmpty
+        default:
+          return false
+        }
+      }
+
+      func finish() -> [DSLTree.CustomCharacterClass.Member] {
+        if rangeOperands.count == 1 {
+          // If we didn't have any additional range operands, this isn't a
+          // range, we can just form a standard quoted literal.
+          return [.quotedLiteral(current)]
+        }
+        var members = [DSLTree.CustomCharacterClass.Member]()
+
+        // We have other range operands, splice them together. For N operands
+        // we have N - 1 ranges.
+        for (i, lhs) in rangeOperands.dropLast().enumerated() {
+          let rhs = rangeOperands[i + 1]
+
+          // If this is the first operand we only need to drop the last
+          // character for its quoted members, otherwise this is both an LHS
+          // and RHS of a range, and as such needs both sides trimmed.
+          let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
+          if !leading.isEmpty {
+            members.append(.quotedLiteral(String(leading)))
+          }
+          members.append(.range(.char(lhs.last!), .char(rhs.first!)))
+        }
+        // We've handled everything except the quoted portion of the last
+        // operand, add it now.
+        let trailing = rangeOperands.last!.dropFirst()
+        if !trailing.isEmpty {
+          members.append(.quotedLiteral(String(trailing)))
+        }
+        return members
+      }
+    }
+    return members
+      .map { m -> DSLTree.CustomCharacterClass.Member in
+        // First we need to recursively coalesce any child character classes.
+        switch m {
+        case .custom(let ccc):
+          return .custom(coalescingCustomCharacterClass(ccc))
+        case .intersection(let lhs, let rhs):
+          return .intersection(
+            coalescingCustomCharacterClass(lhs),
+            coalescingCustomCharacterClass(rhs))
+        case .subtraction(let lhs, let rhs):
+          return .subtraction(
+            coalescingCustomCharacterClass(lhs),
+            coalescingCustomCharacterClass(rhs))
+        case .symmetricDifference(let lhs, let rhs):
+          return .symmetricDifference(
+            coalescingCustomCharacterClass(lhs),
+            coalescingCustomCharacterClass(rhs))
+        case .atom, .range, .quotedLiteral, .trivia:
+          return m
+        }
+      }
+      .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
+        accum.tryAccumulate(member)
+      }
+  }
+
+  func coalescingCustomCharacterClass(
+    _ ccc: DSLTree.CustomCharacterClass
+  ) -> DSLTree.CustomCharacterClass {
+    // This only needs to be done in grapheme semantic mode; otherwise the
+    // class is returned unchanged. In scalar semantic mode, we don't want to
+    // coalesce any scalars into a grapheme, so e.g. `[e\u{301}-\u{302}]`
+    // remains a range between U+301 and U+302.
+    guard options.semanticLevel == .graphemeCluster else { return ccc }
+
+    let members = coalescingCustomCharacterClassMembers(ccc.members)
+    return .init(members: members, isInverted: ccc.isInverted)
+  }
+
   mutating func emitCustomCharacterClass(
     _ ccc: DSLTree.CustomCharacterClass
   ) throws {
+    // Before emitting a custom character class in grapheme semantic mode, we
+    // need to coalesce together any adjacent characters and scalars, over which
+    // we can perform grapheme breaking. This includes e.g range bounds for
+    // `[e\u{301}-\u{302}]`.
+    let ccc = coalescingCustomCharacterClass(ccc)
     if let asciiBitset = ccc.asAsciiBitset(options),
         optimizationsEnabled {
       if options.semanticLevel == .unicodeScalar {
@@ -791,6 +913,45 @@ fileprivate extension Compiler.ByteCodeGen {
     }
   }
 
+  mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
+    // Before emitting a concatenation, we need to flatten out any nested
+    // concatenations (looking through converted regex literals), and coalesce
+    // any adjacent characters and scalars into quoted literals, over which we
+    // can perform grapheme breaking.
+    func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] {
+      switch node {
+      case .concatenation(let ch):
+        return ch.flatMap(flatten)
+      case .convertedRegexLiteral(let n, _):
+        return flatten(n)
+      default:
+        return [node]
+      }
+    }
+    let children = children
+      .flatMap(flatten)
+      .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in
+        switch node {
+        case .atom(let a):
+          guard let c = a.literalCharacterValue else { return false }
+          str.append(c)
+          return true
+        case .quotedLiteral(let q):
+          str += q
+          return true
+        case .trivia:
+          // Trivia can be completely ignored if we've already coalesced
+          // something.
+          return !str.isEmpty
+        default:
+          return false
+        }
+      }
+    for child in children {
+      try emitConcatenationComponent(child)
+    }
+  }
+
   @discardableResult
   mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? {
     switch node {
@@ -799,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen {
       try emitAlternation(children)
 
     case let .concatenation(children):
-      for child in children {
-        try emitConcatenationComponent(child)
-      }
+      try emitConcatenation(children)
 
     case let .capture(name, refId, child, transform):
       options.beginScope()

diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift
@@ -42,19 +42,43 @@ class Compiler {
   }
 }
 
+/// Hashable wrapper for `Any.Type`; hashes the type's `ObjectIdentifier`.
+struct AnyHashableType: CustomStringConvertible, Hashable {
+  var ty: Any.Type
+  init(_ ty: Any.Type) {
+    self.ty = ty
+  }
+  var description: String { "\(ty)" }
+
+  static func == (lhs: Self, rhs: Self) -> Bool {
+    lhs.ty == rhs.ty
+  }
+  func hash(into hasher: inout Hasher) {
+    hasher.combine(ObjectIdentifier(ty))
+  }
+}
+
 // An error produced when compiling a regular expression.
-enum RegexCompilationError: Error, CustomStringConvertible {
+enum RegexCompilationError: Error, Hashable, CustomStringConvertible {
   // TODO: Source location?
   case uncapturedReference
+  case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType)
+  case invalidCharacterClassRangeOperand(Character)
+
+  static func incorrectOutputType(
+    incorrect: Any.Type, correct: Any.Type
+  ) -> Self {
+    .incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct))
+  }
 
-  case incorrectOutputType(incorrect: Any.Type, correct: Any.Type)
-
   var description: String {
     switch self {
     case .uncapturedReference:
       return "Found a reference used before it captured any match."
     case .incorrectOutputType(let incorrect, let correct):
       return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'"
+    case .invalidCharacterClassRangeOperand(let c):
+      return "'\(c)' is an invalid bound for character class range"
     }
   }
 }

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -63,7 +63,7 @@ extension DSLTree._AST.Atom {
 extension Character {
   func generateConsumer(
     _ opts: MatchingOptions
-  ) throws -> MEProgram.ConsumeFunction? {
+  ) throws -> MEProgram.ConsumeFunction {
     let isCaseInsensitive = opts.isCaseInsensitive
     switch opts.semanticLevel {
     case .graphemeCluster:
@@ -327,24 +327,25 @@ extension DSLTree.CustomCharacterClass.Member {
     _ opts: MatchingOptions,
     _ isInverted: Bool
   ) -> DSLTree.CustomCharacterClass.AsciiBitset? {
+    typealias Bitset = DSLTree.CustomCharacterClass.AsciiBitset
     switch self {
     case let .atom(a):
       if let val = a.singleScalarASCIIValue {
-        return DSLTree.CustomCharacterClass.AsciiBitset(
-          val,
-          isInverted,
-          opts.isCaseInsensitive
-        )
+        return Bitset(val, isInverted, opts.isCaseInsensitive)
       }
     case let .range(low, high):
-      if let lowVal = low.singleScalarASCIIValue, let highVal = high.singleScalarASCIIValue {
-        return DSLTree.CustomCharacterClass.AsciiBitset(
-          low: lowVal,
-          high: highVal,
-          isInverted: isInverted,
-          isCaseInsensitive: opts.isCaseInsensitive
-        )
+      if let lowVal = low.singleScalarASCIIValue,
+         let highVal = high.singleScalarASCIIValue {
+        return Bitset(low: lowVal, high: highVal, isInverted: isInverted,
+                      isCaseInsensitive: opts.isCaseInsensitive)
+      }
+    case .quotedLiteral(let str):
+      var bitset = Bitset(isInverted: isInverted)
+      for c in str {
+        guard let ascii = c._singleScalarAsciiValue else { return nil }
+        bitset = bitset.union(Bitset(ascii, isInverted, opts.isCaseInsensitive))
       }
+      return bitset
     default:
       return nil
     }
@@ -361,12 +362,20 @@ extension DSLTree.CustomCharacterClass.Member {
       }
       return c
     case let .range(low, high):
-      guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else {
+      guard let lhsChar = low.literalCharacterValue else {
         throw Unsupported("\(low) in range")
       }
-      guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else {
+      guard let rhsChar = high.literalCharacterValue else {
         throw Unsupported("\(high) in range")
       }
+
+      // We must have NFC single scalar bounds.
+      guard let lhs = lhsChar.singleScalar, lhs.isNFC else {
+        throw RegexCompilationError.invalidCharacterClassRangeOperand(lhsChar)
+      }
+      guard let rhs = rhsChar.singleScalar, rhs.isNFC else {
+        throw RegexCompilationError.invalidCharacterClassRangeOperand(rhsChar)
+      }
       guard lhs <= rhs else {
         throw Unsupported("Invalid range \(low)-\(high)")
       }
@@ -456,21 +465,17 @@ extension DSLTree.CustomCharacterClass.Member {
         }
         return rhs(input, bounds)
       }
-    case .quotedLiteral(let s):
-      if opts.isCaseInsensitive {
-        return { input, bounds in
-          guard s.lowercased()._contains(input[bounds.lowerBound].lowercased()) else {
-            return nil
-          }
-          return input.index(after: bounds.lowerBound)
-        }
-      } else {
-        return { input, bounds in
-          guard s.contains(input[bounds.lowerBound]) else {
-            return nil
+    case .quotedLiteral(let str):
+      let consumers = try str.map {
+        try $0.generateConsumer(opts)
+      }
+      return { input, bounds in
+        for fn in consumers {
+          if let idx = fn(input, bounds) {
+            return idx
           }
-          return input.index(after: bounds.lowerBound)
         }
+        return nil
       }
     case .trivia:
       // TODO: Should probably strip this earlier...