swiftlang · hamishknight · Jul 21, 2022 · Jul 7, 2022 · Jul 19, 2022 · Jul 19, 2022
diff --git a/Package.swift b/Package.swift
@@ -75,15 +75,17 @@ let package = Package(
             name: "RegexBuilder",
             dependencies: ["_StringProcessing", "_RegexParser"],
             swiftSettings: publicStdlibSettings),
+        .target(name: "TestSupport",
+                swiftSettings: [availabilityDefinition]),
         .testTarget(
             name: "RegexTests",
-            dependencies: ["_StringProcessing"],
+            dependencies: ["_StringProcessing", "TestSupport"],
             swiftSettings: [
                 .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
             ]),
         .testTarget(
             name: "RegexBuilderTests",
-            dependencies: ["_StringProcessing", "RegexBuilder"],
+            dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"],
             swiftSettings: [
                 .unsafeFlags(["-Xfrontend", "-disable-availability-checking"])
             ]),

diff --git a/Sources/TestSupport/TestSupport.swift b/Sources/TestSupport/TestSupport.swift
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+import XCTest
+
+// We need to split this out of the test files, as it needs to be compiled
+// *without* `-disable-availability-checking` to ensure the #available check is
+// not compiled into a no-op.
+
+#if os(Linux)
+public func XCTExpectFailure(
+  _ message: String? = nil, body: () throws -> Void
+) rethrows {/* No-op stand-in: `body` is never invoked. */}
+#endif
+
+/// Guards tests that need a new stdlib: returns `true` when SwiftStdlib 5.7 is
+/// available; otherwise issues an `XCTFail` inside `XCTExpectFailure` and
+/// returns `false`.
+public func ensureNewStdlib(
+  file: StaticString = #file, line: UInt = #line
+) -> Bool {
+  if #available(SwiftStdlib 5.7, *) { return true }
+  XCTExpectFailure { XCTFail("Unsupported stdlib", file: file, line: line) }
+  return false
+}
diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -755,8 +755,10 @@ extension AST.Atom {
   /// Whether this atom is valid as the operand of a custom character class
   /// range.
   public var isValidCharacterClassRangeBound: Bool {
-    // If we have a literal character value for this, it can be used as a bound.
-    if literalCharacterValue != nil { return true }
+    if let c = literalCharacterValue {
+      // We only match character range bounds that are single scalar NFC.
+      return c.hasExactlyOneScalar && c.isNFC
+    }
     switch kind {
     // \cx, \C-x, \M-x, \M-\C-x, \N{...}
     case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:

diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -480,35 +480,37 @@ extension Parser {
   ///
   mutating func lexQuantifier(
   ) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
-    var trivia: [AST.Trivia] = []
+    tryEating { p in
+      var trivia: [AST.Trivia] = []
 
-    if let t = lexNonSemanticWhitespace() { trivia.append(t) }
+      if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
 
-    let amt: Located<Quant.Amount>? = recordLoc { p in
-      if p.tryEat("*") { return .zeroOrMore }
-      if p.tryEat("+") { return .oneOrMore }
-      if p.tryEat("?") { return .zeroOrOne }
+      let amt: Located<Quant.Amount>? = p.recordLoc { p in
+        if p.tryEat("*") { return .zeroOrMore }
+        if p.tryEat("+") { return .oneOrMore }
+        if p.tryEat("?") { return .zeroOrOne }
 
-      return p.tryEating { p in
-        guard p.tryEat("{"),
-              let range = p.lexRange(trivia: &trivia),
-              p.tryEat("}")
-        else { return nil }
-        return range.value
+        return p.tryEating { p in
+          guard p.tryEat("{"),
+                let range = p.lexRange(trivia: &trivia),
+                p.tryEat("}")
+          else { return nil }
+          return range.value
+        }
       }
-    }
-    guard let amt = amt else { return nil }
+      guard let amt = amt else { return nil }
 
-    // PCRE allows non-semantic whitespace here in extended syntax mode.
-    if let t = lexNonSemanticWhitespace() { trivia.append(t) }
+      // PCRE allows non-semantic whitespace here in extended syntax mode.
+      if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
 
-    let kind: Located<Quant.Kind> = recordLoc { p in
-      if p.tryEat("?") { return .reluctant  }
-      if p.tryEat("+") { return .possessive }
-      return .eager
-    }
+      let kind: Located<Quant.Kind> = p.recordLoc { p in
+        if p.tryEat("?") { return .reluctant  }
+        if p.tryEat("+") { return .possessive }
+        return .eager
+      }
 
-    return (amt, kind, trivia)
+      return (amt, kind, trivia)
+    }
   }
 
   /// Try to consume a range, returning `nil` if unsuccessful.

diff --git a/Sources/_RegexParser/Utility/Misc.swift b/Sources/_RegexParser/Utility/Misc.swift
@@ -19,6 +19,21 @@ extension Substring {
   var string: String { String(self) }
 }
 
+extension Character {
+  /// `true` when this character consists of a single Unicode scalar value.
+  public var hasExactlyOneScalar: Bool {
+    let rest = unicodeScalars.dropFirst()
+    return rest.isEmpty
+  }
+
+  /// `true` when this character's UTF-8 bytes equal its NFC code units.
+  internal var isNFC: Bool {
+    guard !isASCII else { return true }
+    let text = String(self)
+    return text.utf8.elementsEqual(text._nfcCodeUnits)
+  }
+}
+
 extension CustomStringConvertible {
   @_alwaysEmitIntoClient
   public var halfWidthCornerQuoted: String {

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -775,9 +775,131 @@ fileprivate extension Compiler.ByteCodeGen {
     builder.label(exit)
   }
 
+  /// Coalesce any adjacent scalar members in a custom character class together.
+  /// This is required in order to produce correct grapheme matching behavior.
+  func coalescingCustomCharacterClassMembers(
+    _ members: [DSLTree.CustomCharacterClass.Member]
+  ) -> [DSLTree.CustomCharacterClass.Member] {
+    struct Accumulator {
+      /// A series of range operands. For example, in `[ab-cde-fg]`, this will
+      /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
+      /// ranges will be created.
+      private var rangeOperands: [String] = [""]
+
+      /// The current range operand, i.e. the last element of `rangeOperands`.
+      private var current: String {
+        _read { yield rangeOperands[rangeOperands.count - 1] }
+        _modify { yield &rangeOperands[rangeOperands.count - 1] }
+      }
+
+      /// Try to accumulate a character class member, returning `true` if
+      /// successful, `false` otherwise.
+      mutating func tryAccumulate(
+        _ member: DSLTree.CustomCharacterClass.Member
+      ) -> Bool {
+        switch member {
+        case .atom(let a):
+          guard let c = a.literalCharacterValue else { return false }
+          current.append(c)
+          return true
+        case .quotedLiteral(let str):
+          current += str
+          return true
+        case let .range(lhs, rhs):
+          guard let lhs = lhs.literalCharacterValue,
+                let rhs = rhs.literalCharacterValue
+          else { return false }
+          current.append(lhs)
+          rangeOperands.append(String(rhs))
+          return true
+        case .trivia:
+          // Trivia can be completely ignored if we've already coalesced
+          // something.
+          return !current.isEmpty
+        default:
+          return false
+        }
+      }
+
+      func finish() -> [DSLTree.CustomCharacterClass.Member] {
+        if rangeOperands.count == 1 {
+          // If we didn't have any additional range operands, this isn't a
+          // range, we can just form a standard quoted literal.
+          return [.quotedLiteral(current)]
+        }
+        var members = [DSLTree.CustomCharacterClass.Member]()
+
+        // We have other range operands, splice them together. For N operands
+        // we have N - 1 ranges.
+        for (i, lhs) in rangeOperands.dropLast().enumerated() {
+          let rhs = rangeOperands[i + 1]
+
+          // If this is the first operand we only need to drop the last
+          // character for its quoted members, otherwise this is both an LHS
+          // and RHS of a range, and as such needs both sides trimmed.
+          let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
+          if !leading.isEmpty {
+            members.append(.quotedLiteral(String(leading)))
+          }
+          members.append(.range(.char(lhs.last!), .char(rhs.first!)))
+        }
+        // We've handled everything except the quoted portion of the last
+        // operand, add it now.
+        let trailing = rangeOperands.last!.dropFirst()
+        if !trailing.isEmpty {
+          members.append(.quotedLiteral(String(trailing)))
+        }
+        return members
+      }
+    }
+    return members
+      .map { m -> DSLTree.CustomCharacterClass.Member in
+        // First we need to recursively coalesce any child character classes.
+        switch m {
+        case .custom(let ccc):
+          return .custom(coalescingCustomCharacterClass(ccc))
+        case .intersection(let lhs, let rhs):
+          return .intersection(
+            coalescingCustomCharacterClass(lhs),
+            coalescingCustomCharacterClass(rhs))
+        case .subtraction(let lhs, let rhs):
+          return .subtraction(
+            coalescingCustomCharacterClass(lhs),
+            coalescingCustomCharacterClass(rhs))
+        case .symmetricDifference(let lhs, let rhs):
+          return .symmetricDifference(
+            coalescingCustomCharacterClass(lhs),
+            coalescingCustomCharacterClass(rhs))
+        case .atom, .range, .quotedLiteral, .trivia:
+          return m
+        }
+      }
+      .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
+        accum.tryAccumulate(member)
+      }
+  }
+
+  func coalescingCustomCharacterClass(
+    _ ccc: DSLTree.CustomCharacterClass
+  ) -> DSLTree.CustomCharacterClass {
+    // Coalescing applies to grapheme semantics only. Under scalar semantics
+    // scalars must not be merged into graphemes, so that for example
+    // `[e\u{301}-\u{302}]` is still the range U+301 through U+302.
+    if options.semanticLevel != .graphemeCluster { return ccc }
+
+    return .init(
+      members: coalescingCustomCharacterClassMembers(ccc.members),
+      isInverted: ccc.isInverted)
+  }
+
   mutating func emitCustomCharacterClass(
     _ ccc: DSLTree.CustomCharacterClass
   ) throws {
+    // Before emitting a custom character class in grapheme semantic mode, we
+    // need to coalesce together any adjacent characters and scalars, over which
+    // we can perform grapheme breaking. This includes e.g. range bounds for
+    // `[e\u{301}-\u{302}]`.
+    let ccc = coalescingCustomCharacterClass(ccc)
     if let asciiBitset = ccc.asAsciiBitset(options),
         optimizationsEnabled {
       if options.semanticLevel == .unicodeScalar {
@@ -791,6 +913,45 @@ fileprivate extension Compiler.ByteCodeGen {
     }
   }
 
+  mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
+    // Unwraps nested concatenations and converted regex literals into a flat
+    // list of nodes; any other node is returned unchanged.
+    func flattened(_ node: DSLTree.Node) -> [DSLTree.Node] {
+      switch node {
+      case let .concatenation(nested):
+        return nested.flatMap(flattened)
+      case let .convertedRegexLiteral(inner, _):
+        return flattened(inner)
+      default:
+        return [node]
+      }
+    }
+    // Coalesce adjacent literal characters and quoted literals into single
+    // quoted literals, over which grapheme breaking can be performed.
+    let flat = children.flatMap(flattened)
+    let components = flat.coalescing(
+      with: "", into: DSLTree.Node.quotedLiteral
+    ) { literal, node in
+      switch node {
+      case let .atom(atom):
+        guard let character = atom.literalCharacterValue else { return false }
+        literal.append(character)
+        return true
+      case let .quotedLiteral(quoted):
+        literal += quoted
+        return true
+      case .trivia:
+        // Trivia may be dropped, but only once something has been coalesced.
+        return !literal.isEmpty
+      default:
+        return false
+      }
+    }
+    for component in components {
+      try emitConcatenationComponent(component)
+    }
+  }
+
   @discardableResult
   mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? {
     switch node {
@@ -799,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen {
       try emitAlternation(children)
 
     case let .concatenation(children):
-      for child in children {
-        try emitConcatenationComponent(child)
-      }
+      try emitConcatenation(children)
 
     case let .capture(name, refId, child, transform):
       options.beginScope()

diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift
@@ -42,19 +42,43 @@ class Compiler {
   }
 }
 
+/// A `Hashable` box around a metatype (`Any.Type`).
+struct AnyHashableType: CustomStringConvertible, Hashable {
+  var ty: Any.Type
+  init(_ ty: Any.Type) {
+    self.ty = ty
+  }
+  var description: String { String(describing: ty) }
+
+  static func == (lhs: Self, rhs: Self) -> Bool {
+    ObjectIdentifier(lhs.ty) == ObjectIdentifier(rhs.ty)
+  }
+  func hash(into hasher: inout Hasher) {
+    ObjectIdentifier(ty).hash(into: &hasher)
+  }
+}
+
 // An error produced when compiling a regular expression.
-enum RegexCompilationError: Error, CustomStringConvertible {
+enum RegexCompilationError: Error, Hashable, CustomStringConvertible {
   // TODO: Source location?
   case uncapturedReference
+  case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType)
+  case invalidCharacterClassRangeOperand(Character)
+
+  static func incorrectOutputType(
+    incorrect: Any.Type, correct: Any.Type
+  ) -> Self {
+    .incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct))
+  }
 
-  case incorrectOutputType(incorrect: Any.Type, correct: Any.Type)
-
   var description: String {
     switch self {
     case .uncapturedReference:
       return "Found a reference used before it captured any match."
     case .incorrectOutputType(let incorrect, let correct):
       return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'"
+    case .invalidCharacterClassRangeOperand(let c):
+      return "'\(c)' is an invalid bound for character class range"
     }
   }
 }