Fix scalar matching in grapheme semantic mode #565

Closed · wants to merge 2 commits

15 changes: 11 additions & 4 deletions Sources/_StringProcessing/ByteCodeGen.swift
@@ -62,7 +62,10 @@ fileprivate extension Compiler.ByteCodeGen {
try emitCharacter(c)

case let .scalar(s):
try emitScalar(s)
// A scalar always matches the same as a single scalar character. This
// means it must match a whole grapheme in grapheme semantic mode, but
// can match a single scalar in scalar semantic mode.
try emitCharacter(Character(s))

Member: Shouldn't this be conditional on whether we're in grapheme semantics or scalar semantics?

Contributor Author: emitCharacter will call emitConsumeScalar for scalar mode.

case let .assertion(kind):
try emitAssertion(kind.ast)
@@ -244,8 +247,12 @@ fileprivate extension Compiler.ByteCodeGen {
}
}
}

mutating func emitScalar(_ s: UnicodeScalar) throws {

/// Emit a consume of a single scalar value. This must only be used in scalar
/// semantic mode.
mutating func emitConsumeScalar(_ s: UnicodeScalar) throws {
assert(options.semanticLevel == .unicodeScalar, "Wrong semantic level")

// TODO: Native instruction buildMatchScalar(s)
if options.isCaseInsensitive {
// TODO: e.g. buildCaseInsensitiveMatchScalar(s)
Expand All @@ -263,7 +270,7 @@ fileprivate extension Compiler.ByteCodeGen {
// Unicode scalar matches the specific scalars that comprise a character
if options.semanticLevel == .unicodeScalar {
for scalar in c.unicodeScalars {
try emitScalar(scalar)
try emitConsumeScalar(scalar)
}
return
}
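
For context, a small end-to-end sketch of the behavior this emission change targets. It mirrors the RegexDSLTests added later in this PR, so the API calls are real; the expected results are the ones asserted by those tests once this change is applied.

import RegexBuilder

let input = "a\u{301}"  // one grapheme cluster: "á" in decomposed form

// Grapheme-cluster semantics: the scalar "a" must match a whole grapheme,
// and "á" is not "a", so there is no match.
print(input.firstMatch(of: "a" as UnicodeScalar) == nil)   // true

// Scalar semantics: the scalar "a" may match just the first scalar.
let scalarMode = ("a" as UnicodeScalar).regex
  .matchingSemantics(.unicodeScalar)
print(input.firstMatch(of: scalarMode) != nil)             // true
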
88 changes: 51 additions & 37 deletions Sources/_StringProcessing/ConsumerInterface.swift
@@ -53,6 +53,45 @@ extension DSLTree._AST.Atom {
}
}

extension Character {
func generateConsumer(
_ opts: MatchingOptions
) throws -> MEProgram.ConsumeFunction? {
let isCaseInsensitive = opts.isCaseInsensitive
switch opts.semanticLevel {
case .graphemeCluster:
return { input, bounds in
let low = bounds.lowerBound
if isCaseInsensitive && isCased {
return input[low].lowercased() == lowercased()
? input.index(after: low)
: nil
} else {
return input[low] == self
? input.index(after: low)
: nil
}
}
case .unicodeScalar:
// TODO: This should only be reachable from character class emission, can
// we guarantee that? Otherwise we'd want a different matching behavior.
let consumers = unicodeScalars.map { s in consumeScalar {
isCaseInsensitive
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
: $0 == s
}}
return { input, bounds in
for fn in consumers {
if let idx = fn(input, bounds) {
return idx
}
}
return nil
}
}
}
}

extension DSLTree.Atom {
var singleScalarASCIIValue: UInt8? {
switch self {
@@ -72,44 +111,15 @@ extension DSLTree.Atom {
func generateConsumer(
_ opts: MatchingOptions
) throws -> MEProgram.ConsumeFunction? {
let isCaseInsensitive = opts.isCaseInsensitive

switch self {
case let .char(c):
if opts.semanticLevel == .graphemeCluster {
return { input, bounds in
let low = bounds.lowerBound
if isCaseInsensitive && c.isCased {
return input[low].lowercased() == c.lowercased()
? input.index(after: low)
: nil
} else {
return input[low] == c
? input.index(after: low)
: nil
}
}
} else {
let consumers = c.unicodeScalars.map { s in consumeScalar {
isCaseInsensitive
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
: $0 == s
}}
return { input, bounds in
for fn in consumers {
if let idx = fn(input, bounds) {
return idx
}
}
return nil
}
}
return try c.generateConsumer(opts)

case let .scalar(s):
return consumeScalar {
isCaseInsensitive
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
: $0 == s
}
// A scalar always matches the same as a single scalar character. This
// means it must match a whole grapheme in grapheme semantic mode, but
// can match a single scalar in scalar semantic mode.
return try Character(s).generateConsumer(opts)

Member: This seems pretty convoluted. Why do we not just emit a scalar-consuming instruction in scalar semantic mode and set the grapheme boundary bit? CC @rctcwyvrn to see if we need to get some of her instructions in first.

Contributor Author: Can we do that within a consumer matching function? I know we eventually want to migrate off consumer functions, but for now this will generate a consumeScalar consumer in scalar semantic mode.

Contributor: This path should only be reached for non-ASCII custom character classes, so I don't think we can emit an instruction instead of a consume fn in that case. Maybe we could if we made custom character classes emit in the same way we do alternations?

case .any:
// FIXME: Should this be a total ordering?
@@ -211,16 +221,20 @@ extension AST.Atom {
var singleScalar: UnicodeScalar? {
switch kind {
case .scalar(let s): return s.value
case .escaped(let e):
guard let s = e.scalarValue else { return nil }
return s
default: return nil
}
}

var singleScalarASCIIValue: UInt8? {
if let s = singleScalar, s.isASCII {
return UInt8(ascii: s)
}
switch kind {
case let .char(c) where c != "\r\n":
return c.asciiValue
case let .scalar(s) where s.value.isASCII:
return UInt8(ascii: s.value)
default:
return nil
}
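
As a reference for the consumer-function discussion above, a standalone sketch of the closure shape used by Character.generateConsumer. The ConsumeFunction typealias here is an assumption that mirrors the closures visible in this diff; the real MEProgram.ConsumeFunction may differ, and characterConsumer is a hypothetical free function, not part of the PR.

typealias ConsumeFunction = (String, Range<String.Index>) -> String.Index?

// Grapheme-semantic, optionally case-insensitive consumption of a single
// Character, following the logic of Character.generateConsumer above.
// (Assumes bounds is non-empty, as the closures above do.)
func characterConsumer(
  _ c: Character, caseInsensitive: Bool
) -> ConsumeFunction {
  return { input, bounds in
    let low = bounds.lowerBound
    let matched = (caseInsensitive && c.isCased)
      ? input[low].lowercased() == c.lowercased()
      : input[low] == c
    return matched ? input.index(after: low) : nil
  }
}

let consume = characterConsumer("é", caseInsensitive: true)
let text = "E\u{301}tude"   // starts with decomposed "É"
print(consume(text, text.startIndex..<text.endIndex) != nil)  // true
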
56 changes: 40 additions & 16 deletions Sources/_StringProcessing/PrintAsPattern.swift
@@ -315,8 +315,7 @@ extension PrettyPrinter {
return
}

var charMembers = ""

var charMembers = StringLiteralBuilder()

// This iterates through all of the character class members collecting all
// of the members who can be stuffed into a singular '.anyOf(...)' vs.
@@ -340,14 +339,10 @@
switch a {
case let .char(c):
charMembers.append(c)

if c == "\\" {
charMembers.append(c)
}

return false
case let .scalar(s):
charMembers += "\\u{\(String(s.value, radix: 16, uppercase: true))}"
charMembers.append(
unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}")
return false
case .unconverted(_):
return true
Expand All @@ -356,7 +351,7 @@ extension PrettyPrinter {
}

case let .quotedLiteral(s):
charMembers += s
charMembers.append(s)
return false

case .trivia(_):
@@ -370,7 +365,7 @@
// Also in the same vein, if we have a few atom members but no
// nonAtomMembers, then we can emit a single .anyOf(...) for them.
if !charMembers.isEmpty, nonCharMembers.isEmpty {
let anyOf = ".anyOf(\(charMembers._quoted))"
let anyOf = ".anyOf(\(charMembers))"

indent()

@@ -393,7 +388,7 @@
printer.indent()

if !charMembers.isEmpty {
printer.output(".anyOf(\(charMembers._quoted))")
printer.output(".anyOf(\(charMembers))")

if nonCharMembers.count > 0 {
printer.output(",")
@@ -617,10 +612,39 @@
}

extension String {
// TODO: Escaping?
fileprivate var _escaped: String {
_replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#)
}

fileprivate var _quoted: String {
"\"\(self._replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#))\""
_escaped._bareQuoted
}

fileprivate var _bareQuoted: String {
#""\#(self)""#
}
}

/// A helper for building string literals, which handles escaping the contents
/// appended.
fileprivate struct StringLiteralBuilder {
private var contents = ""

var result: String { contents._bareQuoted }
var isEmpty: Bool { contents.isEmpty }

mutating func append(_ str: String) {
contents += str._escaped
}
mutating func append(_ c: Character) {
contents += String(c)._escaped
}
mutating func append(unescaped str: String) {
contents += str
}
}
extension StringLiteralBuilder: CustomStringConvertible {
var description: String { result }
}

extension AST.Atom.AssertionKind {
@@ -1107,8 +1131,8 @@ extension DSLTree.Atom {

case let .scalar(s):
let hex = String(s.value, radix: 16, uppercase: true)
return ("\\u{\(hex)}"._quoted, false)
return ("\\u{\(hex)}"._bareQuoted, false)

case let .unconverted(a):
if a.ast.isUnprintableAtom {
return ("#/\(a.ast._regexBase)/#", false)
@@ -1149,7 +1173,7 @@ extension DSLTree.Atom {

case let .scalar(s):
let hex = String(s.value, radix: 16, uppercase: true)
return "\\u{\(hex)}"._quoted
return "\\u{\(hex)}"._bareQuoted

case let .unconverted(a):
return a.ast._regexBase
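
For illustration, a self-contained approximation of the StringLiteralBuilder introduced above. It substitutes Foundation's replacingOccurrences(of:with:) for the internal _replacing helper, so it is a sketch of the escaping behavior rather than the PR's actual implementation; the usage at the bottom is illustrative only.

import Foundation

struct StringLiteralBuilder {
  private var contents = ""

  var result: String { "\"\(contents)\"" }
  var isEmpty: Bool { contents.isEmpty }

  private static func escape(_ s: String) -> String {
    s.replacingOccurrences(of: #"\"#, with: #"\\"#)
     .replacingOccurrences(of: #"""#, with: #"\""#)
  }

  mutating func append(_ str: String) { contents += Self.escape(str) }
  mutating func append(_ c: Character) { contents += Self.escape(String(c)) }
  // Scalar syntax like \u{301} is appended verbatim so it survives as-is.
  mutating func append(unescaped str: String) { contents += str }
}

var members = StringLiteralBuilder()
members.append(#"a\b"#)                 // the backslash gets escaped
members.append(unescaped: #"\u{301}"#)  // kept verbatim
print(".anyOf(\(members.result))")      // .anyOf("a\\b\u{301}")
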
2 changes: 1 addition & 1 deletion Sources/_StringProcessing/Regex/ASTConversion.swift
@@ -216,7 +216,7 @@ extension AST.Atom {

switch self.kind {
case let .char(c): return .char(c)
case let .scalar(s): return .char(Character(s.value))
case let .scalar(s): return .scalar(s.value)

Contributor: Do we want the behaviour of the regex to be different depending on whether we input a Unicode value as its scalar code or as a literal character, i.e. "ợ" vs "\u{1ee3}"? With this, one would be interpreted as a .char and the other as a .scalar in the DSLTree. I think the intent of the original code here was to make those two inputs equivalent.

Contributor Author: I think they should be semantically equivalent (i.e. they go through the exact same code paths in byte code emission), but this distinction allows us to have more accurate printing of the DSL tree when performing a literal -> DSL conversion.

Contributor Author: (If we didn't need the distinction for printing, IMO it would make sense to remove .scalar from the DSL tree outright.)

Contributor: If they behave the same, then why would we need to print them differently? Printing them differently would imply that they behave differently, I think.

Also, seeing as we're moving towards making a .scalar behave exactly the same as a .char, maybe it would make sense to remove it outright.

Contributor Author: On whether printing them differently implies different behaviour: for example, if the user wrote /e\u{301}/, ideally the transformed DSL tree would be:

Regex {
  "e\u{301}"
}

instead of:

Regex {
  "é"
}

Semantically they're the same, but IMO it's much nicer to preserve the original syntax the user wrote.

case .any: return .any
case let .backreference(r): return .backreference(.init(ast: r))
case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq))
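
A quick standalone check of the equivalence discussed in the thread above, in plain Swift and independent of this PR: the precomposed and decomposed spellings compare equal and are one Character each, but their scalar make-up differs, which is exactly the information the .scalar case preserves for printing.

let decomposed = "e\u{301}"   // "e" + U+0301 combining acute accent
let precomposed = "\u{E9}"    // "é", U+00E9

print(decomposed == precomposed)                  // true (canonical equivalence)
print(decomposed.count, precomposed.count)        // 1 1
print(decomposed.unicodeScalars.count,
      precomposed.unicodeScalars.count)           // 2 1
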
39 changes: 39 additions & 0 deletions Tests/RegexBuilderTests/RegexDSLTests.swift
@@ -1120,6 +1120,45 @@ class RegexDSLTests: XCTestCase {
}
}

func testScalarMatching() throws {
// RegexBuilder provides a RegexComponent conformance for UnicodeScalar. In
// grapheme cluster mode, it should only match entire graphemes. It may
// match a single scalar of a grapheme cluster in scalar semantic mode.
XCTAssertNotNil("a".firstMatch(of: "a" as UnicodeScalar))
XCTAssertNil("a\u{301}".firstMatch(of: "a" as UnicodeScalar))
XCTAssertNotNil("a\u{301}".firstMatch(
of: ("a" as UnicodeScalar).regex.matchingSemantics(.unicodeScalar)))

let r1 = Regex {
"a" as UnicodeScalar
}
XCTAssertNil(try r1.firstMatch(in: "a\u{301}"))
XCTAssertNotNil(
try r1.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}")
)

let r2 = Regex {
CharacterClass.anyOf(["a" as UnicodeScalar, "👍"])
}
XCTAssertNil(try r2.firstMatch(in: "a\u{301}"))
XCTAssertNotNil(
try r2.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}")
)

let r3 = Regex {
"👨" as UnicodeScalar
"\u{200D}" as UnicodeScalar
"👨" as UnicodeScalar
"\u{200D}" as UnicodeScalar
"👧" as UnicodeScalar
"\u{200D}" as UnicodeScalar
"👦" as UnicodeScalar
}
XCTAssertNil(try r3.firstMatch(in: "👨‍👨‍👧‍👦"))

Comment on lines +1148 to +1157

Contributor Author (@hamishknight, Jul 8, 2022): Hmm, actually I'm not sure whether this is correct, as we allow matching for e.g.:

print("👨‍👨‍👧‍👦".firstMatch(of: /👨\u{200D}👨\u{200D}👧\u{200D}👦/))

Should we be combining the scalars in a DSL concatenation the same way we combine them in a literal? And if so, would it also apply to regex scalar children, e.g. /\u{...}/?

Or is a DSL concatenation semantically more like the following?

print("👨‍👨‍👧‍👦".firstMatch(of: /(?:👨)(?:\u{200D})(?:👨)(?:\u{200D})(?:👧)(?:\u{200D})(?:👦)/)) // nil

Member: The scalars should get globbed together into a stream of scalars, over which grapheme breaking can occur. This is similar to scalar escapes inside string literals. For the DSL, I'd treat them like a stream of scalars, à la /\u{...}\u{...}\u{...}/.

Contributor Author: Yeah, makes sense, though we'll need to define exactly when the combining happens. E.g. do we combine the following?

Regex {
  "a" as UnicodeScalar
  /\u{301}/
}

And would that still happen if there's further content in the regex (because this is now a concat within a concat)? E.g.:

Regex {
  "a" as UnicodeScalar
  /\u{301}b/
}

Should Regex { ... } calls serve as a boundary between combining scalars? E.g.:

Regex {
  "a" as UnicodeScalar
  Regex {
    "\u{301}" as UnicodeScalar
    "b"
  }
}

XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦"))
XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦"))
}

struct SemanticVersion: Equatable {
var major: Int
var minor: Int
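
A plain-Swift illustration of the scalar "globbing" described in the thread above (the open DSL design question itself is not settled here): adjacent scalars undergo grapheme breaking once they sit in a single string, so the four person scalars joined by zero-width joiners form one Character.

let family = "\u{1F468}\u{200D}\u{1F468}\u{200D}\u{1F467}\u{200D}\u{1F466}"

print(family == "👨‍👨‍👧‍👦")              // true
print(family.count)                   // 1 (one grapheme cluster)
print(family.unicodeScalars.count)    // 7 (4 emoji scalars + 3 zero-width joiners)
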
30 changes: 30 additions & 0 deletions Tests/RegexTests/RenderDSLTests.swift
@@ -117,4 +117,34 @@ extension RenderDSLTests {
}
"""#)
}

func testScalar() throws {
try testConversion(#"\u{B4}"#, #"""
Regex {
"\u{B4}"
}
"""#)
try testConversion(#"\u{301}"#, #"""
Regex {
"\u{301}"
}
"""#)
try testConversion(#"[\u{301}]"#, #"""
Regex {
One(.anyOf("\u{301}"))
}
"""#)
try testConversion(#"[abc\u{301}]"#, #"""
Regex {
One(.anyOf("abc\u{301}"))
}
"""#)

// TODO: We ought to try and preserve the scalar syntax here.
try testConversion(#"a\u{301}"#, #"""
Regex {
"á"
}
"""#)
}
}