swiftlang · natecook1000 · May 16, 2022 · May 7, 2022 · May 7, 2022 · May 7, 2022
diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -427,11 +427,32 @@ extension AST.Atom.CharacterProperty {
     /// Character name in the form `\p{name=...}`
     case named(String)
 
+    /// Numeric type.
+    case numericType(Unicode.NumericType)
+
+    /// Numeric value.
+    case numericValue(Double)
+
+    /// Case mapping.
+    case mapping(MapKind, String)
+
+    /// Canonical Combining Class.
+    case ccc(Unicode.CanonicalCombiningClass)
+
+    /// Character age, as per `Unicode.Scalar.Properties.age`.
+    case age(major: Int, minor: Int)
+
     case posix(Unicode.POSIXProperty)
 
     /// Some special properties implemented by PCRE and Oniguruma.
     case pcreSpecial(PCRESpecialCategory)
     case onigurumaSpecial(OnigurumaSpecialProperty)
+
+    /// The simple case mapping selected by a `mapping` property, e.g.
+    /// `lowercase` for `Simple_Lowercase_Mapping` (`slc`).
+    public enum MapKind: Hashable {
+      case lowercase, uppercase, titlecase
+    }
   }
 
   // TODO: erm, separate out or fold into something? splat it in?

diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift
@@ -13,17 +13,17 @@ extension Source {
   typealias PropertyKind = AST.Atom.CharacterProperty.Kind
 
   static private func withNormalizedForms<T>(
-    _ str: String, match: (String) -> T?
-  ) -> T? {
+    _ str: String, match: (String) throws -> T?
+  ) rethrows -> T? {
     // This follows the rules provided by UAX44-LM3, including trying to drop an
     // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for
     // consistency with other engines and the Unicode.Scalar.Properties names.
     let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" }
                  .lowercased()
-    if let m = match(str) {
+    if let m = try match(str) {
       return m
     }
-    if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) {
+    if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) {
       return m
     }
     return nil
@@ -79,6 +79,19 @@ extension Source {
     }
   }
 
+  /// Maps a `Numeric_Type` property value to its `Unicode.NumericType`, or
+  /// returns `nil` if the value is not recognized.
+  static private func classifyNumericType(
+    _ str: String
+  ) -> Unicode.NumericType? {
+    withNormalizedForms(str) { name -> Unicode.NumericType? in
+      // Long names plus the short aliases from PropertyValueAliases.txt.
+      let types: [String: Unicode.NumericType] =
+        ["decimal": .decimal, "de": .decimal, "digit": .digit, "di": .digit,
+         "numeric": .numeric, "nu": .numeric]
+      return types[name]
+    }
+  }
+
   static private func classifyBoolProperty(
     _ str: String
   ) -> Unicode.BinaryProperty? {
@@ -361,6 +374,27 @@ extension Source {
       }
     }
   }
+
+  /// Parses a Unicode `Age` property value into a version.
+  ///
+  /// Only the forms `3.0` and `V3_0` are supported. Returns `nil` for any
+  /// other input, including components with a sign such as `-3.0`.
+  static func parseAge(_ value: String) -> Unicode.Version? {
+    var str = value[...]
+    let separator: Character = str.first == "V" ? "_" : "."
+    if str.first == "V" { str.removeFirst() }
+
+    // `Int.init` accepts a leading "+" or "-", so require ASCII digits first.
+    func component(_ text: Substring) -> Int? {
+      text.allSatisfy { $0.isASCII && $0.isWholeNumber } ? Int(text) : nil
+    }
+
+    guard let sepIndex = str.firstIndex(of: separator),
+          let major = component(str[..<sepIndex]),
+          let minor = component(str[sepIndex...].dropFirst())
+    else { return nil }
+    return (major, minor)
+  }
 
   static func classifyCharacterPropertyValueOnly(
     _ value: String
@@ -414,22 +448,51 @@ extension Source {
 
     // This uses the aliases defined in
     // https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt.
-    let match = withNormalizedForms(key) { key -> PropertyKind? in
-      switch key {
+    let match = try withNormalizedForms(key) { normalizedKey -> PropertyKind? in
+      switch normalizedKey {
       case "script", "sc":
-        if let script = classifyScriptProperty(value) {
-          return .script(script)
+        guard let script = classifyScriptProperty(value) else {
+          throw ParseError.unrecognizedScript(value)
         }
+        return .script(script)
       case "scriptextensions", "scx":
-        if let script = classifyScriptProperty(value) {
-          return .scriptExtension(script)
+        guard let script = classifyScriptProperty(value) else {
+          throw ParseError.unrecognizedScript(value)
         }
+        return .scriptExtension(script)
       case "gc", "generalcategory":
-        if let cat = classifyGeneralCategory(value) {
-          return .generalCategory(cat)
+        guard let cat = classifyGeneralCategory(value) else {
+          throw ParseError.unrecognizedCategory(value)
+        }
+        return .generalCategory(cat)
+      case "age":
+        guard let (major, minor) = parseAge(value) else {
+          throw ParseError.invalidAge(value)
         }
+        return .age(major: major, minor: minor)
       case "name", "na":
         return .named(value)
+      case "numericvalue", "nv":
+        guard let numericValue = Double(value) else {
+          throw ParseError.invalidNumericValue(value)
+        }
+        return .numericValue(numericValue)
+      case "numerictype", "nt":
+        guard let type = classifyNumericType(value) else {
+          throw ParseError.unrecognizedNumericType(value)
+        }
+        return .numericType(type)
+      case "slc", "simplelowercasemapping":
+        return .mapping(.lowercase, value)
+      case "suc", "simpleuppercasemapping":
+        return .mapping(.uppercase, value)
+      case "stc", "simpletitlecasemapping":
+        return .mapping(.titlecase, value)
+      case "ccc", "canonicalcombiningclass":
+        guard let cccValue = UInt8(value), cccValue <= 254 else {
+          throw ParseError.invalidCCC(value)
+        }
+        return .ccc(.init(rawValue: cccValue))
       default:
         break
       }

diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift
@@ -60,7 +60,13 @@ enum ParseError: Error, Hashable {
 
   case emptyProperty
   case unknownProperty(key: String?, value: String)
-
+  case unrecognizedScript(String)
+  case unrecognizedCategory(String)
+  case invalidAge(String)
+  case invalidNumericValue(String)
+  case unrecognizedNumericType(String)
+  case invalidCCC(String)
+
   case expectedGroupSpecifier
   case unbalancedEndOfGroup
 
@@ -184,6 +190,18 @@ extension ParseError: CustomStringConvertible {
       return "extended syntax may not be disabled in multi-line mode"
     case .expectedCalloutArgument:
       return "expected argument to callout"
+    case .unrecognizedScript(let value):
+      return "unrecognized script '\(value)'"
+    case .unrecognizedCategory(let value):
+      return "unrecognized category '\(value)'"
+    case .unrecognizedNumericType(let value):
+      return "unrecognized numeric type '\(value)'"
+    case .invalidAge(let value):
+      return "invalid age format for '\(value)' - use '3.0' or 'V3_0' formats"
+    case .invalidNumericValue(let value):
+      return "invalid numeric value '\(value)'"
+    case .invalidCCC(let value):
+      return "invalid canonical combining class '\(value)'"
 
     // MARK: Semantic Errors
 

diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift
@@ -127,8 +127,8 @@ extension RegexValidator {
     _ prop: Unicode.BinaryProperty, at loc: SourceLocation
   ) throws {
     switch prop {
-    case .asciiHexDigit, .alphabetic, .bidiMirrored, .cased, .caseIgnorable,
-        .changesWhenCasefolded, .changesWhenCasemapped,
+    case .asciiHexDigit, .alphabetic, .bidiControl, .bidiMirrored, .cased,
+        .caseIgnorable, .changesWhenCasefolded, .changesWhenCasemapped,
         .changesWhenNFKCCasefolded, .changesWhenLowercased,
         .changesWhenTitlecased, .changesWhenUppercased, .dash, .deprecated,
         .defaultIgnorableCodePoint, .diacratic, .extender,
@@ -150,7 +150,7 @@ extension RegexValidator {
     case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC:
       throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc)
 
-    case .bidiControl, .compositionExclusion, .emojiComponent,
+    case .compositionExclusion, .emojiComponent,
         .extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic,
         .otherDefaultIgnorableCodePoint, .otherGraphemeExtended,
         .otherIDContinue, .otherIDStart, .otherLowercase, .otherMath,
@@ -169,7 +169,7 @@ extension RegexValidator {
     case .binary(let b, _):
       try validateBinaryProperty(b, at: loc)
     case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script,
-        .scriptExtension:
+        .scriptExtension, .age, .numericType, .numericValue, .mapping, .ccc:
       break
     case .pcreSpecial:
       throw error(.unsupported("PCRE property"), at: loc)

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -188,7 +188,6 @@ extension Compiler.ByteCodeGen {
   mutating func emitCharacter(_ c: Character) throws {
     // Unicode scalar matches the specific scalars that comprise a character
     if options.semanticLevel == .unicodeScalar {
-      print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars")
       for scalar in c.unicodeScalars {
         try emitScalar(scalar)
       }

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -145,10 +145,7 @@ extension String {
 }
 
 func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram<String>.ConsumeFunction {
-  let consume = opts.semanticLevel == .graphemeCluster
-    ? consumeCharacterWithSingleScalar
-    : consumeScalar
-
+  let consume = consumeFunction(for: opts)
   return consume(propertyScalarPredicate {
     // FIXME: name aliases not covered by $0.nameAlias are missed
     // e.g. U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases
@@ -491,6 +488,30 @@ extension AST.Atom.CharacterProperty {
       case .named(let n):
         return consumeName(n, opts: opts)
 
+      case .age(let major, let minor):
+        return consume {
+          guard let age = $0.properties.age else { return false }
+          return age <= (major, minor)
+        }
+
+      case .numericValue(let value):
+        return consume { $0.properties.numericValue == value }
+
+      case .numericType(let type):
+        return consume { $0.properties.numericType == type }
+
+      case .ccc(let ccc):
+        return consume { $0.properties.canonicalCombiningClass == ccc }
+
+      case .mapping(.lowercase, let value):
+        return consume { $0.properties.lowercaseMapping == value }
+
+      case .mapping(.uppercase, let value):
+        return consume { $0.properties.uppercaseMapping == value }
+
+      case .mapping(.titlecase, let value):
+        return consume { $0.properties.titlecaseMapping == value }
+
       case .posix(let p):
         return p.generateConsumer(opts)
 
@@ -525,7 +546,7 @@ extension Unicode.BinaryProperty {
     case .alphabetic:
       return consume(propertyScalarPredicate(\.isAlphabetic))
     case .bidiControl:
-      break
+      return consume(propertyScalarPredicate(\.isBidiControl))
     case .bidiMirrored:
       return consume(propertyScalarPredicate(\.isBidiMirrored))
     case .cased:

diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
@@ -2457,6 +2457,20 @@ extension RegexTests {
     diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b"))
     diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa"))
     diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b"))
+    diagnosticTest(#"\p{script=Not_A_Script}"#, .unrecognizedScript("Not_A_Script"))
+    diagnosticTest(#"\p{scx=Not_A_Script}"#, .unrecognizedScript("Not_A_Script"))
+    diagnosticTest(#"\p{gc=Not_A_Category}"#, .unrecognizedCategory("Not_A_Category"))
+    diagnosticTest(#"\p{age=3}"#, .invalidAge("3"))
+    diagnosticTest(#"\p{age=V3}"#, .invalidAge("V3"))
+    diagnosticTest(#"\p{age=3.0.1}"#, .invalidAge("3.0.1"))
+    diagnosticTest(#"\p{nv=A}"#, .invalidNumericValue("A"))
+    diagnosticTest(#"\p{Numeric_Value=1.2.3.4}"#, .invalidNumericValue("1.2.3.4"))
+    diagnosticTest(#"\p{nt=Not_A_NumericType}"#, .unrecognizedNumericType("Not_A_NumericType"))
+    diagnosticTest(#"\p{Numeric_Type=Nuemric}"#, .unrecognizedNumericType("Nuemric"))
+    diagnosticTest(#"\p{Simple_Lowercase_Mapping}"#, .unknownProperty(key: nil, value: "Simple_Lowercase_Mapping"))
+    diagnosticTest(#"\p{Simple_Lowercase_Mapping=}"#, .emptyProperty)
+    diagnosticTest(#"\p{ccc=255}"#, .invalidCCC("255"))
+    diagnosticTest(#"\p{ccc=Nada}"#, .invalidCCC("Nada"))
     diagnosticTest(#"(?#"#, .expected(")"))
     diagnosticTest(#"(?x"#, .expected(")"))