Skip to content

Commit 8b0e5f0

Browse files
committed
More unicode properties (swiftlang#385)
Add validation testing for supported and unsupported Unicode properties, along with support for the following properties: age, numeric type, numeric value, lower/upper/titlecase mapping, and canonical combining class.
1 parent cda88e4 commit 8b0e5f0

File tree

7 files changed

+526
-40
lines changed

7 files changed

+526
-40
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,11 +427,32 @@ extension AST.Atom.CharacterProperty {
427427
/// Character name in the form `\p{name=...}`
428428
case named(String)
429429

430+
/// Numeric type.
431+
case numericType(Unicode.NumericType)
432+
433+
/// Numeric value.
434+
case numericValue(Double)
435+
436+
/// Case mapping.
437+
case mapping(MapKind, String)
438+
439+
/// Canonical Combining Class.
440+
case ccc(Unicode.CanonicalCombiningClass)
441+
442+
/// Character age, as per UnicodeScalar.Properties.age.
443+
case age(major: Int, minor: Int)
444+
430445
case posix(Unicode.POSIXProperty)
431446

432447
/// Some special properties implemented by PCRE and Oniguruma.
433448
case pcreSpecial(PCRESpecialCategory)
434449
case onigurumaSpecial(OnigurumaSpecialProperty)
450+
451+
/// Identifies which case mapping a `mapping` character property compares
/// against: the lowercase, uppercase, or titlecase mapping of a scalar.
public enum MapKind: Hashable {
  case lowercase, uppercase, titlecase
}
435456
}
436457

437458
// TODO: erm, separate out or fold into something? splat it in?

Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift

Lines changed: 75 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@ extension Source {
1313
typealias PropertyKind = AST.Atom.CharacterProperty.Kind
1414

1515
static private func withNormalizedForms<T>(
16-
_ str: String, match: (String) -> T?
17-
) -> T? {
16+
_ str: String, match: (String) throws -> T?
17+
) rethrows -> T? {
1818
// This follows the rules provided by UAX44-LM3, including trying to drop an
1919
// "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for
2020
// consistency with other engines and the Unicode.Scalar.Properties names.
2121
let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" }
2222
.lowercased()
23-
if let m = match(str) {
23+
if let m = try match(str) {
2424
return m
2525
}
26-
if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) {
26+
if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) {
2727
return m
2828
}
2929
return nil
@@ -79,6 +79,19 @@ extension Source {
7979
}
8080
}
8181

82+
/// Classifies the value of a `Numeric_Type` (`nt`) property.
///
/// The value is matched loosely via `withNormalizedForms`, so whitespace,
/// underscores, hyphens, and letter case are ignored. Both the long value
/// names and the short aliases listed in the UCD's PropertyValueAliases.txt
/// (`De`, `Di`, `Nu`) are recognized.
///
/// - Parameter str: The raw property value as written in the pattern.
/// - Returns: The matching numeric type, or `nil` if `str` does not name one.
static private func classifyNumericType(
  _ str: String
) -> Unicode.NumericType? {
  withNormalizedForms(str) { normalized in
    switch normalized {
    case "decimal", "de":
      return .decimal
    case "digit", "di":
      return .digit
    case "numeric", "nu":
      return .numeric
    default:
      return nil
    }
  }
}
94+
8295
static private func classifyBoolProperty(
8396
_ str: String
8497
) -> Unicode.BinaryProperty? {
@@ -361,6 +374,27 @@ extension Source {
361374
}
362375
}
363376
}
377+
378+
/// Parses the value of an `age` property into a Unicode version.
///
/// Two spellings are accepted: `major.minor` (for example `3.0`) and the
/// alias form `Vmajor_minor` (for example `V3_0`). Other formats are not
/// supported. Each component must be a non-empty run of ASCII digits.
///
/// - Parameter value: The raw property value as written in the pattern.
/// - Returns: The parsed `(major, minor)` version, or `nil` if `value` is
///   not in one of the supported formats.
static func parseAge(_ value: String) -> Unicode.Version? {
  var str = value[...]

  // A leading "V" selects the alias form, which separates with "_".
  let separator: Character
  if str.first == "V" {
    str.removeFirst()
    separator = "_"
  } else {
    separator = "."
  }

  // `Int.init(_:)` also accepts a leading "+" or "-", which would let
  // malformed ages such as "-3.0" or "V3_-1" through, so insist on digits.
  func component(_ text: Substring) -> Int? {
    guard text.allSatisfy({ $0.isASCII && $0.isNumber }) else { return nil }
    return Int(text)
  }

  // Splitting on the first separator means any further separator ends up in
  // the minor component and is rejected (e.g. "3.0.1").
  guard let sepIndex = str.firstIndex(of: separator),
        let major = component(str[..<sepIndex]),
        let minor = component(str[sepIndex...].dropFirst())
  else { return nil }

  return (major, minor)
}
364398

365399
static func classifyCharacterPropertyValueOnly(
366400
_ value: String
@@ -414,22 +448,51 @@ extension Source {
414448

415449
// This uses the aliases defined in
416450
// https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt.
417-
let match = withNormalizedForms(key) { key -> PropertyKind? in
418-
switch key {
451+
let match = try withNormalizedForms(key) { normalizedKey -> PropertyKind? in
452+
switch normalizedKey {
419453
case "script", "sc":
420-
if let script = classifyScriptProperty(value) {
421-
return .script(script)
454+
guard let script = classifyScriptProperty(value) else {
455+
throw ParseError.unrecognizedScript(value)
422456
}
457+
return .script(script)
423458
case "scriptextensions", "scx":
424-
if let script = classifyScriptProperty(value) {
425-
return .scriptExtension(script)
459+
guard let script = classifyScriptProperty(value) else {
460+
throw ParseError.unrecognizedScript(value)
426461
}
462+
return .scriptExtension(script)
427463
case "gc", "generalcategory":
428-
if let cat = classifyGeneralCategory(value) {
429-
return .generalCategory(cat)
464+
guard let cat = classifyGeneralCategory(value) else {
465+
throw ParseError.unrecognizedCategory(value)
466+
}
467+
return .generalCategory(cat)
468+
case "age":
469+
guard let (major, minor) = parseAge(value) else {
470+
throw ParseError.invalidAge(value)
430471
}
472+
return .age(major: major, minor: minor)
431473
case "name", "na":
432474
return .named(value)
475+
case "numericvalue", "nv":
476+
guard let numericValue = Double(value) else {
477+
throw ParseError.invalidNumericValue(value)
478+
}
479+
return .numericValue(numericValue)
480+
case "numerictype", "nt":
481+
guard let type = classifyNumericType(value) else {
482+
throw ParseError.unrecognizedNumericType(value)
483+
}
484+
return .numericType(type)
485+
case "slc", "simplelowercasemapping":
486+
return .mapping(.lowercase, value)
487+
case "suc", "simpleuppercasemapping":
488+
return .mapping(.uppercase, value)
489+
case "stc", "simpletitlecasemapping":
490+
return .mapping(.titlecase, value)
491+
case "ccc", "canonicalcombiningclass":
492+
guard let cccValue = UInt8(value), cccValue <= 254 else {
493+
throw ParseError.invalidCCC(value)
494+
}
495+
return .ccc(.init(rawValue: cccValue))
433496
default:
434497
break
435498
}

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,13 @@ enum ParseError: Error, Hashable {
5959

6060
case emptyProperty
6161
case unknownProperty(key: String?, value: String)
62-
62+
case unrecognizedScript(String)
63+
case unrecognizedCategory(String)
64+
case invalidAge(String)
65+
case invalidNumericValue(String)
66+
case unrecognizedNumericType(String)
67+
case invalidCCC(String)
68+
6369
case expectedGroupSpecifier
6470
case unbalancedEndOfGroup
6571

@@ -181,6 +187,18 @@ extension ParseError: CustomStringConvertible {
181187
return "extended syntax may not be disabled in multi-line mode"
182188
case .expectedCalloutArgument:
183189
return "expected argument to callout"
190+
case .unrecognizedScript(let value):
191+
return "unrecognized script '\(value)'"
192+
case .unrecognizedCategory(let value):
193+
return "unrecognized category '\(value)'"
194+
case .unrecognizedNumericType(let value):
195+
return "unrecognized numeric type '\(value)'"
196+
case .invalidAge(let value):
197+
return "invalid age format for '\(value)' - use '3.0' or 'V3_0' formats"
198+
case .invalidNumericValue(let value):
199+
return "invalid numeric value '\(value)'"
200+
case .invalidCCC(let value):
201+
return "invalid canonical combining class '\(value)'"
184202

185203
// MARK: Semantic Errors
186204

Sources/_RegexParser/Regex/Parse/Sema.swift

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,8 @@ extension RegexValidator {
127127
_ prop: Unicode.BinaryProperty, at loc: SourceLocation
128128
) throws {
129129
switch prop {
130-
case .asciiHexDigit, .alphabetic, .bidiMirrored, .cased, .caseIgnorable,
131-
.changesWhenCasefolded, .changesWhenCasemapped,
130+
case .asciiHexDigit, .alphabetic, .bidiControl, .bidiMirrored, .cased,
131+
.caseIgnorable, .changesWhenCasefolded, .changesWhenCasemapped,
132132
.changesWhenNFKCCasefolded, .changesWhenLowercased,
133133
.changesWhenTitlecased, .changesWhenUppercased, .dash, .deprecated,
134134
.defaultIgnorableCodePoint, .diacratic, .extender,
@@ -150,7 +150,7 @@ extension RegexValidator {
150150
case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC:
151151
throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc)
152152

153-
case .bidiControl, .compositionExclusion, .emojiComponent,
153+
case .compositionExclusion, .emojiComponent,
154154
.extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic,
155155
.otherDefaultIgnorableCodePoint, .otherGraphemeExtended,
156156
.otherIDContinue, .otherIDStart, .otherLowercase, .otherMath,
@@ -169,7 +169,7 @@ extension RegexValidator {
169169
case .binary(let b, _):
170170
try validateBinaryProperty(b, at: loc)
171171
case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script,
172-
.scriptExtension:
172+
.scriptExtension, .age, .numericType, .numericValue, .mapping, .ccc:
173173
break
174174
case .pcreSpecial:
175175
throw error(.unsupported("PCRE property"), at: loc)

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -145,10 +145,7 @@ extension String {
145145
}
146146

147147
func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram<String>.ConsumeFunction {
148-
let consume = opts.semanticLevel == .graphemeCluster
149-
? consumeCharacterWithSingleScalar
150-
: consumeScalar
151-
148+
let consume = consumeFunction(for: opts)
152149
return consume(propertyScalarPredicate {
153150
// FIXME: name aliases not covered by $0.nameAlias are missed
154151
// e.g. U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases
@@ -491,6 +488,30 @@ extension AST.Atom.CharacterProperty {
491488
case .named(let n):
492489
return consumeName(n, opts: opts)
493490

491+
case .age(let major, let minor):
492+
return consume {
493+
guard let age = $0.properties.age else { return false }
494+
return age <= (major, minor)
495+
}
496+
497+
case .numericValue(let value):
498+
return consume { $0.properties.numericValue == value }
499+
500+
case .numericType(let type):
501+
return consume { $0.properties.numericType == type }
502+
503+
case .ccc(let ccc):
504+
return consume { $0.properties.canonicalCombiningClass == ccc }
505+
506+
case .mapping(.lowercase, let value):
507+
return consume { $0.properties.lowercaseMapping == value }
508+
509+
case .mapping(.uppercase, let value):
510+
return consume { $0.properties.uppercaseMapping == value }
511+
512+
case .mapping(.titlecase, let value):
513+
return consume { $0.properties.titlecaseMapping == value }
514+
494515
case .posix(let p):
495516
return p.generateConsumer(opts)
496517

@@ -525,7 +546,7 @@ extension Unicode.BinaryProperty {
525546
case .alphabetic:
526547
return consume(propertyScalarPredicate(\.isAlphabetic))
527548
case .bidiControl:
528-
break
549+
return consume(propertyScalarPredicate(\.isBidiControl))
529550
case .bidiMirrored:
530551
return consume(propertyScalarPredicate(\.isBidiMirrored))
531552
case .cased:

Tests/RegexTests/ParseTests.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2447,6 +2447,20 @@ extension RegexTests {
24472447
diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b"))
24482448
diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa"))
24492449
diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b"))
2450+
diagnosticTest(#"\p{script=Not_A_Script}"#, .unrecognizedScript("Not_A_Script"))
2451+
diagnosticTest(#"\p{scx=Not_A_Script}"#, .unrecognizedScript("Not_A_Script"))
2452+
diagnosticTest(#"\p{gc=Not_A_Category}"#, .unrecognizedCategory("Not_A_Category"))
2453+
diagnosticTest(#"\p{age=3}"#, .invalidAge("3"))
2454+
diagnosticTest(#"\p{age=V3}"#, .invalidAge("V3"))
2455+
diagnosticTest(#"\p{age=3.0.1}"#, .invalidAge("3.0.1"))
2456+
diagnosticTest(#"\p{nv=A}"#, .invalidNumericValue("A"))
2457+
diagnosticTest(#"\p{Numeric_Value=1.2.3.4}"#, .invalidNumericValue("1.2.3.4"))
2458+
diagnosticTest(#"\p{nt=Not_A_NumericType}"#, .unrecognizedNumericType("Not_A_NumericType"))
2459+
diagnosticTest(#"\p{Numeric_Type=Nuemric}"#, .unrecognizedNumericType("Nuemric"))
2460+
diagnosticTest(#"\p{Simple_Lowercase_Mapping}"#, .unknownProperty(key: nil, value: "Simple_Lowercase_Mapping"))
2461+
diagnosticTest(#"\p{Simple_Lowercase_Mapping=}"#, .emptyProperty)
2462+
diagnosticTest(#"\p{ccc=255}"#, .invalidCCC("255"))
2463+
diagnosticTest(#"\p{ccc=Nada}"#, .invalidCCC("Nada"))
24502464
diagnosticTest(#"(?#"#, .expected(")"))
24512465
diagnosticTest(#"(?x"#, .expected(")"))
24522466

0 commit comments

Comments
 (0)