Skip to content

More unicode properties #385

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
May 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -427,11 +427,32 @@ extension AST.Atom.CharacterProperty {
/// Character name in the form `\p{name=...}`
case named(String)

/// Numeric type.
case numericType(Unicode.NumericType)

/// Numeric value.
case numericValue(Double)

/// Case mapping.
case mapping(MapKind, String)

/// Canonical Combining Class.
case ccc(Unicode.CanonicalCombiningClass)

/// Character age, as per UnicodeScalar.Properties.age.
case age(major: Int, minor: Int)

case posix(Unicode.POSIXProperty)

/// Some special properties implemented by PCRE and Oniguruma.
case pcreSpecial(PCRESpecialCategory)
case onigurumaSpecial(OnigurumaSpecialProperty)

/// Identifies which case mapping a `mapping` character property refers to:
/// the lowercase, uppercase, or titlecase mapping.
public enum MapKind: Hashable {
  case lowercase, uppercase, titlecase
}
}

// TODO: erm, separate out or fold into something? splat it in?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@ extension Source {
typealias PropertyKind = AST.Atom.CharacterProperty.Kind

static private func withNormalizedForms<T>(
_ str: String, match: (String) -> T?
) -> T? {
_ str: String, match: (String) throws -> T?
) rethrows -> T? {
// This follows the rules provided by UAX44-LM3, including trying to drop an
// "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for
// consistency with other engines and the Unicode.Scalar.Properties names.
let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" }
.lowercased()
if let m = match(str) {
if let m = try match(str) {
return m
}
if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) {
if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) {
return m
}
return nil
Expand Down Expand Up @@ -79,6 +79,19 @@ extension Source {
}
}

/// Maps a numeric type property value onto `Unicode.NumericType`.
///
/// The value is handed to `withNormalizedForms`, so the lookup below is
/// applied to the candidate spellings that helper produces rather than to
/// the raw input.
///
/// - Parameter str: The property value as written in the pattern.
/// - Returns: The matching numeric type, or `nil` if no candidate spelling
///   is one of `decimal`, `digit` or `numeric`.
static private func classifyNumericType(
  _ str: String
) -> Unicode.NumericType? {
  let numericTypes: [String: Unicode.NumericType] = [
    "decimal": .decimal,
    "digit": .digit,
    "numeric": .numeric,
  ]
  return withNormalizedForms(str) { candidate in
    numericTypes[candidate]
  }
}

static private func classifyBoolProperty(
_ str: String
) -> Unicode.BinaryProperty? {
Expand Down Expand Up @@ -361,6 +374,27 @@ extension Source {
}
}
}

/// Parses the value of an `age` property into a Unicode version.
///
/// Two spellings are supported: `major.minor` (e.g. `3.0`) and
/// `Vmajor_minor` (e.g. `V3_0`). Any other format is rejected.
///
/// - Parameter value: The property value as written in the pattern.
/// - Returns: The parsed `(major, minor)` pair, or `nil` if `value` is not in
///   one of the supported formats.
static func parseAge(_ value: String) -> Unicode.Version? {
  var remainder = value[...]

  // A leading "V" selects the underscore-separated spelling.
  let separator: Character
  if remainder.first == "V" {
    remainder.removeFirst()
    separator = "_"
  } else {
    separator = "."
  }

  guard let sepIndex = remainder.firstIndex(of: separator) else { return nil }
  let majorDigits = remainder[..<sepIndex]
  let minorDigits = remainder[remainder.index(after: sepIndex)...]

  // `Int(_:)` accepts a leading "+" or "-", which would let values such as
  // "-3.0" or "V3_+1" through as ages. Require plain ASCII digits so only
  // the documented formats are accepted. Empty components pass this check
  // but are rejected by the `Int` conversions below.
  let asciiDigits = UInt8(ascii: "0")...UInt8(ascii: "9")
  guard majorDigits.utf8.allSatisfy({ asciiDigits.contains($0) }),
        minorDigits.utf8.allSatisfy({ asciiDigits.contains($0) }),
        let major = Int(majorDigits),
        let minor = Int(minorDigits)
  else { return nil }

  return (major, minor)
}

static func classifyCharacterPropertyValueOnly(
_ value: String
Expand Down Expand Up @@ -414,22 +448,51 @@ extension Source {

// This uses the aliases defined in
// https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt.
let match = withNormalizedForms(key) { key -> PropertyKind? in
switch key {
let match = try withNormalizedForms(key) { normalizedKey -> PropertyKind? in
switch normalizedKey {
case "script", "sc":
if let script = classifyScriptProperty(value) {
return .script(script)
guard let script = classifyScriptProperty(value) else {
throw ParseError.unrecognizedScript(value)
}
return .script(script)
case "scriptextensions", "scx":
if let script = classifyScriptProperty(value) {
return .scriptExtension(script)
guard let script = classifyScriptProperty(value) else {
throw ParseError.unrecognizedScript(value)
}
return .scriptExtension(script)
case "gc", "generalcategory":
if let cat = classifyGeneralCategory(value) {
return .generalCategory(cat)
guard let cat = classifyGeneralCategory(value) else {
throw ParseError.unrecognizedCategory(value)
}
return .generalCategory(cat)
case "age":
guard let (major, minor) = parseAge(value) else {
throw ParseError.invalidAge(value)
}
return .age(major: major, minor: minor)
case "name", "na":
return .named(value)
case "numericvalue", "nv":
guard let numericValue = Double(value) else {
throw ParseError.invalidNumericValue(value)
}
return .numericValue(numericValue)
case "numerictype", "nt":
guard let type = classifyNumericType(value) else {
throw ParseError.unrecognizedNumericType(value)
}
return .numericType(type)
case "slc", "simplelowercasemapping":
return .mapping(.lowercase, value)
case "suc", "simpleuppercasemapping":
return .mapping(.uppercase, value)
case "stc", "simpletitlecasemapping":
return .mapping(.titlecase, value)
case "ccc", "canonicalcombiningclass":
guard let cccValue = UInt8(value), cccValue <= 254 else {
throw ParseError.invalidCCC(value)
}
return .ccc(.init(rawValue: cccValue))
default:
break
}
Expand Down
20 changes: 19 additions & 1 deletion Sources/_RegexParser/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,13 @@ enum ParseError: Error, Hashable {

case emptyProperty
case unknownProperty(key: String?, value: String)

case unrecognizedScript(String)
case unrecognizedCategory(String)
case invalidAge(String)
case invalidNumericValue(String)
case unrecognizedNumericType(String)
case invalidCCC(String)

case expectedGroupSpecifier
case unbalancedEndOfGroup

Expand Down Expand Up @@ -184,6 +190,18 @@ extension ParseError: CustomStringConvertible {
return "extended syntax may not be disabled in multi-line mode"
case .expectedCalloutArgument:
return "expected argument to callout"
case .unrecognizedScript(let value):
return "unrecognized script '\(value)'"
case .unrecognizedCategory(let value):
return "unrecognized category '\(value)'"
case .unrecognizedNumericType(let value):
return "unrecognized numeric type '\(value)'"
case .invalidAge(let value):
return "invalid age format for '\(value)' - use '3.0' or 'V3_0' formats"
case .invalidNumericValue(let value):
return "invalid numeric value '\(value)'"
case .invalidCCC(let value):
return "invalid canonical combining class '\(value)'"

// MARK: Semantic Errors

Expand Down
8 changes: 4 additions & 4 deletions Sources/_RegexParser/Regex/Parse/Sema.swift
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ extension RegexValidator {
_ prop: Unicode.BinaryProperty, at loc: SourceLocation
) throws {
switch prop {
case .asciiHexDigit, .alphabetic, .bidiMirrored, .cased, .caseIgnorable,
.changesWhenCasefolded, .changesWhenCasemapped,
case .asciiHexDigit, .alphabetic, .bidiControl, .bidiMirrored, .cased,
.caseIgnorable, .changesWhenCasefolded, .changesWhenCasemapped,
.changesWhenNFKCCasefolded, .changesWhenLowercased,
.changesWhenTitlecased, .changesWhenUppercased, .dash, .deprecated,
.defaultIgnorableCodePoint, .diacratic, .extender,
Expand All @@ -150,7 +150,7 @@ extension RegexValidator {
case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC:
throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc)

case .bidiControl, .compositionExclusion, .emojiComponent,
case .compositionExclusion, .emojiComponent,
.extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic,
.otherDefaultIgnorableCodePoint, .otherGraphemeExtended,
.otherIDContinue, .otherIDStart, .otherLowercase, .otherMath,
Expand All @@ -169,7 +169,7 @@ extension RegexValidator {
case .binary(let b, _):
try validateBinaryProperty(b, at: loc)
case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script,
.scriptExtension:
.scriptExtension, .age, .numericType, .numericValue, .mapping, .ccc:
break
case .pcreSpecial:
throw error(.unsupported("PCRE property"), at: loc)
Expand Down
1 change: 0 additions & 1 deletion Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,6 @@ extension Compiler.ByteCodeGen {
mutating func emitCharacter(_ c: Character) throws {
// Unicode scalar matches the specific scalars that comprise a character
if options.semanticLevel == .unicodeScalar {
print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars")
for scalar in c.unicodeScalars {
try emitScalar(scalar)
}
Expand Down
31 changes: 26 additions & 5 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,7 @@ extension String {
}

func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram<String>.ConsumeFunction {
let consume = opts.semanticLevel == .graphemeCluster
? consumeCharacterWithSingleScalar
: consumeScalar

let consume = consumeFunction(for: opts)
return consume(propertyScalarPredicate {
// FIXME: name aliases not covered by $0.nameAlias are missed
// e.g. U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases
Expand Down Expand Up @@ -491,6 +488,30 @@ extension AST.Atom.CharacterProperty {
case .named(let n):
return consumeName(n, opts: opts)

case .age(let major, let minor):
return consume {
guard let age = $0.properties.age else { return false }
return age <= (major, minor)
}

case .numericValue(let value):
return consume { $0.properties.numericValue == value }

case .numericType(let type):
return consume { $0.properties.numericType == type }

case .ccc(let ccc):
return consume { $0.properties.canonicalCombiningClass == ccc }

case .mapping(.lowercase, let value):
return consume { $0.properties.lowercaseMapping == value }

case .mapping(.uppercase, let value):
return consume { $0.properties.uppercaseMapping == value }

case .mapping(.titlecase, let value):
return consume { $0.properties.titlecaseMapping == value }

case .posix(let p):
return p.generateConsumer(opts)

Expand Down Expand Up @@ -525,7 +546,7 @@ extension Unicode.BinaryProperty {
case .alphabetic:
return consume(propertyScalarPredicate(\.isAlphabetic))
case .bidiControl:
break
return consume(propertyScalarPredicate(\.isBidiControl))
case .bidiMirrored:
return consume(propertyScalarPredicate(\.isBidiMirrored))
case .cased:
Expand Down
14 changes: 14 additions & 0 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2457,6 +2457,20 @@ extension RegexTests {
diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b"))
diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa"))
diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b"))
diagnosticTest(#"\p{script=Not_A_Script}"#, .unrecognizedScript("Not_A_Script"))
diagnosticTest(#"\p{scx=Not_A_Script}"#, .unrecognizedScript("Not_A_Script"))
diagnosticTest(#"\p{gc=Not_A_Category}"#, .unrecognizedCategory("Not_A_Category"))
diagnosticTest(#"\p{age=3}"#, .invalidAge("3"))
diagnosticTest(#"\p{age=V3}"#, .invalidAge("V3"))
diagnosticTest(#"\p{age=3.0.1}"#, .invalidAge("3.0.1"))
diagnosticTest(#"\p{nv=A}"#, .invalidNumericValue("A"))
diagnosticTest(#"\p{Numeric_Value=1.2.3.4}"#, .invalidNumericValue("1.2.3.4"))
diagnosticTest(#"\p{nt=Not_A_NumericType}"#, .unrecognizedNumericType("Not_A_NumericType"))
diagnosticTest(#"\p{Numeric_Type=Nuemric}"#, .unrecognizedNumericType("Nuemric"))
diagnosticTest(#"\p{Simple_Lowercase_Mapping}"#, .unknownProperty(key: nil, value: "Simple_Lowercase_Mapping"))
diagnosticTest(#"\p{Simple_Lowercase_Mapping=}"#, .emptyProperty)
diagnosticTest(#"\p{ccc=255}"#, .invalidCCC("255"))
diagnosticTest(#"\p{ccc=Nada}"#, .invalidCCC("Nada"))
diagnosticTest(#"(?#"#, .expected(")"))
diagnosticTest(#"(?x"#, .expected(")"))

Expand Down
Loading