-
Notifications
You must be signed in to change notification settings - Fork 49
Add additional Unicode API to RegexBuilder.CharacterClass #435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
fd05304
39b770b
5c2f21e
f7739cb
35a6718
1c1f20b
4487713
c2270e0
b1a115d
6fe8ed6
5afce48
e39f0a4
41b02c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,11 @@ | |
@_implementationOnly import _RegexParser | ||
@_spi(RegexBuilder) import _StringProcessing | ||
|
||
/// A class of characters that match in a regex. | ||
/// | ||
/// A character class can represent individual characters, a group of | ||
/// characters, the set of characters that match some set of criteria, or | ||
/// a set algebraic combination of all of the above. | ||
@available(SwiftStdlib 5.7, *) | ||
public struct CharacterClass { | ||
internal var ccc: DSLTree.CustomCharacterClass | ||
|
@@ -37,33 +42,45 @@ extension CharacterClass: RegexComponent { | |
|
||
@available(SwiftStdlib 5.7, *) | ||
extension CharacterClass { | ||
/// A character class that matches any character that does not match this | ||
/// character class. | ||
public var inverted: CharacterClass { | ||
CharacterClass(ccc.inverted) | ||
} | ||
} | ||
|
||
@available(SwiftStdlib 5.7, *) | ||
extension RegexComponent where Self == CharacterClass { | ||
/// A character class that matches any element. | ||
/// | ||
/// This character class is unaffected by the `dotMatchesNewlines()` method. | ||
public static var any: CharacterClass { | ||
.init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) | ||
} | ||
|
||
// NOTE(review): the member list built below is `[.atom(.any)]`, the same list the `any` property builds, so nothing visible here excludes newlines - confirm which atom is intended. | ||
/// A character class that matches any element that isn't a newline. | ||
public static var anyNonNewline: CharacterClass { | ||
.init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
} | ||
|
||
/// A character class that matches any single `Character`, or extended | ||
/// grapheme cluster, regardless of the current semantic level. | ||
public static var anyGrapheme: CharacterClass { | ||
.init(unconverted: .anyGrapheme) | ||
} | ||
|
||
/// A character class that matches any single Unicode scalar, regardless | ||
/// of the current semantic level. | ||
public static var anyUnicodeScalar: CharacterClass { | ||
.init(unconverted: .anyUnicodeScalar) | ||
} | ||
|
||
public static var whitespace: CharacterClass { | ||
.init(unconverted: .whitespace) | ||
} | ||
|
||
/// A character class that matches any digit. | ||
public static var digit: CharacterClass { | ||
.init(unconverted: .digit) | ||
} | ||
|
||
/// A character class that matches any hexadecimal digit. | ||
public static var hexDigit: CharacterClass { | ||
.init(DSLTree.CustomCharacterClass(members: [ | ||
.range(.char("A"), .char("F")), | ||
|
@@ -72,20 +89,32 @@ extension RegexComponent where Self == CharacterClass { | |
])) | ||
} | ||
|
||
public static var horizontalWhitespace: CharacterClass { | ||
.init(unconverted: .horizontalWhitespace) | ||
/// A character class that matches any element that is a "word character". | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to double check, is there any better description than "word character"? "Word character" can be mentioned as an aside but that's more of a historical note. @Azoy does Unicode have another name? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
public static var wordCharacter: CharacterClass { | ||
.init(unconverted: .word) | ||
} | ||
|
||
public static var newlineSequence: CharacterClass { | ||
.init(unconverted: .newlineSequence) | ||
/// A character class that matches any element that is classified as | ||
/// whitespace. | ||
public static var whitespace: CharacterClass { | ||
.init(unconverted: .whitespace) | ||
} | ||
|
||
/// A character class that matches any element that is classified as | ||
/// horizontal whitespace. | ||
public static var horizontalWhitespace: CharacterClass { | ||
.init(unconverted: .horizontalWhitespace) | ||
} | ||
|
||
/// A character class that matches any element that is classified as | ||
/// vertical whitespace. | ||
public static var verticalWhitespace: CharacterClass { | ||
.init(unconverted: .verticalWhitespace) | ||
} | ||
|
||
public static var word: CharacterClass { | ||
.init(unconverted: .word) | ||
|
||
/// A character class that matches any newline sequence. | ||
public static var newlineSequence: CharacterClass { | ||
.init(unconverted: .newlineSequence) | ||
} | ||
} | ||
|
||
|
@@ -108,14 +137,111 @@ extension RegexComponent where Self == CharacterClass { | |
CharacterClass(DSLTree.CustomCharacterClass( | ||
members: s.map { .atom(.scalar($0)) })) | ||
} | ||
|
||
/// Returns a character class that matches none of the characters in the given | ||
/// string or sequence. | ||
public static func noneOf<S: Sequence>(_ s: S) -> CharacterClass | ||
where S.Element == Character | ||
{ | ||
CharacterClass(DSLTree.CustomCharacterClass( | ||
members: s.map { .atom(.char($0)) })).inverted | ||
} | ||
|
||
/// Returns a character class that matches none of the Unicode scalars in the | ||
/// given sequence. | ||
public static func noneOf<S: Sequence>(_ s: S) -> CharacterClass | ||
where S.Element == UnicodeScalar | ||
{ | ||
CharacterClass(DSLTree.CustomCharacterClass( | ||
members: s.map { .atom(.scalar($0)) })).inverted | ||
} | ||
} | ||
|
||
// Unicode properties | ||
@available(SwiftStdlib 5.7, *) | ||
extension CharacterClass { | ||
extension RegexComponent where Self == CharacterClass { | ||
/// Returns a character class that matches any element with the given Unicode | ||
/// general category. | ||
/// | ||
/// For example, when passed `.uppercaseLetter`, this method is equivalent to | ||
/// `/\p{Uppercase_Letter}/` or `/\p{Lu}/`. | ||
public static func generalCategory(_ category: Unicode.GeneralCategory) -> CharacterClass { | ||
return CharacterClass(.generalCategory(category)) | ||
} | ||
|
||
// NOTE(review): this appears to forward to `DSLTree.CustomCharacterClass.binaryProperty(_:value:)`, which calls `fatalError` for key paths it has no mapping for - confirm and document the supported set. | ||
/// Returns a character class that matches any element with the given Unicode | ||
/// binary property. | ||
/// | ||
/// For example, when passed `\.isAlphabetic`, this method is equivalent to | ||
/// `/\p{Alphabetic}/` or `/\p{Is_Alphabetic=true}/`. | ||
/// | ||
/// - Parameters: | ||
///   - property: A key path to a Boolean member of `UnicodeScalar.Properties`. | ||
///   - value: The Boolean value paired with the property, as in | ||
///     `/\p{Is_Alphabetic=value}/`. The default is `true`. | ||
public static func binaryProperty(_ property: KeyPath<UnicodeScalar.Properties, Bool>, value: Bool = true) -> CharacterClass { | ||
return CharacterClass(.binaryProperty(property, value: value)) | ||
} | ||
|
||
/// Returns a character class that matches any element with the given Unicode | ||
/// name. | ||
/// | ||
/// This method is equivalent to `/\p{Name=name}/`. | ||
public static func name(_ name: String) -> CharacterClass { | ||
return CharacterClass(.named(name)) | ||
} | ||
|
||
/// Returns a character class that matches any element that was included in | ||
/// the specified Unicode version. | ||
/// | ||
/// This method is equivalent to `/\p{Age=version}/`. | ||
public static func age(_ version: Unicode.Version) -> CharacterClass { | ||
return CharacterClass(.age(version)) | ||
} | ||
|
||
/// Returns a character class that matches any element with the given Unicode | ||
/// numeric type. | ||
/// | ||
/// This method is equivalent to `/\p{Numeric_Type=type}/`. | ||
public static func numericType(_ type: Unicode.NumericType) -> CharacterClass { | ||
return CharacterClass(.numericType(type)) | ||
} | ||
|
||
/// Returns a character class that matches any element with the given numeric | ||
/// value. | ||
/// | ||
/// This method is equivalent to `/\p{Numeric_Value=value}/`. | ||
public static func numericValue(_ value: Double) -> CharacterClass { | ||
return CharacterClass(.numericValue(value)) | ||
} | ||
|
||
/// Returns a character class that matches any element with the given Unicode | ||
/// canonical combining class. | ||
/// | ||
/// This method is equivalent to | ||
/// `/\p{Canonical_Combining_Class=combiningClass}/`. | ||
public static func canonicalCombiningClass(_ combiningClass: Unicode.CanonicalCombiningClass) -> CharacterClass { | ||
return CharacterClass(.ccc(combiningClass)) | ||
} | ||
|
||
/// Returns a character class that matches any element with the given | ||
/// lowercase mapping. | ||
/// | ||
/// This method is equivalent to `/\p{Lowercase_Mapping=value}/`. | ||
public static func lowercaseMapping(_ value: String) -> CharacterClass { | ||
return CharacterClass(.lowercaseMapping(value)) | ||
} | ||
|
||
/// Returns a character class that matches any element with the given | ||
/// uppercase mapping. | ||
/// | ||
/// This method is equivalent to `/\p{Uppercase_Mapping=value}/`. | ||
public static func uppercaseMapping(_ value: String) -> CharacterClass { | ||
return CharacterClass(.uppercaseMapping(value)) | ||
} | ||
|
||
/// Returns a character class that matches any element with the given | ||
/// titlecase mapping. | ||
/// | ||
/// This method is equivalent to `/\p{Titlecase_Mapping=value}/`. | ||
public static func titlecaseMapping(_ value: String) -> CharacterClass { | ||
return CharacterClass(.titlecaseMapping(value)) | ||
} | ||
} | ||
|
||
/// Returns a character class that includes the characters in the given range. | ||
|
@@ -139,37 +265,47 @@ public func ...(lhs: UnicodeScalar, rhs: UnicodeScalar) -> CharacterClass { | |
|
||
@available(SwiftStdlib 5.7, *) | ||
extension RegexComponent where Self == CharacterClass { | ||
/// Creates a character class that combines all the given character classes | ||
/// via union. | ||
public init(_ first: CharacterClass, _ rest: CharacterClass...) { | ||
if rest.isEmpty { | ||
self.init(first.ccc) | ||
} else { | ||
let members: [DSLTree.CustomCharacterClass.Member] = | ||
(CollectionOfOne(first) + rest).map { .custom($0.ccc) } | ||
var members: [DSLTree.CustomCharacterClass.Member] = [.custom(first.ccc)] | ||
members.append(contentsOf: rest.lazy.map { .custom($0.ccc) }) | ||
self.init(.init(members: members)) | ||
} | ||
} | ||
} | ||
|
||
@available(SwiftStdlib 5.7, *) | ||
extension CharacterClass { | ||
/// Returns a character class that matches the union of this class and the | ||
/// given class. | ||
/// | ||
/// - Parameter other: The character class to combine with this one. | ||
public func union(_ other: CharacterClass) -> CharacterClass { | ||
CharacterClass(.init(members: [ | ||
.custom(self.ccc), | ||
.custom(other.ccc)])) | ||
} | ||
|
||
/// Returns a character class that matches the intersection of this class | ||
/// and the given class. | ||
/// | ||
/// - Parameter other: The character class to intersect with this one. | ||
public func intersection(_ other: CharacterClass) -> CharacterClass { | ||
CharacterClass(.init(members: [ | ||
.intersection(self.ccc, other.ccc) | ||
])) | ||
} | ||
|
||
|
||
/// Returns a character class that matches the difference of this class | ||
/// and the given class. | ||
/// | ||
/// - Parameter other: The character class to subtract from this one. | ||
public func subtracting(_ other: CharacterClass) -> CharacterClass { | ||
CharacterClass(.init(members: [ | ||
.subtraction(self.ccc, other.ccc) | ||
])) | ||
} | ||
|
||
|
||
/// Returns a character class that matches the symmetric difference of | ||
/// this class and the given class. | ||
public func symmetricDifference(_ other: CharacterClass) -> CharacterClass { | ||
CharacterClass(.init(members: [ | ||
.symmetricDifference(self.ccc, other.ccc) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -140,12 +140,122 @@ extension DSLTree { | |
self.isInverted = isInverted | ||
} | ||
|
||
public static func generalCategory(_ category: Unicode.GeneralCategory) -> Self { | ||
let property = AST.Atom.CharacterProperty(.generalCategory(category.extendedGeneralCategory!), isInverted: false, isPOSIX: false) | ||
/// Builds a custom character class whose only member is an unconverted AST | ||
/// atom wrapping the given character property kind. | ||
/// | ||
/// The property is created with `isInverted: false` and `isPOSIX: false`, and | ||
/// the atom is constructed with `.fake` as its second argument (presumably a | ||
/// placeholder source location - confirm). | ||
/// | ||
/// - Parameter property: The kind of character property to wrap. | ||
/// - Returns: A custom character class containing that single atom. | ||
private static func astCharacterProperty( | ||
_ property: AST.Atom.CharacterProperty.Kind | ||
) -> Self { | ||
let property = AST.Atom.CharacterProperty(property, isInverted: false, isPOSIX: false) | ||
let astAtom = AST.Atom(.property(property), .fake) | ||
return .init(members: [.atom(.unconverted(.init(ast: astAtom)))]) | ||
} | ||
|
||
/// Returns a custom character class for the given Unicode general category. | ||
/// | ||
/// NOTE(review): `extendedGeneralCategory` is force-unwrapped, so this traps | ||
/// if that conversion ever yields `nil` - confirm it is total over | ||
/// `Unicode.GeneralCategory`. | ||
public static func generalCategory(_ category: Unicode.GeneralCategory) -> Self { | ||
astCharacterProperty(.generalCategory(category.extendedGeneralCategory!)) | ||
} | ||
|
||
/// Returns a custom character class for the Unicode binary property that | ||
/// corresponds to the given key path. | ||
/// | ||
/// The key path is translated to a `Unicode.BinaryProperty` case by the | ||
/// `switch` below. Key paths with no translation leave the local | ||
/// `binaryProperty` as `nil` and reach the `fatalError` at the end of the | ||
/// function. | ||
/// | ||
/// - Parameters: | ||
///   - property: A key path to a Boolean member of `UnicodeScalar.Properties`. | ||
///   - value: The Boolean value passed along with the translated property. | ||
public static func binaryProperty( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we get this code in a different file? |
||
_ property: KeyPath<UnicodeScalar.Properties, Bool>, | ||
value: Bool | ||
) -> Self { | ||
var binaryProperty: Unicode.BinaryProperty? = nil | ||
switch property { | ||
case \.isAlphabetic: binaryProperty = .alphabetic | ||
case \.isASCIIHexDigit: binaryProperty = .asciiHexDigit | ||
case \.isBidiControl: binaryProperty = .bidiControl | ||
case \.isBidiMirrored: binaryProperty = .bidiMirrored | ||
case \.isCased: binaryProperty = .cased | ||
case \.isCaseIgnorable: binaryProperty = .caseIgnorable | ||
case \.changesWhenCaseFolded: binaryProperty = .changesWhenCasefolded | ||
case \.changesWhenCaseMapped: binaryProperty = .changesWhenCasemapped | ||
case \.changesWhenNFKCCaseFolded: binaryProperty = .changesWhenNFKCCasefolded | ||
case \.changesWhenLowercased: binaryProperty = .changesWhenLowercased | ||
case \.changesWhenTitlecased: binaryProperty = .changesWhenTitlecased | ||
case \.changesWhenUppercased: binaryProperty = .changesWhenUppercased | ||
case \.isDash: binaryProperty = .dash | ||
case \.isDefaultIgnorableCodePoint: binaryProperty = .defaultIgnorableCodePoint | ||
case \.isDeprecated: binaryProperty = .deprecated | ||
case \.isDiacritic: binaryProperty = .diacratic | ||
case \.isExtender: binaryProperty = .extender | ||
case \.isFullCompositionExclusion: binaryProperty = .fullCompositionExclusion | ||
case \.isGraphemeBase: binaryProperty = .graphemeBase | ||
case \.isGraphemeExtend: binaryProperty = .graphemeExtended | ||
case \.isHexDigit: binaryProperty = .hexDigit | ||
case \.isIDContinue: binaryProperty = .idContinue | ||
case \.isIdeographic: binaryProperty = .ideographic | ||
case \.isIDStart: binaryProperty = .idStart | ||
case \.isIDSBinaryOperator: binaryProperty = .idsBinaryOperator | ||
case \.isIDSTrinaryOperator: binaryProperty = .idsTrinaryOperator | ||
case \.isJoinControl: binaryProperty = .joinControl | ||
case \.isLogicalOrderException: binaryProperty = .logicalOrderException | ||
case \.isLowercase: binaryProperty = .lowercase | ||
case \.isMath: binaryProperty = .math | ||
case \.isNoncharacterCodePoint: binaryProperty = .noncharacterCodePoint | ||
case \.isPatternSyntax: binaryProperty = .patternSyntax | ||
case \.isPatternWhitespace: binaryProperty = .patternWhitespace | ||
case \.isQuotationMark: binaryProperty = .quotationMark | ||
case \.isRadical: binaryProperty = .radical | ||
case \.isSoftDotted: binaryProperty = .softDotted | ||
case \.isSentenceTerminal: binaryProperty = .sentenceTerminal | ||
case \.isTerminalPunctuation: binaryProperty = .terminalPunctuation | ||
case \.isUnifiedIdeograph: binaryProperty = .unifiedIdiograph | ||
case \.isUppercase: binaryProperty = .uppercase | ||
case \.isVariationSelector: binaryProperty = .variationSelector | ||
case \.isWhitespace: binaryProperty = .whitespace | ||
case \.isXIDContinue: binaryProperty = .xidContinue | ||
case \.isXIDStart: binaryProperty = .xidStart | ||
default: | ||
// The emoji key paths are matched only inside this availability check; | ||
// any other key path leaves `binaryProperty` as nil. | ||
if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { | ||
// FIXME: Other platforms | ||
switch property { | ||
case \.isEmojiModifierBase: binaryProperty = .emojiModifierBase | ||
case \.isEmojiModifier: binaryProperty = .emojiModifier | ||
case \.isEmoji: binaryProperty = .emoji | ||
case \.isEmojiPresentation: binaryProperty = .emojiPresentation | ||
default: | ||
break | ||
} | ||
} | ||
} | ||
|
||
// A translated property becomes a `.binary` character property; an | ||
// untranslated one is treated as unsupported and stops the program. | ||
if let binaryProperty = binaryProperty { | ||
return astCharacterProperty(.binary(binaryProperty, value: value)) | ||
} else { | ||
// FIXME: Support via a _UnicodeScalarPredicate interface? | ||
fatalError("Unsupported Unicode binary property") | ||
} | ||
} | ||
|
||
/// Returns a custom character class for the `.age` character property, built | ||
/// from the `major` and `minor` components of the given Unicode version. | ||
public static func age(_ version: Unicode.Version) -> Self { | ||
astCharacterProperty(.age(major: version.major, minor: version.minor)) | ||
} | ||
|
||
/// Returns a custom character class for the `.named` character property with | ||
/// the given name. | ||
public static func named(_ name: String) -> Self { | ||
astCharacterProperty(.named(name)) | ||
} | ||
|
||
/// Returns a custom character class for the `.numericType` character property | ||
/// with the given Unicode numeric type. | ||
public static func numericType(_ type: Unicode.NumericType) -> Self { | ||
astCharacterProperty(.numericType(type)) | ||
} | ||
|
||
/// Returns a custom character class for the `.numericValue` character | ||
/// property with the given value. | ||
public static func numericValue(_ value: Double) -> Self { | ||
astCharacterProperty(.numericValue(value)) | ||
} | ||
|
||
/// Returns a custom character class for the `.ccc` character property with | ||
/// the given Unicode canonical combining class. | ||
public static func ccc(_ combiningClass: Unicode.CanonicalCombiningClass) -> Self { | ||
astCharacterProperty(.ccc(combiningClass)) | ||
} | ||
|
||
/// Returns a custom character class for the `.mapping` character property | ||
/// using the `.lowercase` mapping and the given value. | ||
public static func lowercaseMapping(_ value: String) -> Self { | ||
astCharacterProperty(.mapping(.lowercase, value)) | ||
} | ||
|
||
/// Returns a custom character class for the `.mapping` character property | ||
/// using the `.uppercase` mapping and the given value. | ||
public static func uppercaseMapping(_ value: String) -> Self { | ||
astCharacterProperty(.mapping(.uppercase, value)) | ||
} | ||
|
||
/// Returns a custom character class for the `.mapping` character property | ||
/// using the `.titlecase` mapping and the given value. | ||
public static func titlecaseMapping(_ value: String) -> Self { | ||
astCharacterProperty(.mapping(.titlecase, value)) | ||
} | ||
|
||
public var inverted: CustomCharacterClass { | ||
var result = self | ||
result.isInverted.toggle() | ||
|
Uh oh!
There was an error while loading. Please reload this page.