Skip to content

Add additional Unicode API to RegexBuilder.CharacterClass #435

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 13 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 152 additions & 16 deletions Sources/RegexBuilder/CharacterClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
@_implementationOnly import _RegexParser
@_spi(RegexBuilder) import _StringProcessing

/// A class of characters that match in a regex.
///
/// A character class can represent individual characters, a group of
/// characters, the set of characters that match some set of criteria, or
/// a set algebraic combination of all of the above.
@available(SwiftStdlib 5.7, *)
public struct CharacterClass {
internal var ccc: DSLTree.CustomCharacterClass
Expand All @@ -37,33 +42,45 @@ extension CharacterClass: RegexComponent {

@available(SwiftStdlib 5.7, *)
extension CharacterClass {
  /// The complement of this character class: a class that matches exactly
  /// the characters this one does not.
  public var inverted: CharacterClass {
    let flipped = ccc.inverted
    return CharacterClass(flipped)
  }
}

@available(SwiftStdlib 5.7, *)
extension RegexComponent where Self == CharacterClass {
/// A character class that matches any element.
///
/// This character class is unaffected by the `dotMatchesNewlines()` method.
public static var any: CharacterClass {
  let anything = DSLTree.CustomCharacterClass(members: [.atom(.any)])
  return CharacterClass(anything)
}

/// A character class that matches any element that isn't a newline.
public static var anyNonNewline: CharacterClass {
// NOTE(review): this builds the same single `.atom(.any)` member as the `any`
// property, so as written it does not exclude newlines as the doc comment
// above promises. It presumably needs a dedicated non-newline atom — confirm
// which atom the DSL tree provides before changing it.
.init(DSLTree.CustomCharacterClass(members: [.atom(.any)]))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.any? Aren't these two things the same?

}

/// A character class that always matches one whole `Character` (extended
/// grapheme cluster), whatever semantic level is currently in effect.
public static var anyGrapheme: CharacterClass {
  return CharacterClass(unconverted: .anyGrapheme)
}

/// A character class that always matches one Unicode scalar, whatever
/// semantic level is currently in effect.
public static var anyUnicodeScalar: CharacterClass {
  return CharacterClass(unconverted: .anyUnicodeScalar)
}

public static var whitespace: CharacterClass {
.init(unconverted: .whitespace)
}

/// A character class that matches any digit.
public static var digit: CharacterClass {
  return CharacterClass(unconverted: .digit)
}

/// A character class that matches any hexadecimal digit.
public static var hexDigit: CharacterClass {
.init(DSLTree.CustomCharacterClass(members: [
.range(.char("A"), .char("F")),
Expand All @@ -72,20 +89,32 @@ extension RegexComponent where Self == CharacterClass {
]))
}

public static var horizontalWhitespace: CharacterClass {
.init(unconverted: .horizontalWhitespace)
/// A character class that matches any element that is a "word character".
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to double check, is there any better description than "word character"? "Word character" can be mentioned as an aside but that's more of a historical note. @Azoy does Unicode have another name?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The class of <word_character> includes all the Alphabetic values from the Unicode character database ...
https://unicode.org/reports/tr18/#RL1.4

public static var wordCharacter: CharacterClass {
  return CharacterClass(unconverted: .word)
}

public static var newlineSequence: CharacterClass {
.init(unconverted: .newlineSequence)
/// A character class that matches any element classified as whitespace.
public static var whitespace: CharacterClass {
  return CharacterClass(unconverted: .whitespace)
}

/// A character class that matches any element classified as horizontal
/// whitespace.
public static var horizontalWhitespace: CharacterClass {
  return CharacterClass(unconverted: .horizontalWhitespace)
}

/// A character class that matches any element classified as vertical
/// whitespace.
public static var verticalWhitespace: CharacterClass {
  return CharacterClass(unconverted: .verticalWhitespace)
}

public static var word: CharacterClass {
.init(unconverted: .word)

/// A character class that matches any newline sequence.
public static var newlineSequence: CharacterClass {
  return CharacterClass(unconverted: .newlineSequence)
}
}

Expand All @@ -108,14 +137,111 @@ extension RegexComponent where Self == CharacterClass {
CharacterClass(DSLTree.CustomCharacterClass(
members: s.map { .atom(.scalar($0)) }))
}

/// Returns a character class that matches every character except those in
/// the given string or sequence.
///
/// - Parameter s: The characters to exclude.
public static func noneOf<S: Sequence>(_ s: S) -> CharacterClass
  where S.Element == Character
{
  let excluded: [DSLTree.CustomCharacterClass.Member] =
    s.map { .atom(.char($0)) }
  let listed = CharacterClass(DSLTree.CustomCharacterClass(members: excluded))
  return listed.inverted
}

/// Returns a character class that matches every element except the Unicode
/// scalars in the given sequence.
///
/// - Parameter s: The Unicode scalars to exclude.
public static func noneOf<S: Sequence>(_ s: S) -> CharacterClass
  where S.Element == UnicodeScalar
{
  let excluded: [DSLTree.CustomCharacterClass.Member] =
    s.map { .atom(.scalar($0)) }
  let listed = CharacterClass(DSLTree.CustomCharacterClass(members: excluded))
  return listed.inverted
}
}

// Unicode properties
@available(SwiftStdlib 5.7, *)
extension CharacterClass {
extension RegexComponent where Self == CharacterClass {
/// Returns a character class that matches any element with the given Unicode
/// general category.
///
/// For example, when passed `.uppercaseLetter`, this method is equivalent to
/// `/\p{Uppercase_Letter}/` or `/\p{Lu}/`.
///
/// - Parameter category: The general category to match.
public static func generalCategory(_ category: Unicode.GeneralCategory) -> CharacterClass {
  .init(DSLTree.CustomCharacterClass.generalCategory(category))
}

/// Returns a character class that matches any element with the given Unicode
/// binary property.
///
/// For example, when passed `\.isAlphabetic`, this method is equivalent to
/// `/\p{Alphabetic}/` or `/\p{Is_Alphabetic=true}/`.
///
/// - Parameters:
///   - property: A key path to a Boolean Unicode scalar property.
///   - value: The value the property must have to match; defaults to `true`.
/// - Important: The underlying `DSLTree.CustomCharacterClass.binaryProperty`
///   calls `fatalError` for key paths it has no mapping for.
public static func binaryProperty(_ property: KeyPath<UnicodeScalar.Properties, Bool>, value: Bool = true) -> CharacterClass {
  .init(DSLTree.CustomCharacterClass.binaryProperty(property, value: value))
}

/// Returns a character class that matches any element with the given Unicode
/// name.
///
/// This method is equivalent to `/\p{Name=name}/`.
///
/// - Parameter name: The Unicode name to match.
public static func name(_ name: String) -> CharacterClass {
  .init(DSLTree.CustomCharacterClass.named(name))
}

/// Returns a character class that matches any element that was included in
/// the specified Unicode version.
///
/// This method is equivalent to `/\p{Age=version}/`.
///
/// - Parameter version: The Unicode version to test against.
public static func age(_ version: Unicode.Version) -> CharacterClass {
  .init(DSLTree.CustomCharacterClass.age(version))
}

/// Returns a character class that matches any element with the given Unicode
/// numeric type.
///
/// This method is equivalent to `/\p{Numeric_Type=type}/`.
///
/// - Parameter type: The numeric type to match.
public static func numericType(_ type: Unicode.NumericType) -> CharacterClass {
  .init(DSLTree.CustomCharacterClass.numericType(type))
}

/// Returns a character class that matches any element with the given numeric
/// value.
///
/// This method is equivalent to `/\p{Numeric_Value=value}/`.
///
/// - Parameter value: The numeric value to match.
public static func numericValue(_ value: Double) -> CharacterClass {
  .init(DSLTree.CustomCharacterClass.numericValue(value))
}

/// Returns a character class that matches any element with the given Unicode
/// canonical combining class.
///
/// This method is equivalent to
/// `/\p{Canonical_Combining_Class=combiningClass}/`.
///
/// - Parameter combiningClass: The canonical combining class to match.
public static func canonicalCombiningClass(_ combiningClass: Unicode.CanonicalCombiningClass) -> CharacterClass {
  .init(DSLTree.CustomCharacterClass.ccc(combiningClass))
}

/// Returns a character class that matches any element with the given
/// lowercase mapping.
///
/// This method is equivalent to `/\p{Lowercase_Mapping=value}/`.
///
/// - Parameter value: The lowercase mapping to match.
public static func lowercaseMapping(_ value: String) -> CharacterClass {
  .init(DSLTree.CustomCharacterClass.lowercaseMapping(value))
}

/// Returns a character class that matches any element with the given
/// uppercase mapping.
///
/// This method is equivalent to `/\p{Uppercase_Mapping=value}/`.
///
/// - Parameter value: The uppercase mapping to match.
public static func uppercaseMapping(_ value: String) -> CharacterClass {
  .init(DSLTree.CustomCharacterClass.uppercaseMapping(value))
}

/// Returns a character class that matches any element with the given
/// titlecase mapping.
///
/// This method is equivalent to `/\p{Titlecase_Mapping=value}/`.
///
/// - Parameter value: The titlecase mapping to match.
public static func titlecaseMapping(_ value: String) -> CharacterClass {
  .init(DSLTree.CustomCharacterClass.titlecaseMapping(value))
}
}

/// Returns a character class that includes the characters in the given range.
Expand All @@ -139,37 +265,47 @@ public func ...(lhs: UnicodeScalar, rhs: UnicodeScalar) -> CharacterClass {

@available(SwiftStdlib 5.7, *)
extension RegexComponent where Self == CharacterClass {
  /// Creates a character class that combines all the given character classes
  /// via union.
  ///
  /// - Parameters:
  ///   - first: The first character class to include.
  ///   - rest: Any further character classes to include.
  public init(_ first: CharacterClass, _ rest: CharacterClass...) {
    if rest.isEmpty {
      // A lone class needs no wrapper: reuse its underlying custom class.
      self.init(first.ccc)
    } else {
      // Declare `members` exactly once; each input class is nested as a
      // `.custom` member of the combined class.
      var members: [DSLTree.CustomCharacterClass.Member] = [.custom(first.ccc)]
      members.append(contentsOf: rest.lazy.map { .custom($0.ccc) })
      self.init(.init(members: members))
    }
  }
}

@available(SwiftStdlib 5.7, *)
extension CharacterClass {
/// Returns a character class that matches the union of this class and the
/// given class.
///
/// - Parameter other: The character class to combine with this one.
public func union(_ other: CharacterClass) -> CharacterClass {
  let members: [DSLTree.CustomCharacterClass.Member] = [
    .custom(ccc),
    .custom(other.ccc),
  ]
  return CharacterClass(DSLTree.CustomCharacterClass(members: members))
}

/// Returns a character class that matches the intersection of this class
/// and the given class.
///
/// - Parameter other: The character class to intersect with this one.
public func intersection(_ other: CharacterClass) -> CharacterClass {
  let members: [DSLTree.CustomCharacterClass.Member] = [
    .intersection(ccc, other.ccc)
  ]
  return CharacterClass(DSLTree.CustomCharacterClass(members: members))
}


/// Returns a character class that matches the difference of this class and
/// the given class.
///
/// - Parameter other: The character class to subtract from this one.
public func subtracting(_ other: CharacterClass) -> CharacterClass {
  let members: [DSLTree.CustomCharacterClass.Member] = [
    .subtraction(ccc, other.ccc)
  ]
  return CharacterClass(DSLTree.CustomCharacterClass(members: members))
}


/// Returns a character class that matches the symmetric difference of
/// this class and the given class.
public func symmetricDifference(_ other: CharacterClass) -> CharacterClass {
CharacterClass(.init(members: [
.symmetricDifference(self.ccc, other.ccc)
Expand Down
114 changes: 112 additions & 2 deletions Sources/_StringProcessing/Regex/DSLTree.swift
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,122 @@ extension DSLTree {
self.isInverted = isInverted
}

public static func generalCategory(_ category: Unicode.GeneralCategory) -> Self {
let property = AST.Atom.CharacterProperty(.generalCategory(category.extendedGeneralCategory!), isInverted: false, isPOSIX: false)
/// Wraps a character-property kind in a custom character class.
///
/// The kind becomes a non-inverted, non-POSIX `AST.Atom.CharacterProperty`,
/// which is stored as the single `.unconverted` atom member of the result.
///
/// - Parameter property: The property kind to wrap.
private static func astCharacterProperty(
  _ property: AST.Atom.CharacterProperty.Kind
) -> Self {
  let characterProperty = AST.Atom.CharacterProperty(
    property, isInverted: false, isPOSIX: false)
  let atom = AST.Atom(.property(characterProperty), .fake)
  return Self(members: [.atom(.unconverted(.init(ast: atom)))])
}

/// Returns a custom character class for the given Unicode general category.
///
/// - Parameter category: The general category to match.
/// - Important: Force-unwraps `category.extendedGeneralCategory`, so this
///   traps if that value is `nil`.
public static func generalCategory(_ category: Unicode.GeneralCategory) -> Self {
  let extended = category.extendedGeneralCategory!
  return astCharacterProperty(.generalCategory(extended))
}

/// Returns a custom character class for the Unicode binary property that the
/// given key path identifies.
///
/// The key path is compared against a fixed list of
/// `UnicodeScalar.Properties` members and translated to the matching
/// `Unicode.BinaryProperty` case.
///
/// - Parameters:
///   - property: A key path to a Boolean Unicode scalar property.
///   - value: Passed through as the `value:` of the `.binary` property kind.
/// - Returns: The result of `astCharacterProperty` for the mapped property.
/// - Important: Calls `fatalError` when `property` is not one of the key
///   paths listed in the body.
public static func binaryProperty(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we get this code in a different file?

_ property: KeyPath<UnicodeScalar.Properties, Bool>,
value: Bool
) -> Self {
// Stays nil when neither switch below recognizes the key path.
var binaryProperty: Unicode.BinaryProperty? = nil
switch property {
case \.isAlphabetic: binaryProperty = .alphabetic
case \.isASCIIHexDigit: binaryProperty = .asciiHexDigit
case \.isBidiControl: binaryProperty = .bidiControl
case \.isBidiMirrored: binaryProperty = .bidiMirrored
case \.isCased: binaryProperty = .cased
case \.isCaseIgnorable: binaryProperty = .caseIgnorable
case \.changesWhenCaseFolded: binaryProperty = .changesWhenCasefolded
case \.changesWhenCaseMapped: binaryProperty = .changesWhenCasemapped
case \.changesWhenNFKCCaseFolded: binaryProperty = .changesWhenNFKCCasefolded
case \.changesWhenLowercased: binaryProperty = .changesWhenLowercased
case \.changesWhenTitlecased: binaryProperty = .changesWhenTitlecased
case \.changesWhenUppercased: binaryProperty = .changesWhenUppercased
case \.isDash: binaryProperty = .dash
case \.isDefaultIgnorableCodePoint: binaryProperty = .defaultIgnorableCodePoint
case \.isDeprecated: binaryProperty = .deprecated
// NOTE(review): `.diacratic` (and `.unifiedIdiograph` below) presumably
// mirror the case names declared on `Unicode.BinaryProperty` — confirm
// before "correcting" the spelling here.
case \.isDiacritic: binaryProperty = .diacratic
case \.isExtender: binaryProperty = .extender
case \.isFullCompositionExclusion: binaryProperty = .fullCompositionExclusion
case \.isGraphemeBase: binaryProperty = .graphemeBase
case \.isGraphemeExtend: binaryProperty = .graphemeExtended
case \.isHexDigit: binaryProperty = .hexDigit
case \.isIDContinue: binaryProperty = .idContinue
case \.isIdeographic: binaryProperty = .ideographic
case \.isIDStart: binaryProperty = .idStart
case \.isIDSBinaryOperator: binaryProperty = .idsBinaryOperator
case \.isIDSTrinaryOperator: binaryProperty = .idsTrinaryOperator
case \.isJoinControl: binaryProperty = .joinControl
case \.isLogicalOrderException: binaryProperty = .logicalOrderException
case \.isLowercase: binaryProperty = .lowercase
case \.isMath: binaryProperty = .math
case \.isNoncharacterCodePoint: binaryProperty = .noncharacterCodePoint
case \.isPatternSyntax: binaryProperty = .patternSyntax
case \.isPatternWhitespace: binaryProperty = .patternWhitespace
case \.isQuotationMark: binaryProperty = .quotationMark
case \.isRadical: binaryProperty = .radical
case \.isSoftDotted: binaryProperty = .softDotted
case \.isSentenceTerminal: binaryProperty = .sentenceTerminal
case \.isTerminalPunctuation: binaryProperty = .terminalPunctuation
case \.isUnifiedIdeograph: binaryProperty = .unifiedIdiograph
case \.isUppercase: binaryProperty = .uppercase
case \.isVariationSelector: binaryProperty = .variationSelector
case \.isWhitespace: binaryProperty = .whitespace
case \.isXIDContinue: binaryProperty = .xidContinue
case \.isXIDStart: binaryProperty = .xidStart
default:
// The emoji key paths are matched in a second switch behind an availability
// check. NOTE(review): presumably because those properties carry this
// availability in the standard library — confirm.
if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) {
// FIXME: Other platforms
switch property {
case \.isEmojiModifierBase: binaryProperty = .emojiModifierBase
case \.isEmojiModifier: binaryProperty = .emojiModifier
case \.isEmoji: binaryProperty = .emoji
case \.isEmojiPresentation: binaryProperty = .emojiPresentation
default:
// Unrecognized key path: leave `binaryProperty` nil so the trap below fires.
break
}
}
}

if let binaryProperty = binaryProperty {
return astCharacterProperty(.binary(binaryProperty, value: value))
} else {
// No mapping was found for the key path, so there is nothing to build.
// FIXME: Support via a _UnicodeScalarPredicate interface?
fatalError("Unsupported Unicode binary property")
}
}

/// Returns a custom character class for the Unicode `age` property, built
/// from the major and minor components of the given version.
///
/// - Parameter version: The Unicode version to test against.
public static func age(_ version: Unicode.Version) -> Self {
  let kind: AST.Atom.CharacterProperty.Kind =
    .age(major: version.major, minor: version.minor)
  return astCharacterProperty(kind)
}

/// Returns a custom character class for the `named` character property.
///
/// - Parameter name: The name to pass to the property kind.
public static func named(_ name: String) -> Self {
  let kind: AST.Atom.CharacterProperty.Kind = .named(name)
  return astCharacterProperty(kind)
}

/// Returns a custom character class for the `numericType` character
/// property.
///
/// - Parameter type: The numeric type to pass to the property kind.
public static func numericType(_ type: Unicode.NumericType) -> Self {
  let kind: AST.Atom.CharacterProperty.Kind = .numericType(type)
  return astCharacterProperty(kind)
}

/// Returns a custom character class for the `numericValue` character
/// property.
///
/// - Parameter value: The numeric value to pass to the property kind.
public static func numericValue(_ value: Double) -> Self {
  let kind: AST.Atom.CharacterProperty.Kind = .numericValue(value)
  return astCharacterProperty(kind)
}

/// Returns a custom character class for the `ccc` (canonical combining
/// class) character property.
///
/// - Parameter combiningClass: The combining class to pass to the property
///   kind.
public static func ccc(_ combiningClass: Unicode.CanonicalCombiningClass) -> Self {
  let kind: AST.Atom.CharacterProperty.Kind = .ccc(combiningClass)
  return astCharacterProperty(kind)
}

/// Returns a custom character class for the `.lowercase` mapping property.
///
/// - Parameter value: The mapping value to pass to the property kind.
public static func lowercaseMapping(_ value: String) -> Self {
  let kind: AST.Atom.CharacterProperty.Kind = .mapping(.lowercase, value)
  return astCharacterProperty(kind)
}

/// Returns a custom character class for the `.uppercase` mapping property.
///
/// - Parameter value: The mapping value to pass to the property kind.
public static func uppercaseMapping(_ value: String) -> Self {
  let kind: AST.Atom.CharacterProperty.Kind = .mapping(.uppercase, value)
  return astCharacterProperty(kind)
}

/// Returns a custom character class for the `.titlecase` mapping property.
///
/// - Parameter value: The mapping value to pass to the property kind.
public static func titlecaseMapping(_ value: String) -> Self {
  let kind: AST.Atom.CharacterProperty.Kind = .mapping(.titlecase, value)
  return astCharacterProperty(kind)
}

public var inverted: CustomCharacterClass {
var result = self
result.isInverted.toggle()
Expand Down
Loading