Skip to content

Move CharacterClass API into RegexBuilder #254

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 215 additions & 0 deletions Sources/RegexBuilder/CharacterClass.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

import _RegexParser
@_spi(RegexBuilder) import _StringProcessing

/// A regex component that wraps a `DSLTree.CustomCharacterClass`.
public struct CharacterClass {
  /// The underlying DSL-tree custom character class.
  internal var ccc: DSLTree.CustomCharacterClass

  /// Creates a character class that wraps the given DSL-tree custom
  /// character class as-is.
  init(_ ccc: DSLTree.CustomCharacterClass) {
    self.ccc = ccc
  }

  /// Creates a character class from a `_CharacterClassModel` by wrapping the
  /// AST atom produced by `model.makeAST()` as an unconverted atom.
  ///
  /// Traps if `makeAST()` produces anything other than an `.atom` node.
  init(unconverted model: _CharacterClassModel) {
    // FIXME: Implement in DSLTree instead of wrapping an AST atom
    guard case .atom(let wrappedAtom) = model.makeAST() else {
      fatalError("Unsupported _CharacterClassModel")
    }
    let member: DSLTree.CustomCharacterClass.Member =
      .atom(.unconverted(wrappedAtom))
    self.ccc = DSLTree.CustomCharacterClass(members: [member])
  }

  /// Creates a character class from a character property by wrapping a
  /// `.property` AST atom (built with a `.fake` location) as an unconverted
  /// atom.
  init(property: AST.Atom.CharacterProperty) {
    // FIXME: Implement in DSLTree instead of wrapping an AST atom
    let propertyAtom = AST.Atom(.property(property), .fake)
    let member: DSLTree.CustomCharacterClass.Member =
      .atom(.unconverted(propertyAtom))
    self.ccc = DSLTree.CustomCharacterClass(members: [member])
  }
}

extension CharacterClass: RegexComponent {
  /// A regex whose root node is this value's custom character class.
  public var regex: Regex<Substring> {
    let root: DSLTree.Node = .customCharacterClass(ccc)
    return Regex(node: root)
  }
}

extension CharacterClass {
  /// A copy of this character class whose underlying custom character class
  /// has been replaced by its `inverted` counterpart.
  public var inverted: CharacterClass {
    var flipped = self
    flipped.ccc = ccc.inverted
    return flipped
  }
}

extension RegexComponent where Self == CharacterClass {
  /// A character class whose only member is the DSL `.any` atom.
  public static var any: CharacterClass {
    let anyMember: DSLTree.CustomCharacterClass.Member = .atom(.any)
    return CharacterClass(DSLTree.CustomCharacterClass(members: [anyMember]))
  }

  /// A character class backed by the `anyGrapheme` character class model.
  public static var anyGrapheme: CharacterClass {
    CharacterClass(unconverted: .anyGrapheme)
  }

  /// A character class backed by the `whitespace` character class model.
  public static var whitespace: CharacterClass {
    CharacterClass(unconverted: .whitespace)
  }

  /// A character class backed by the `digit` character class model.
  public static var digit: CharacterClass {
    CharacterClass(unconverted: .digit)
  }

  /// A character class made of the ranges `A`-`F`, `a`-`f`, and `0`-`9`.
  public static var hexDigit: CharacterClass {
    let hexRanges: [DSLTree.CustomCharacterClass.Member] = [
      .range(.char("A"), .char("F")),
      .range(.char("a"), .char("f")),
      .range(.char("0"), .char("9")),
    ]
    return CharacterClass(DSLTree.CustomCharacterClass(members: hexRanges))
  }

  /// A character class backed by the `horizontalWhitespace` character class
  /// model.
  public static var horizontalWhitespace: CharacterClass {
    CharacterClass(unconverted: .horizontalWhitespace)
  }

  /// A character class backed by the `newlineSequence` character class model.
  public static var newlineSequence: CharacterClass {
    CharacterClass(unconverted: .newlineSequence)
  }

  /// A character class backed by the `verticalWhitespace` character class
  /// model.
  public static var verticalWhitespace: CharacterClass {
    CharacterClass(unconverted: .verticalWhitespace)
  }

  /// A character class backed by the `word` character class model.
  public static var word: CharacterClass {
    CharacterClass(unconverted: .word)
  }
}

extension RegexComponent where Self == CharacterClass {
  /// Returns a character class with one `.char` atom member per character of
  /// the given string or sequence, in iteration order.
  public static func anyOf<S: Sequence>(_ s: S) -> CharacterClass
    where S.Element == Character
  {
    let characterMembers: [DSLTree.CustomCharacterClass.Member] =
      s.map { .atom(.char($0)) }
    return CharacterClass(
      DSLTree.CustomCharacterClass(members: characterMembers))
  }

  /// Returns a character class with one `.scalar` atom member per unicode
  /// scalar of the given sequence, in iteration order.
  public static func anyOf<S: Sequence>(_ s: S) -> CharacterClass
    where S.Element == UnicodeScalar
  {
    let scalarMembers: [DSLTree.CustomCharacterClass.Member] =
      s.map { .atom(.scalar($0)) }
    return CharacterClass(
      DSLTree.CustomCharacterClass(members: scalarMembers))
  }
}

// Unicode properties
extension CharacterClass {
  /// Returns a character class for the given Unicode general category.
  ///
  /// The category is mapped to its `Unicode.ExtendedGeneralCategory`
  /// counterpart and wrapped in a non-inverted, non-POSIX character property.
  /// Traps if the category has no extended counterpart.
  public static func generalCategory(_ category: Unicode.GeneralCategory) -> CharacterClass {
    if let extended = category.extendedGeneralCategory {
      let property = AST.Atom.CharacterProperty(
        .generalCategory(extended), isInverted: false, isPOSIX: false)
      return CharacterClass(property: property)
    }
    fatalError("Unexpected general category")
  }
}

/// Range syntax for characters in `CharacterClass`es.
///
/// Builds a custom character class with a single `.range` member whose
/// bounds are the `.char` atoms for `lhs` and `rhs`, and wraps it in a
/// `CharacterClass`. The resulting class is explicitly not inverted.
///
/// - Parameters:
///   - lhs: The character used as the first bound of the range.
///   - rhs: The character used as the second bound of the range.
/// - Returns: A `CharacterClass` wrapping the single-range custom class.
public func ...(lhs: Character, rhs: Character) -> CharacterClass {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be useful to define these operators as static methods on extensions?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think(?) we get any benefit from making them static methods — that syntax is to support implementation of protocol requirements, which isn't the case here, and all operator functions end up in the global namespace in any case.

let range: DSLTree.CustomCharacterClass.Member = .range(.char(lhs), .char(rhs))
let ccc = DSLTree.CustomCharacterClass(members: [range], isInverted: false)
return CharacterClass(ccc)
}

/// Range syntax for unicode scalars in `CharacterClass`es.
///
/// Produces a non-inverted character class with a single `.range` member
/// bounded by the `.scalar` atoms for `lhs` and `rhs`. Marked as a disfavored
/// overload relative to the `Character` variant.
@_disfavoredOverload
public func ...(lhs: UnicodeScalar, rhs: UnicodeScalar) -> CharacterClass {
  CharacterClass(
    DSLTree.CustomCharacterClass(
      members: [.range(.scalar(lhs), .scalar(rhs))],
      isInverted: false))
}

extension Unicode.GeneralCategory {
  /// The `Unicode.ExtendedGeneralCategory` case with the same name as this
  /// general category, or `nil` for a category not listed here.
  var extendedGeneralCategory: Unicode.ExtendedGeneralCategory? {
    let extended: Unicode.ExtendedGeneralCategory?
    switch self {
    // Letters
    case .uppercaseLetter:      extended = .uppercaseLetter
    case .lowercaseLetter:      extended = .lowercaseLetter
    case .titlecaseLetter:      extended = .titlecaseLetter
    case .modifierLetter:       extended = .modifierLetter
    case .otherLetter:          extended = .otherLetter
    // Marks
    case .nonspacingMark:       extended = .nonspacingMark
    case .spacingMark:          extended = .spacingMark
    case .enclosingMark:        extended = .enclosingMark
    // Numbers
    case .decimalNumber:        extended = .decimalNumber
    case .letterNumber:         extended = .letterNumber
    case .otherNumber:          extended = .otherNumber
    // Punctuation
    case .connectorPunctuation: extended = .connectorPunctuation
    case .dashPunctuation:      extended = .dashPunctuation
    case .openPunctuation:      extended = .openPunctuation
    case .closePunctuation:     extended = .closePunctuation
    case .initialPunctuation:   extended = .initialPunctuation
    case .finalPunctuation:     extended = .finalPunctuation
    case .otherPunctuation:     extended = .otherPunctuation
    // Symbols
    case .mathSymbol:           extended = .mathSymbol
    case .currencySymbol:       extended = .currencySymbol
    case .modifierSymbol:       extended = .modifierSymbol
    case .otherSymbol:          extended = .otherSymbol
    // Separators
    case .spaceSeparator:       extended = .spaceSeparator
    case .lineSeparator:        extended = .lineSeparator
    case .paragraphSeparator:   extended = .paragraphSeparator
    // Other
    case .control:              extended = .control
    case .format:               extended = .format
    case .surrogate:            extended = .surrogate
    case .privateUse:           extended = .privateUse
    case .unassigned:           extended = .unassigned
    // Categories added to the standard library after this was written.
    @unknown default:           extended = nil
    }
    return extended
  }
}

// MARK: - Set algebra methods

extension RegexComponent where Self == CharacterClass {
  /// Creates a character class from one or more existing character classes.
  ///
  /// With a single argument, the result wraps that argument's custom
  /// character class directly. With several, each argument becomes a nested
  /// `.custom` member of a new custom character class, in argument order.
  public init(_ first: CharacterClass, _ rest: CharacterClass...) {
    let combined: DSLTree.CustomCharacterClass
    if rest.isEmpty {
      combined = first.ccc
    } else {
      let nested: [DSLTree.CustomCharacterClass.Member] =
        ([first] + rest).map { .custom($0.ccc) }
      combined = DSLTree.CustomCharacterClass(members: nested)
    }
    self.init(combined)
  }
}

extension CharacterClass {
  /// Returns a character class whose members are this class and `other`,
  /// each nested as a `.custom` member.
  public func union(_ other: CharacterClass) -> CharacterClass {
    let nested: [DSLTree.CustomCharacterClass.Member] = [
      .custom(ccc),
      .custom(other.ccc),
    ]
    return CharacterClass(DSLTree.CustomCharacterClass(members: nested))
  }

  /// Returns a character class whose single member is the `.intersection`
  /// of this class and `other`.
  public func intersection(_ other: CharacterClass) -> CharacterClass {
    let member: DSLTree.CustomCharacterClass.Member =
      .intersection(ccc, other.ccc)
    return CharacterClass(DSLTree.CustomCharacterClass(members: [member]))
  }

  /// Returns a character class whose single member is the `.subtraction`
  /// of `other` from this class.
  public func subtracting(_ other: CharacterClass) -> CharacterClass {
    let member: DSLTree.CustomCharacterClass.Member =
      .subtraction(ccc, other.ccc)
    return CharacterClass(DSLTree.CustomCharacterClass(members: [member]))
  }

  /// Returns a character class whose single member is the
  /// `.symmetricDifference` of this class and `other`.
  public func symmetricDifference(_ other: CharacterClass) -> CharacterClass {
    let member: DSLTree.CustomCharacterClass.Member =
      .symmetricDifference(ccc, other.ccc)
    return CharacterClass(DSLTree.CustomCharacterClass(members: [member]))
  }
}
14 changes: 11 additions & 3 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -136,15 +136,15 @@ extension Compiler.ByteCodeGen {
// TODO: May want to consider Unicode level
builder.buildAssert { [options] (input, pos, bounds) in
// TODO: How should we handle bounds?
CharacterClass.word.isBoundary(
_CharacterClassModel.word.isBoundary(
input, at: pos, bounds: bounds, with: options)
}

case .notWordBoundary:
// TODO: May want to consider Unicode level
builder.buildAssert { [options] (input, pos, bounds) in
// TODO: How should we handle bounds?
!CharacterClass.word.isBoundary(
!_CharacterClassModel.word.isBoundary(
input, at: pos, bounds: bounds, with: options)
}
}
Expand Down Expand Up @@ -595,7 +595,15 @@ extension Compiler.ByteCodeGen {
try emitQuantification(amt, kind, child)

case let .customCharacterClass(ccc):
try emitCustomCharacterClass(ccc)
if ccc.containsAny {
if !ccc.isInverted {
emitAny()
} else {
throw Unsupported("Inverted any")
}
} else {
try emitCustomCharacterClass(ccc)
}

case let .atom(a):
try emitAtom(a)
Expand Down
2 changes: 1 addition & 1 deletion Sources/_StringProcessing/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ extension MatchingOptions {
// Deprecated _CharacterClassModel.MatchLevel API
extension MatchingOptions {
@available(*, deprecated)
var matchLevel: CharacterClass.MatchLevel {
var matchLevel: _CharacterClassModel.MatchLevel {
switch semanticLevel {
case .graphemeCluster:
return .graphemeCluster
Expand Down
25 changes: 24 additions & 1 deletion Sources/_StringProcessing/Regex/DSLTree.swift
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,31 @@ extension DSLTree {
public struct CustomCharacterClass {
var members: [Member]
var isInverted: Bool

var containsAny: Bool {
members.contains { member in
switch member {
case .atom(.any): return true
case .custom(let ccc): return ccc.containsAny
default:
return false
}
}
}

public init(members: [DSLTree.CustomCharacterClass.Member], isInverted: Bool = false) {
self.members = members
self.isInverted = isInverted
}

public var inverted: CustomCharacterClass {
var result = self
result.isInverted.toggle()
return result
}

enum Member {
@_spi(RegexBuilder)
public enum Member {
case atom(Atom)
case range(Atom, Atom)
case custom(CustomCharacterClass)
Expand Down
Loading