Skip to content

Commit 93abfcb

Browse files
authored
Move CharacterClass API into RegexBuilder (#254)
Makes the existing CharacterClass model type SPI, and adds a public CharacterClass type to the RegexBuilder module, which uses a DSLTree char class instead of the AST's version. RegexBuilder.CharacterClass is a more limited API than we need for the internal character class model, giving us room to expand on it as necessary in the future.
1 parent 3b77fe4 commit 93abfcb

File tree

6 files changed

+338
-35
lines changed

6 files changed

+338
-35
lines changed
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
import _RegexParser
13+
@_spi(RegexBuilder) import _StringProcessing
14+
15+
/// A character class usable as a component in `RegexBuilder`.
///
/// The class is stored as a `DSLTree.CustomCharacterClass`; every initializer
/// below ends up producing one of those.
public struct CharacterClass {
  /// The DSL-tree custom character class backing this value.
  internal var ccc: DSLTree.CustomCharacterClass

  /// Wraps an already-built DSL-tree custom character class.
  init(_ ccc: DSLTree.CustomCharacterClass) {
    self.ccc = ccc
  }

  /// Creates a class from a `_CharacterClassModel` by asking the model for its
  /// AST form and embedding the resulting atom as a single unconverted member.
  ///
  /// Traps with "Unsupported _CharacterClassModel" when the model's AST form
  /// is anything other than an atom.
  init(unconverted model: _CharacterClassModel) {
    // FIXME: Implement in DSLTree instead of wrapping an AST atom
    guard case .atom(let wrappedAtom) = model.makeAST() else {
      fatalError("Unsupported _CharacterClassModel")
    }
    self.init(
      DSLTree.CustomCharacterClass(members: [.atom(.unconverted(wrappedAtom))]))
  }

  /// Creates a class whose single member is an unconverted AST atom carrying
  /// the given character property.
  init(property: AST.Atom.CharacterProperty) {
    // FIXME: Implement in DSLTree instead of wrapping an AST atom
    let wrappedAtom = AST.Atom(.property(property), .fake)
    self.init(
      DSLTree.CustomCharacterClass(members: [.atom(.unconverted(wrappedAtom))]))
  }
}
38+
39+
extension CharacterClass: RegexComponent {
  /// The regex for this character class: a `Regex<Substring>` built from a
  /// `customCharacterClass` DSL node that holds the underlying `ccc`.
  public var regex: Regex<Substring> {
    let node: DSLTree.Node = .customCharacterClass(ccc)
    return Regex(node: node)
  }
}
44+
45+
extension CharacterClass {
  /// A copy of this character class wrapping the inverted form of its
  /// underlying custom character class (`ccc.inverted`).
  public var inverted: CharacterClass {
    let flipped = ccc.inverted
    return CharacterClass(flipped)
  }
}
50+
51+
extension RegexComponent where Self == CharacterClass {
  /// A character class whose single member is the `.any` atom.
  public static var any: CharacterClass {
    let members: [DSLTree.CustomCharacterClass.Member] = [.atom(.any)]
    return CharacterClass(DSLTree.CustomCharacterClass(members: members))
  }

  /// A character class wrapping the `.anyGrapheme` character class model.
  public static var anyGrapheme: CharacterClass {
    CharacterClass(unconverted: .anyGrapheme)
  }

  /// A character class wrapping the `.whitespace` character class model.
  public static var whitespace: CharacterClass {
    CharacterClass(unconverted: .whitespace)
  }

  /// A character class wrapping the `.digit` character class model.
  public static var digit: CharacterClass {
    CharacterClass(unconverted: .digit)
  }

  /// A character class made of the ranges `A...F`, `a...f` and `0...9`.
  public static var hexDigit: CharacterClass {
    let members: [DSLTree.CustomCharacterClass.Member] = [
      .range(.char("A"), .char("F")),
      .range(.char("a"), .char("f")),
      .range(.char("0"), .char("9")),
    ]
    return CharacterClass(DSLTree.CustomCharacterClass(members: members))
  }

  /// A character class wrapping the `.horizontalWhitespace` character class
  /// model.
  public static var horizontalWhitespace: CharacterClass {
    CharacterClass(unconverted: .horizontalWhitespace)
  }

  /// A character class wrapping the `.newlineSequence` character class model.
  public static var newlineSequence: CharacterClass {
    CharacterClass(unconverted: .newlineSequence)
  }

  /// A character class wrapping the `.verticalWhitespace` character class
  /// model.
  public static var verticalWhitespace: CharacterClass {
    CharacterClass(unconverted: .verticalWhitespace)
  }

  /// A character class wrapping the `.word` character class model.
  public static var word: CharacterClass {
    CharacterClass(unconverted: .word)
  }
}
92+
93+
extension RegexComponent where Self == CharacterClass {
  /// Builds a character class with one `.char` atom member per character of
  /// the given string or sequence, in iteration order.
  ///
  /// - Parameter s: The characters that make up the class.
  public static func anyOf<S: Sequence>(_ s: S) -> CharacterClass
    where S.Element == Character
  {
    let members: [DSLTree.CustomCharacterClass.Member] =
      s.map { character in .atom(.char(character)) }
    return CharacterClass(DSLTree.CustomCharacterClass(members: members))
  }

  /// Builds a character class with one `.scalar` atom member per Unicode
  /// scalar of the given sequence, in iteration order.
  ///
  /// - Parameter s: The Unicode scalars that make up the class.
  public static func anyOf<S: Sequence>(_ s: S) -> CharacterClass
    where S.Element == UnicodeScalar
  {
    let members: [DSLTree.CustomCharacterClass.Member] =
      s.map { scalar in .atom(.scalar(scalar)) }
    return CharacterClass(DSLTree.CustomCharacterClass(members: members))
  }
}
112+
113+
// Unicode properties
114+
extension CharacterClass {
  /// Builds a character class from a Unicode general category.
  ///
  /// The category is first mapped to its `Unicode.ExtendedGeneralCategory`
  /// counterpart and then wrapped in a non-inverted, non-POSIX character
  /// property. Traps with "Unexpected general category" when no counterpart
  /// exists.
  ///
  /// - Parameter category: The standard-library general category to use.
  public static func generalCategory(_ category: Unicode.GeneralCategory) -> CharacterClass {
    if let extended = category.extendedGeneralCategory {
      let property = AST.Atom.CharacterProperty(
        .generalCategory(extended), isInverted: false, isPOSIX: false)
      return CharacterClass(property: property)
    }
    fatalError("Unexpected general category")
  }
}
123+
124+
/// Range syntax for characters in `CharacterClass`es.
///
/// Produces a non-inverted class whose only member is the range running from
/// `lhs` to `rhs`, both expressed as `.char` atoms.
public func ...(lhs: Character, rhs: Character) -> CharacterClass {
  CharacterClass(
    DSLTree.CustomCharacterClass(
      members: [.range(.char(lhs), .char(rhs))],
      isInverted: false))
}
130+
131+
/// Range syntax for unicode scalars in `CharacterClass`es.
///
/// Produces a non-inverted class whose only member is the range running from
/// `lhs` to `rhs`, both expressed as `.scalar` atoms. Marked as a disfavored
/// overload relative to the `Character` version of this operator.
@_disfavoredOverload
public func ...(lhs: UnicodeScalar, rhs: UnicodeScalar) -> CharacterClass {
  CharacterClass(
    DSLTree.CustomCharacterClass(
      members: [.range(.scalar(lhs), .scalar(rhs))],
      isInverted: false))
}
138+
139+
extension Unicode.GeneralCategory {
  /// The `Unicode.ExtendedGeneralCategory` case with the same name as this
  /// general category, or `nil` for a case this switch does not know about.
  var extendedGeneralCategory: Unicode.ExtendedGeneralCategory? {
    switch self {
    // Letters
    case .uppercaseLetter:
      return .uppercaseLetter
    case .lowercaseLetter:
      return .lowercaseLetter
    case .titlecaseLetter:
      return .titlecaseLetter
    case .modifierLetter:
      return .modifierLetter
    case .otherLetter:
      return .otherLetter

    // Marks
    case .nonspacingMark:
      return .nonspacingMark
    case .spacingMark:
      return .spacingMark
    case .enclosingMark:
      return .enclosingMark

    // Numbers
    case .decimalNumber:
      return .decimalNumber
    case .letterNumber:
      return .letterNumber
    case .otherNumber:
      return .otherNumber

    // Punctuation
    case .connectorPunctuation:
      return .connectorPunctuation
    case .dashPunctuation:
      return .dashPunctuation
    case .openPunctuation:
      return .openPunctuation
    case .closePunctuation:
      return .closePunctuation
    case .initialPunctuation:
      return .initialPunctuation
    case .finalPunctuation:
      return .finalPunctuation
    case .otherPunctuation:
      return .otherPunctuation

    // Symbols
    case .mathSymbol:
      return .mathSymbol
    case .currencySymbol:
      return .currencySymbol
    case .modifierSymbol:
      return .modifierSymbol
    case .otherSymbol:
      return .otherSymbol

    // Separators
    case .spaceSeparator:
      return .spaceSeparator
    case .lineSeparator:
      return .lineSeparator
    case .paragraphSeparator:
      return .paragraphSeparator

    // Other
    case .control:
      return .control
    case .format:
      return .format
    case .surrogate:
      return .surrogate
    case .privateUse:
      return .privateUse
    case .unassigned:
      return .unassigned

    // Cases added to the standard library after this mapping was written
    // have no known counterpart.
    @unknown default:
      return nil
    }
  }
}
176+
177+
// MARK: - Set algebra methods
178+
179+
extension RegexComponent where Self == CharacterClass {
  /// Combines one or more character classes into a single class.
  ///
  /// With only `first`, the result wraps `first`'s underlying class directly.
  /// With additional classes, the result holds each input (in argument order)
  /// as a nested `.custom` member.
  ///
  /// - Parameters:
  ///   - first: The first character class to include.
  ///   - rest: Any further character classes to include.
  public init(_ first: CharacterClass, _ rest: CharacterClass...) {
    if !rest.isEmpty {
      let everyClass = [first] + rest
      let members = everyClass.map {
        DSLTree.CustomCharacterClass.Member.custom($0.ccc)
      }
      self.init(DSLTree.CustomCharacterClass(members: members))
    } else {
      self.init(first.ccc)
    }
  }
}
190+
191+
extension CharacterClass {
  /// Returns a class holding this class and `other` as two nested `.custom`
  /// members.
  public func union(_ other: CharacterClass) -> CharacterClass {
    let members: [DSLTree.CustomCharacterClass.Member] = [
      .custom(ccc),
      .custom(other.ccc),
    ]
    return CharacterClass(DSLTree.CustomCharacterClass(members: members))
  }

  /// Returns a class whose single member is the `.intersection` of this
  /// class's underlying class with `other`'s.
  public func intersection(_ other: CharacterClass) -> CharacterClass {
    let member: DSLTree.CustomCharacterClass.Member =
      .intersection(ccc, other.ccc)
    return CharacterClass(DSLTree.CustomCharacterClass(members: [member]))
  }

  /// Returns a class whose single member is the `.subtraction` of `other`'s
  /// underlying class from this class's.
  public func subtracting(_ other: CharacterClass) -> CharacterClass {
    let member: DSLTree.CustomCharacterClass.Member =
      .subtraction(ccc, other.ccc)
    return CharacterClass(DSLTree.CustomCharacterClass(members: [member]))
  }

  /// Returns a class whose single member is the `.symmetricDifference` of
  /// this class's underlying class and `other`'s.
  public func symmetricDifference(_ other: CharacterClass) -> CharacterClass {
    let member: DSLTree.CustomCharacterClass.Member =
      .symmetricDifference(ccc, other.ccc)
    return CharacterClass(DSLTree.CustomCharacterClass(members: [member]))
  }
}

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,15 +136,15 @@ extension Compiler.ByteCodeGen {
136136
// TODO: May want to consider Unicode level
137137
builder.buildAssert { [options] (input, pos, bounds) in
138138
// TODO: How should we handle bounds?
139-
CharacterClass.word.isBoundary(
139+
_CharacterClassModel.word.isBoundary(
140140
input, at: pos, bounds: bounds, with: options)
141141
}
142142

143143
case .notWordBoundary:
144144
// TODO: May want to consider Unicode level
145145
builder.buildAssert { [options] (input, pos, bounds) in
146146
// TODO: How should we handle bounds?
147-
!CharacterClass.word.isBoundary(
147+
!_CharacterClassModel.word.isBoundary(
148148
input, at: pos, bounds: bounds, with: options)
149149
}
150150
}
@@ -595,7 +595,15 @@ extension Compiler.ByteCodeGen {
595595
try emitQuantification(amt, kind, child)
596596

597597
case let .customCharacterClass(ccc):
598-
try emitCustomCharacterClass(ccc)
598+
if ccc.containsAny {
599+
if !ccc.isInverted {
600+
emitAny()
601+
} else {
602+
throw Unsupported("Inverted any")
603+
}
604+
} else {
605+
try emitCustomCharacterClass(ccc)
606+
}
599607

600608
case let .atom(a):
601609
try emitAtom(a)

Sources/_StringProcessing/MatchingOptions.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ extension MatchingOptions {
105105
// Deprecated _CharacterClassModel.MatchLevel API
106106
extension MatchingOptions {
107107
@available(*, deprecated)
108-
var matchLevel: CharacterClass.MatchLevel {
108+
var matchLevel: _CharacterClassModel.MatchLevel {
109109
switch semanticLevel {
110110
case .graphemeCluster:
111111
return .graphemeCluster

Sources/_StringProcessing/Regex/DSLTree.swift

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,31 @@ extension DSLTree {
107107
public struct CustomCharacterClass {
108108
var members: [Member]
109109
var isInverted: Bool
110+
111+
var containsAny: Bool {
112+
members.contains { member in
113+
switch member {
114+
case .atom(.any): return true
115+
case .custom(let ccc): return ccc.containsAny
116+
default:
117+
return false
118+
}
119+
}
120+
}
121+
122+
public init(members: [DSLTree.CustomCharacterClass.Member], isInverted: Bool = false) {
123+
self.members = members
124+
self.isInverted = isInverted
125+
}
126+
127+
public var inverted: CustomCharacterClass {
128+
var result = self
129+
result.isInverted.toggle()
130+
return result
131+
}
110132

111-
enum Member {
133+
@_spi(RegexBuilder)
134+
public enum Member {
112135
case atom(Atom)
113136
case range(Atom, Atom)
114137
case custom(CustomCharacterClass)

0 commit comments

Comments
 (0)