Skip to content

Commit 3b77fe4

Browse files
authored
Fill out remainder of options API (#246)
This adds methods to RegexComponent for the remainder of the regex options, and passes the current MatchingOptions further down into the consumers so that the correct behavior can be used.
1 parent 52bc932 commit 3b77fe4

File tree

7 files changed

+379
-72
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -111,30 +111,41 @@ extension Compiler.ByteCodeGen {
111111
}
112112

113113
case .startOfLine:
114-
builder.buildAssert { (input, pos, bounds) in
115-
pos == input.startIndex ||
116-
input[input.index(before: pos)].isNewline
114+
if options.anchorsMatchNewlines {
115+
builder.buildAssert { (input, pos, bounds) in
116+
pos == input.startIndex || input[input.index(before: pos)].isNewline
117+
}
118+
} else {
119+
builder.buildAssert { (input, pos, bounds) in
120+
pos == input.startIndex
121+
}
117122
}
118-
123+
119124
case .endOfLine:
120-
builder.buildAssert { (input, pos, bounds) in
121-
pos == input.endIndex || input[pos].isNewline
125+
if options.anchorsMatchNewlines {
126+
builder.buildAssert { (input, pos, bounds) in
127+
pos == input.endIndex || input[pos].isNewline
128+
}
129+
} else {
130+
builder.buildAssert { (input, pos, bounds) in
131+
pos == input.endIndex
132+
}
122133
}
123134

124135
case .wordBoundary:
125136
// TODO: May want to consider Unicode level
126-
builder.buildAssert { (input, pos, bounds) in
137+
builder.buildAssert { [options] (input, pos, bounds) in
127138
// TODO: How should we handle bounds?
128139
CharacterClass.word.isBoundary(
129-
input, at: pos, bounds: bounds)
140+
input, at: pos, bounds: bounds, with: options)
130141
}
131142

132143
case .notWordBoundary:
133144
// TODO: May want to consider Unicode level
134-
builder.buildAssert { (input, pos, bounds) in
145+
builder.buildAssert { [options] (input, pos, bounds) in
135146
// TODO: How should we handle bounds?
136147
!CharacterClass.word.isBoundary(
137-
input, at: pos, bounds: bounds)
148+
input, at: pos, bounds: bounds, with: options)
138149
}
139150
}
140151
}

Sources/_StringProcessing/CharacterClass.swift

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,14 @@ public struct CharacterClass: Hashable {
5959
var op: SetOperator
6060
var rhs: CharacterSetComponent
6161

62-
public func matches(_ c: Character) -> Bool {
62+
public func matches(_ c: Character, with options: MatchingOptions) -> Bool {
6363
switch op {
6464
case .intersection:
65-
return lhs.matches(c) && rhs.matches(c)
65+
return lhs.matches(c, with: options) && rhs.matches(c, with: options)
6666
case .subtraction:
67-
return lhs.matches(c) && !rhs.matches(c)
67+
return lhs.matches(c, with: options) && !rhs.matches(c, with: options)
6868
case .symmetricDifference:
69-
return lhs.matches(c) != rhs.matches(c)
69+
return lhs.matches(c, with: options) != rhs.matches(c, with: options)
7070
}
7171
}
7272
}
@@ -87,14 +87,28 @@ public struct CharacterClass: Hashable {
8787
.setOperation(.init(lhs: lhs, op: op, rhs: rhs))
8888
}
8989

90-
public func matches(_ character: Character) -> Bool {
90+
public func matches(_ character: Character, with options: MatchingOptions) -> Bool {
9191
switch self {
92-
case .character(let c): return c == character
93-
case .range(let range): return range.contains(character)
92+
case .character(let c):
93+
if options.isCaseInsensitive {
94+
return c.lowercased() == character.lowercased()
95+
} else {
96+
return c == character
97+
}
98+
case .range(let range):
99+
if options.isCaseInsensitive {
100+
let newLower = range.lowerBound.lowercased()
101+
let newUpper = range.upperBound.lowercased()
102+
// FIXME: Can this guard ever fail (i.e. can lowercasing invert the bounds)? If so, is returning false the right behavior?
103+
guard newLower <= newUpper else { return false }
104+
return (newLower...newUpper).contains(character.lowercased())
105+
} else {
106+
return range.contains(character)
107+
}
94108
case .characterClass(let custom):
95109
let str = String(character)
96-
return custom.matches(in: str, at: str.startIndex) != nil
97-
case .setOperation(let op): return op.matches(character)
110+
return custom.matches(in: str, at: str.startIndex, with: options) != nil
111+
case .setOperation(let op): return op.matches(character, with: options)
98112
}
99113
}
100114
}
@@ -135,21 +149,26 @@ public struct CharacterClass: Hashable {
135149

136150
/// Returns the end of the match of this character class in `str`, if
137151
/// it matches.
138-
public func matches(in str: String, at i: String.Index) -> String.Index? {
152+
public func matches(in str: String, at i: String.Index, with options: MatchingOptions) -> String.Index? {
139153
switch matchLevel {
140154
case .graphemeCluster:
141155
let c = str[i]
142156
var matched: Bool
143157
switch cc {
144158
case .any, .anyGrapheme: matched = true
145-
case .digit: matched = c.isNumber
146-
case .hexDigit: matched = c.isHexDigit
159+
case .digit:
160+
matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits)
161+
case .hexDigit:
162+
matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits)
147163
case .horizontalWhitespace: fatalError("Not implemented")
148-
case .newlineSequence: matched = c.isNewline
164+
case .newlineSequence:
165+
matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces)
149166
case .verticalWhitespace: fatalError("Not implemented")
150-
case .whitespace: matched = c.isWhitespace
151-
case .word: matched = c.isWordCharacter
152-
case .custom(let set): matched = set.any { $0.matches(c) }
167+
case .whitespace:
168+
matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces)
169+
case .word:
170+
matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord)
171+
case .custom(let set): matched = set.any { $0.matches(c, with: options) }
153172
}
154173
if isInverted {
155174
matched.toggle()
@@ -161,13 +180,17 @@ public struct CharacterClass: Hashable {
161180
switch cc {
162181
case .any: matched = true
163182
case .anyGrapheme: fatalError("Not matched in this mode")
164-
case .digit: matched = c.properties.numericType != nil
165-
case .hexDigit: matched = Character(c).isHexDigit
183+
case .digit:
184+
matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits)
185+
case .hexDigit:
186+
matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits)
166187
case .horizontalWhitespace: fatalError("Not implemented")
167188
case .newlineSequence: fatalError("Not implemented")
168189
case .verticalWhitespace: fatalError("Not implemented")
169-
case .whitespace: matched = c.properties.isWhitespace
170-
case .word: matched = c.properties.isAlphabetic || c == "_"
190+
case .whitespace:
191+
matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces)
192+
case .word:
193+
matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord)
171194
case .custom: fatalError("Not supported")
172195
}
173196
if isInverted {
@@ -495,21 +518,22 @@ extension CharacterClass {
495518
func isBoundary(
496519
_ input: String,
497520
at pos: String.Index,
498-
bounds: Range<String.Index>
521+
bounds: Range<String.Index>,
522+
with options: MatchingOptions
499523
) -> Bool {
500524
// FIXME: How should we handle bounds?
501525
// We probably need two concepts
502526
if input.isEmpty { return false }
503527
if pos == input.startIndex {
504-
return self.matches(in: input, at: pos) != nil
528+
return self.matches(in: input, at: pos, with: options) != nil
505529
}
506530
let priorIdx = input.index(before: pos)
507531
if pos == input.endIndex {
508-
return self.matches(in: input, at: priorIdx) != nil
532+
return self.matches(in: input, at: priorIdx, with: options) != nil
509533
}
510534

511-
let prior = self.matches(in: input, at: priorIdx) != nil
512-
let current = self.matches(in: input, at: pos) != nil
535+
let prior = self.matches(in: input, at: priorIdx, with: options) != nil
536+
let current = self.matches(in: input, at: pos, with: options) != nil
513537
return prior != current
514538
}
515539

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ extension AST.Atom {
136136
) {
137137
return { input, bounds in
138138
// FIXME: should we worry about out of bounds?
139-
cc.matches(in: input, at: bounds.lowerBound)
139+
cc.matches(in: input, at: bounds.lowerBound, with: opts)
140140
}
141141
}
142142

Sources/_StringProcessing/MatchingOptions.swift

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import _RegexParser
1313

1414
/// A type that represents the current state of regex matching options, with
1515
/// stack-based scoping.
16-
struct MatchingOptions {
16+
public struct MatchingOptions {
1717
fileprivate var stack: [Representation]
1818

1919
fileprivate func _invariantCheck() {
@@ -67,11 +67,32 @@ extension MatchingOptions {
6767
stack.last!.contains(.singleLine)
6868
}
6969

70+
var anchorsMatchNewlines: Bool {
71+
stack.last!.contains(.multiline)
72+
}
73+
74+
var usesASCIIWord: Bool {
75+
stack.last!.contains(.asciiOnlyWord)
76+
|| stack.last!.contains(.asciiOnlyPOSIXProps)
77+
}
78+
79+
var usesASCIIDigits: Bool {
80+
stack.last!.contains(.asciiOnlyDigit)
81+
|| stack.last!.contains(.asciiOnlyPOSIXProps)
82+
}
83+
84+
var usesASCIISpaces: Bool {
85+
stack.last!.contains(.asciiOnlySpace)
86+
|| stack.last!.contains(.asciiOnlyPOSIXProps)
87+
}
88+
89+
var usesSimpleUnicodeBoundaries: Bool {
90+
!stack.last!.contains(.unicodeWordBoundaries)
91+
}
92+
7093
enum SemanticLevel {
7194
case graphemeCluster
7295
case unicodeScalar
73-
// TODO: include?
74-
// case byte
7596
}
7697

7798
var semanticLevel: SemanticLevel {

0 commit comments

Comments
 (0)