Skip to content

Commit c16e389

Browse files
authored
Implement \R, \v, \h for character/scalar modes (#384)
Implement \R, \v, \h for character/scalar modes and audit assertions and anchors for semantic level.
1 parent 7f068dc commit c16e389

File tree

5 files changed

+88
-23
lines changed

5 files changed

+88
-23
lines changed

Sources/_RegexParser/Regex/Parse/Sema.swift

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,15 +182,16 @@ extension RegexValidator {
182182
_ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation
183183
) throws {
184184
switch esc {
185-
case .resetStartOfMatch, .singleDataUnit, .verticalTab, .notVerticalTab,
185+
case .resetStartOfMatch, .singleDataUnit,
186186
// '\N' needs to be emitted using 'emitAny'.
187187
.notNewline:
188188
throw error(.unsupported("'\\\(esc.character)'"), at: loc)
189189

190190
// Character classes.
191191
case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace,
192192
.wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar,
193-
.horizontalWhitespace, .notHorizontalWhitespace:
193+
.horizontalWhitespace, .notHorizontalWhitespace,
194+
.verticalTab, .notVerticalTab:
194195
break
195196

196197
case .newlineSequence:

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,16 @@ extension Compiler.ByteCodeGen {
8080
}
8181

8282
case .endOfSubjectBeforeNewline:
83-
builder.buildAssert { (input, pos, bounds) in
83+
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
8484
if pos == input.endIndex { return true }
85-
return input.index(after: pos) == input.endIndex
86-
&& input[pos].isNewline
85+
switch semanticLevel {
86+
case .graphemeCluster:
87+
return input.index(after: pos) == input.endIndex
88+
&& input[pos].isNewline
89+
case .unicodeScalar:
90+
return input.unicodeScalars.index(after: pos) == input.endIndex
91+
&& input.unicodeScalars[pos].isNewline
92+
}
8793
}
8894

8995
case .endOfSubject:
@@ -115,8 +121,14 @@ extension Compiler.ByteCodeGen {
115121

116122
case .startOfLine:
117123
if options.anchorsMatchNewlines {
118-
builder.buildAssert { (input, pos, bounds) in
119-
pos == input.startIndex || input[input.index(before: pos)].isNewline
124+
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
125+
if pos == input.startIndex { return true }
126+
switch semanticLevel {
127+
case .graphemeCluster:
128+
return input[input.index(before: pos)].isNewline
129+
case .unicodeScalar:
130+
return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline
131+
}
120132
}
121133
} else {
122134
builder.buildAssert { (input, pos, bounds) in
@@ -126,8 +138,14 @@ extension Compiler.ByteCodeGen {
126138

127139
case .endOfLine:
128140
if options.anchorsMatchNewlines {
129-
builder.buildAssert { (input, pos, bounds) in
130-
pos == input.endIndex || input[pos].isNewline
141+
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
142+
if pos == input.endIndex { return true }
143+
switch semanticLevel {
144+
case .graphemeCluster:
145+
return input[pos].isNewline
146+
case .unicodeScalar:
147+
return input.unicodeScalars[pos].isNewline
148+
}
131149
}
132150
} else {
133151
builder.buildAssert { (input, pos, bounds) in

Sources/_StringProcessing/Unicode/ScalarProps.swift

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,19 @@ extension Unicode.Script {
4646
return result
4747
}
4848
}
49+
50+
extension UnicodeScalar {
51+
var isHorizontalWhitespace: Bool {
52+
value == 0x09 || properties.generalCategory == .spaceSeparator
53+
}
54+
55+
var isNewline: Bool {
56+
switch value {
57+
case 0x000A...0x000D /* LF ... CR */: return true
58+
case 0x0085 /* NEXT LINE (NEL) */: return true
59+
case 0x2028 /* LINE SEPARATOR */: return true
60+
case 0x2029 /* PARAGRAPH SEPARATOR */: return true
61+
default: return false
62+
}
63+
}
64+
}

Sources/_StringProcessing/_CharacterClassModel.swift

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -178,15 +178,18 @@ public struct _CharacterClassModel: Hashable {
178178
matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits)
179179
case .hexDigit:
180180
matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits)
181-
case .horizontalWhitespace: fatalError("Not implemented")
182-
case .newlineSequence:
183-
matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces)
184-
case .verticalWhitespace: fatalError("Not implemented")
181+
case .horizontalWhitespace:
182+
matched = c.unicodeScalars.first?.isHorizontalWhitespace == true
183+
&& (c.isASCII || !options.usesASCIISpaces)
184+
case .newlineSequence, .verticalWhitespace:
185+
matched = c.unicodeScalars.first?.isNewline == true
186+
&& (c.isASCII || !options.usesASCIISpaces)
185187
case .whitespace:
186188
matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces)
187189
case .word:
188190
matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord)
189-
case .custom(let set): matched = set.any { $0.matches(c, with: options) }
191+
case .custom(let set):
192+
matched = set.any { $0.matches(c, with: options) }
190193
}
191194
if isInverted {
192195
matched.toggle()
@@ -206,14 +209,21 @@ public struct _CharacterClassModel: Hashable {
206209
matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits)
207210
case .hexDigit:
208211
matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits)
209-
case .horizontalWhitespace: fatalError("Not implemented")
210-
case .newlineSequence: fatalError("Not implemented")
211-
case .verticalWhitespace: fatalError("Not implemented")
212+
case .horizontalWhitespace:
213+
matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces)
214+
case .verticalWhitespace:
215+
matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces)
216+
case .newlineSequence:
217+
matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces)
218+
if c == "\r" && nextIndex != str.endIndex && str.unicodeScalars[nextIndex] == "\n" {
219+
str.unicodeScalars.formIndex(after: &nextIndex)
220+
}
212221
case .whitespace:
213222
matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces)
214223
case .word:
215224
matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord)
216-
case .custom: fatalError("Not supported")
225+
case .custom(let set):
226+
matched = set.any { $0.matches(Character(c), with: options) }
217227
}
218228
if isInverted {
219229
matched.toggle()

Tests/RegexTests/UTS18Tests.swift

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,14 @@ import XCTest
2222
@testable // for internal `matches(of:)`
2323
import _StringProcessing
2424

25+
extension UnicodeScalar {
26+
var value4Digits: String {
27+
let valueString = String(value, radix: 16, uppercase: true)
28+
if valueString.count >= 4 { return valueString }
29+
return String(repeating: "0", count: 4 - valueString.count) + valueString
30+
}
31+
}
32+
2533
class UTS18Tests: XCTestCase {
2634
var input: String {
2735
"ABCdefghîøu\u{308}\u{FFF0} -–—[]123"
@@ -262,21 +270,33 @@ extension UTS18Tests {
262270
09\u{85}\
263271
10\u{2028}\
264272
11\u{2029}\
265-
273+
12
266274
"""
267275
// Check the input counts
268276
var lines = lineInput.matches(of: regex(#"\d{2}"#))
269-
XCTAssertEqual(lines.count, 11)
277+
XCTAssertEqual(lines.count, 12)
270278
// Test \R - newline sequence
271-
lines = lineInput.matches(of: regex(#"\d{2}\R"#))
279+
lines = lineInput.matches(of: regex(#"\d{2}\R^"#).anchorsMatchLineEndings())
280+
XCTAssertEqual(lines.count, 11)
281+
// Test \v - vertical space
282+
lines = lineInput.matches(of: regex(#"\d{2}\v^"#).anchorsMatchLineEndings())
272283
XCTAssertEqual(lines.count, 11)
273284
// Test anchors as line boundaries
274285
lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings())
275-
XCTAssertEqual(lines.count, 11)
286+
XCTAssertEqual(lines.count, 12)
276287
// Test that dot does not match line endings
277288
lines = lineInput.matches(of: regex(#".+"#))
278-
XCTAssertEqual(lines.count, 11)
289+
XCTAssertEqual(lines.count, 12)
279290

291+
// Unicode scalar semantics - \R still matches all, including \r\n sequence
292+
lines = lineInput.matches(
293+
of: regex(#"\d{2}\R(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())
294+
XCTAssertEqual(lines.count, 11)
295+
// Unicode scalar semantics - \v matches all except for \r\n sequence
296+
lines = lineInput.matches(
297+
of: regex(#"\d{2}\v(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())
298+
XCTAssertEqual(lines.count, 10)
299+
280300
// Does not contain an empty line
281301
XCTAssertFalse(lineInput.contains(regex(#"^$"#)))
282302
// Does contain an empty line (between \n and \r, which are reversed here)

0 commit comments

Comments
 (0)