Skip to content

Commit 0990483

Browse files
committed
Add in ASCII fast-path for anyNonNewline (swiftlang#654)
1 parent 65dff01 commit 0990483

File tree

3 files changed

+91
-39
lines changed

3 files changed

+91
-39
lines changed

Sources/_StringProcessing/Engine/MEBuiltins.swift

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,69 @@ extension Processor {
115115
}
116116
}
117117

118-
// MARK: Built-in character class matching
118+
// MARK: Matching `.`
119+
extension String {
120+
121+
/// Matches `.` (any single non-newline element) at `currentPosition`.
///
/// Tries the ASCII fast path first and only falls back to the thorough,
/// Unicode-aware path when the fast path cannot give a definite answer.
///
/// - Parameters:
///   - currentPosition: The index at which to attempt the match.
///   - isScalarSemantics: Forwarded unchanged to both the quick and the
///     thorough matcher.
/// - Returns: The index following the matched element, or `nil` if
///   `currentPosition` is not before `endIndex` or the match fails.
func _matchAnyNonNewline(
  at currentPosition: String.Index,
  isScalarSemantics: Bool
) -> String.Index? {
  // Nothing left to match.
  guard currentPosition < endIndex else { return nil }

  let quickOutcome = _quickMatchAnyNonNewline(
    at: currentPosition,
    isScalarSemantics: isScalarSemantics)

  guard case .definite(let fastAnswer) = quickOutcome else {
    // The fast path could not decide; take the slow path.
    return _thoroughMatchAnyNonNewline(
      at: currentPosition,
      isScalarSemantics: isScalarSemantics)
  }

  // Debug-only cross-check: the fast path must agree with the slow path.
  assert(fastAnswer == _thoroughMatchAnyNonNewline(
    at: currentPosition,
    isScalarSemantics: isScalarSemantics))
  return fastAnswer
}
141+
142+
/// ASCII fast path for matching `.` (any single non-newline element).
///
/// Asks `_quickASCIICharacter(at:)` for the ASCII value at
/// `currentPosition`. If that helper cannot answer, the result is
/// `.unknown` and the caller must fall back to the thorough path.
///
/// All four ASCII newline bytes 0x0A...0x0D (line feed, line tab, form
/// feed, carriage return) are rejected. `Character.isNewline`, which the
/// thorough path relies on, reports U+000B and U+000C as newlines as well
/// as LF and CR, so rejecting only LF and CR here would make the two paths
/// disagree (and would trip the debug cross-check in the caller).
///
/// - Parameters:
///   - currentPosition: The index to match at; must be before `endIndex`.
///   - isScalarSemantics: Not consulted by this fast path; kept so the
///     signature parallels the thorough matcher.
/// - Returns: `.definite(nil)` for an ASCII newline, `.definite(next)` for
///   any other ASCII value, or `.unknown` when no quick answer exists.
@inline(__always)
func _quickMatchAnyNonNewline(
  at currentPosition: String.Index,
  isScalarSemantics: Bool
) -> QuickResult<String.Index?> {
  assert(currentPosition < endIndex)
  guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
    at: currentPosition
  ) else {
    return .unknown
  }
  switch asciiValue {
  case ._lineFeed, ._lineTab, ._formFeed, ._carriageReturn:
    // 0x0A...0x0D are all newlines; `.` must not match any of them.
    return .definite(nil)
  default:
    // A CR-LF pair always has CR as its ASCII value, so it cannot reach here.
    assert(!isCRLF)
    return .definite(next)
  }
}
161+
162+
/// Slow path for matching `.` (any single non-newline element).
///
/// With scalar semantics the element examined is the Unicode scalar at
/// `currentPosition`; otherwise it is the `Character` at that position.
///
/// - Parameters:
///   - currentPosition: The index to match at; must be before `endIndex`.
///   - isScalarSemantics: Selects scalar-wise (`true`) or character-wise
///     (`false`) matching and advancement.
/// - Returns: The index after the examined element, or `nil` if that
///   element reports `isNewline`.
@inline(never)
func _thoroughMatchAnyNonNewline(
  at currentPosition: String.Index,
  isScalarSemantics: Bool
) -> String.Index? {
  assert(currentPosition < endIndex)

  guard isScalarSemantics else {
    // Character (grapheme cluster) semantics.
    if self[currentPosition].isNewline { return nil }
    return index(after: currentPosition)
  }

  // Unicode scalar semantics.
  let scalars = unicodeScalars
  if scalars[currentPosition].isNewline { return nil }
  return scalars.index(after: currentPosition)
}
178+
}
119179

180+
// MARK: - Built-in character class matching
120181
extension String {
121182

122183
// Mentioned in ProgrammersManual.md, update docs if redesigned

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -346,23 +346,18 @@ extension Processor {
346346
return true
347347
}
348348

349-
// Matches the next character if it is not a newline
350-
mutating func matchAnyNonNewline() -> Bool {
351-
guard let c = load(), !c.isNewline else {
352-
signalFailure()
353-
return false
354-
}
355-
_uncheckedForcedConsumeOne()
356-
return true
357-
}
358-
359-
// Matches the next scalar if it is not a newline
360-
mutating func matchAnyNonNewlineScalar() -> Bool {
361-
guard let s = loadScalar(), !s.isNewline else {
349+
// Matches the next character/scalar if it is not a newline
350+
mutating func matchAnyNonNewline(
351+
isScalarSemantics: Bool
352+
) -> Bool {
353+
guard let next = input._matchAnyNonNewline(
354+
at: currentPosition,
355+
isScalarSemantics: isScalarSemantics
356+
) else {
362357
signalFailure()
363358
return false
364359
}
365-
input.unicodeScalars.formIndex(after: &currentPosition)
360+
currentPosition = next
366361
return true
367362
}
368363

@@ -535,14 +530,8 @@ extension Processor {
535530
}
536531
}
537532
case .matchAnyNonNewline:
538-
if payload.isScalar {
539-
if matchAnyNonNewlineScalar() {
540-
controller.step()
541-
}
542-
} else {
543-
if matchAnyNonNewline() {
544-
controller.step()
545-
}
533+
if matchAnyNonNewline(isScalarSemantics: payload.isScalar) {
534+
controller.step()
546535
}
547536
case .match:
548537
let (isCaseInsensitive, reg) = payload.elementPayload

Sources/_StringProcessing/Unicode/ASCII.swift

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,26 +9,25 @@
99
//
1010
//===----------------------------------------------------------------------===//
1111

12-
private var _lineFeed: UInt8 { 0x0A }
13-
private var _carriageReturn: UInt8 { 0x0D }
14-
private var _lineTab: UInt8 { 0x0B }
15-
private var _formFeed: UInt8 { 0x0C }
16-
private var _space: UInt8 { 0x20 }
17-
private var _tab: UInt8 { 0x09 }
12+
/// Named ASCII code units used by the ASCII fast paths.
extension UInt8 {
  /// Line feed, `\n` (0x0A).
  static var _lineFeed: UInt8 { UInt8(ascii: "\n") }
  /// Carriage return, `\r` (0x0D).
  static var _carriageReturn: UInt8 { UInt8(ascii: "\r") }
  /// Line (vertical) tabulation (0x0B).
  static var _lineTab: UInt8 { UInt8(ascii: "\u{0B}") }
  /// Form feed (0x0C).
  static var _formFeed: UInt8 { UInt8(ascii: "\u{0C}") }
  /// Space (0x20).
  static var _space: UInt8 { UInt8(ascii: " ") }
  /// Horizontal tab, `\t` (0x09).
  static var _tab: UInt8 { UInt8(ascii: "\t") }

  /// Underscore, `_` (0x5F).
  static var _underscore: UInt8 { UInt8(ascii: "_") }
}
1822

1923
private var _0: UInt8 { 0x30 }
2024
private var _9: UInt8 { 0x39 }
21-
private func _isASCIINumber(_ x: UInt8) -> Bool {
22-
return (_0..._9).contains(x)
23-
}
2425

2526
private var _a: UInt8 { 0x61 }
2627
private var _z: UInt8 { 0x7A }
2728
private var _A: UInt8 { 0x41 }
2829
private var _Z: UInt8 { 0x5A }
2930

30-
private var _underscore: UInt8 { 0x5F }
31-
3231
extension UInt8 {
3332
var _isASCII: Bool { self < 0x80 }
3433

@@ -43,14 +42,14 @@ extension UInt8 {
4342
/// Assuming we're ASCII, whether we match `\h`
4443
var _asciiIsHorizontalWhitespace: Bool {
4544
assert(_isASCII)
46-
return self == _space || self == _tab
45+
return self == ._space || self == ._tab
4746
}
4847

4948
/// Assuming we're ASCII, whether we match `\v`
5049
var _asciiIsVerticalWhitespace: Bool {
5150
assert(_isASCII)
5251
switch self {
53-
case _lineFeed, _carriageReturn, _lineTab, _formFeed:
52+
case ._lineFeed, ._carriageReturn, ._lineTab, ._formFeed:
5453
return true
5554
default:
5655
return false
@@ -61,7 +60,7 @@ extension UInt8 {
6160
var _asciiIsWhitespace: Bool {
6261
assert(_isASCII)
6362
switch self {
64-
case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn:
63+
case ._space, ._tab, ._lineFeed, ._lineTab, ._formFeed, ._carriageReturn:
6564
return true
6665
default:
6766
return false
@@ -77,11 +76,13 @@ extension UInt8 {
7776
/// Assuming we're ASCII, whether we match `\w`
7877
var _asciiIsWord: Bool {
7978
assert(_isASCII)
80-
return _asciiIsDigit || _asciiIsLetter || self == _underscore
79+
return _asciiIsDigit || _asciiIsLetter || self == ._underscore
8180
}
8281
}
8382

8483
extension String {
84+
/// TODO: It would be better to take an `isScalarSemantics` parameter, so that we can return more results
85+
/// and give the correct `next` index, so that the caller does not have to re-adjust it.
8586
/// TODO: detailed description of nuanced semantics
8687
func _quickASCIICharacter(
8788
at idx: Index
@@ -107,7 +108,7 @@ extension String {
107108
guard tail._isSub300StartingByte else { return nil }
108109

109110
// Handle CR-LF:
110-
if base == _carriageReturn && tail == _lineFeed {
111+
if base == ._carriageReturn && tail == ._lineFeed {
111112
utf8.formIndex(after: &next)
112113
guard next == endIndex || utf8[next]._isSub300StartingByte else {
113114
return nil
@@ -165,5 +166,6 @@ extension String {
165166
return (next, asciiValue._asciiIsWord)
166167
}
167168
}
169+
168170
}
169171

0 commit comments

Comments
 (0)