Skip to content

Commit b2a3ff0

Browse files
committed
[String] Refactor validation and grapheme breaking
Internal refactoring. Validation now produces a (discardable) reason why validation failed (only checked on the slow path). _hasGraphemeBreakBetween has been renamed to _quickHasGraphemeBreakBetween and now returns Bool? to reflect its quick-check nature.
1 parent 4da795f commit b2a3ff0

File tree

3 files changed

+138
-50
lines changed

3 files changed

+138
-50
lines changed

stdlib/public/core/StringCreate.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ extension String {
117117
return unsafe (String._uncheckedFromUTF8(
118118
input, asciiPreScanResult: extraInfo.isASCII
119119
), false)
120-
case .error(let initialRange):
120+
case .error(_, let initialRange):
121121
return unsafe (repairUTF8(input, firstKnownBrokenRange: initialRange), true)
122122
}
123123
}
@@ -139,7 +139,7 @@ extension String {
139139
newIsASCII: info.isASCII
140140
)
141141
return result.asString
142-
case .error(let initialRange):
142+
case .error(_, let initialRange):
143143
defer { _fixLifetime(result) }
144144
//This could be optimized to use excess tail capacity
145145
return unsafe repairUTF8(result.codeUnits, firstKnownBrokenRange: initialRange)

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,18 @@
1313
import SwiftShims
1414

1515
/// CR and LF are common special cases in grapheme breaking logic
16-
private var _CR: UInt8 { return 0x0d }
17-
private var _LF: UInt8 { return 0x0a }
18-
19-
internal func _hasGraphemeBreakBetween(
16+
private var _CR: UInt8 { return 0x0D }
17+
private var _LF: UInt8 { return 0x0A }
18+
19+
/// Perform a quick-check to determine if there's a grapheme-break between two
20+
/// scalars, without consulting the data tables. Returns true if there
21+
/// definitely is a break, false if there definitely is none, and nil if a
22+
/// break couldn't be determined by this quick-check alone.
23+
internal func _quickHasGraphemeBreakBetween(
2024
_ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar
21-
) -> Bool {
22-
23-
// CR-LF is a special case: no break between these
25+
) -> Bool? {
26+
// GB3:
27+
// CR-LF is a special case: no break between these
2428
if lhs == Unicode.Scalar(_CR) && rhs == Unicode.Scalar(_LF) {
2529
return false
2630
}
@@ -80,7 +84,10 @@ internal func _hasGraphemeBreakBetween(
8084
default: return false
8185
}
8286
}
83-
return hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs)
87+
if hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs) {
88+
return true
89+
}
90+
return nil
8491
}
8592

8693
extension _StringGuts {
@@ -523,13 +530,7 @@ extension Unicode {
523530
between scalar1: Unicode.Scalar,
524531
and scalar2: Unicode.Scalar
525532
) -> Bool? {
526-
if scalar1.value == 0xD, scalar2.value == 0xA {
527-
return false
528-
}
529-
if _hasGraphemeBreakBetween(scalar1, scalar2) {
530-
return true
531-
}
532-
return nil
533+
_quickHasGraphemeBreakBetween(scalar1, scalar2)
533534
}
534535

535536
/// Initialize a new character recognizer at the _start of text_ (sot)
@@ -708,13 +709,8 @@ extension _GraphemeBreakingState {
708709
between scalar1: Unicode.Scalar,
709710
and scalar2: Unicode.Scalar
710711
) -> Bool {
711-
// GB3
712-
if scalar1.value == 0xD, scalar2.value == 0xA {
713-
return false
714-
}
715-
716-
if _hasGraphemeBreakBetween(scalar1, scalar2) {
717-
return true
712+
if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) {
713+
return result
718714
}
719715

720716
let x = Unicode._GraphemeBreakProperty(from: scalar1)
@@ -883,13 +879,8 @@ extension _StringGuts {
883879
at index: Int,
884880
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
885881
) -> Bool {
886-
// GB3
887-
if scalar1.value == 0xD, scalar2.value == 0xA {
888-
return false
889-
}
890-
891-
if _hasGraphemeBreakBetween(scalar1, scalar2) {
892-
return true
882+
if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) {
883+
return result
893884
}
894885

895886
let x = Unicode._GraphemeBreakProperty(from: scalar1)

stdlib/public/core/StringUTF8Validation.swift

Lines changed: 116 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,30 +18,97 @@ private func _isNotOverlong_F0(_ x: UInt8) -> Bool {
1818
return (0x90...0xBF).contains(x)
1919
}
2020

21-
private func _isNotOverlong_F4(_ x: UInt8) -> Bool {
21+
private func _isNotInvalid_F4(_ x: UInt8) -> Bool {
2222
return UTF8.isContinuation(x) && x <= 0x8F
2323
}
2424

2525
private func _isNotOverlong_E0(_ x: UInt8) -> Bool {
2626
return (0xA0...0xBF).contains(x)
2727
}
2828

29-
private func _isNotOverlong_ED(_ x: UInt8) -> Bool {
29+
private func _isNotInvalid_ED(_ x: UInt8) -> Bool {
3030
return UTF8.isContinuation(x) && x <= 0x9F
3131
}
3232

3333
internal struct UTF8ExtraInfo: Equatable {
3434
public var isASCII: Bool
3535
}
3636

37+
@inline(never) // slow-path
38+
private func _diagnoseInvalidUTF8MultiByteLeading(
39+
_ x: UInt8
40+
) -> _UTF8EncodingErrorKind {
41+
_internalInvariant(x >= 0x80)
42+
_internalInvariant(!_isUTF8MultiByteLeading(x))
43+
switch x {
44+
case 0x80...0xBF:
45+
return .unexpectedContinuationByte
46+
case 0xC0..<0xC2:
47+
return .overlongEncodingByte
48+
default:
49+
_internalInvariant(x > 0xF4)
50+
return .invalidNonSurrogateCodePointByte
51+
}
52+
}
53+
3754
internal enum UTF8ValidationResult {
3855
case success(UTF8ExtraInfo)
39-
case error(toBeReplaced: Range<Int>)
56+
case error(
57+
kind: _UTF8EncodingErrorKind, toBeReplaced: Range<Int>
58+
)
4059
}
4160

42-
extension UTF8ValidationResult: Equatable {}
61+
// FIXME: refactor other parts of stdlib to avoid this redundant mirror type
62+
//
63+
// Mirror of UTF8.EncodingError.Kind (the public type is only available on SwiftStdlib 6.1+)
64+
internal struct _UTF8EncodingErrorKind: Error, Sendable, Hashable
65+
// TODO: embedded?, Codable
66+
, RawRepresentable {
67+
internal var rawValue: UInt8
68+
69+
@available(SwiftStdlib 6.1, *)
70+
internal var _publicKind: UTF8.EncodingError.Kind {
71+
.init(rawValue: self.rawValue)
72+
}
73+
74+
@inlinable
75+
internal init(rawValue: UInt8) {
76+
self.rawValue = rawValue
77+
}
78+
79+
/// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
80+
@_alwaysEmitIntoClient
81+
internal static var unexpectedContinuationByte: Self {
82+
.init(rawValue: 0)
83+
}
84+
85+
/// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
86+
@_alwaysEmitIntoClient
87+
internal static var surrogateCodePointByte: Self {
88+
.init(rawValue: 1)
89+
}
4390

44-
private struct UTF8ValidationError: Error {}
91+
/// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
92+
@_alwaysEmitIntoClient
93+
internal static var invalidNonSurrogateCodePointByte: Self {
94+
.init(rawValue: 2)
95+
}
96+
97+
/// A byte in an overlong encoding sequence
98+
@_alwaysEmitIntoClient
99+
internal static var overlongEncodingByte: Self {
100+
.init(rawValue: 3)
101+
}
102+
103+
/// A multi-byte sequence that is the start of a valid multi-byte scalar
104+
/// but is cut off before ending correctly
105+
@_alwaysEmitIntoClient
106+
internal static var truncatedScalar: Self {
107+
.init(rawValue: 4)
108+
}
109+
}
110+
111+
extension UTF8ValidationResult: Equatable {}
45112

46113
internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationResult {
47114
if unsafe _allASCII(buf) {
@@ -51,12 +118,20 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
51118
var iter = unsafe buf.makeIterator()
52119
var lastValidIndex = buf.startIndex
53120

54-
@inline(__always) func guaranteeIn(_ f: (UInt8) -> Bool) throws(UTF8ValidationError) {
55-
guard let cu = unsafe iter.next() else { throw UTF8ValidationError() }
56-
guard f(cu) else { throw UTF8ValidationError() }
121+
@inline(__always) func guarantee(
122+
_ f: (UInt8) -> Bool,
123+
_ err: _UTF8EncodingErrorKind
124+
) throws(_UTF8EncodingErrorKind) {
125+
guard let cu = unsafe iter.next() else {
126+
throw .truncatedScalar
127+
}
128+
guard f(cu) else {
129+
throw err
130+
}
57131
}
58-
@inline(__always) func guaranteeContinuation() throws(UTF8ValidationError) {
59-
try guaranteeIn(UTF8.isContinuation)
132+
@inline(__always) func guaranteeContinuation(
133+
) throws(_UTF8EncodingErrorKind) {
134+
try guarantee(UTF8.isContinuation, .truncatedScalar)
60135
}
61136

62137
func _legacyInvalidLengthCalculation(_ _buffer: (_storage: UInt32, ())) -> Int {
@@ -117,37 +192,56 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
117192
return unsafe _legacyNarrowIllegalRange(buf: buf[illegalRange])
118193
}
119194

120-
do {
195+
do throws(_UTF8EncodingErrorKind) {
196+
197+
/*
198+
The table of valid UTF-8 is:
199+
200+
╔════════════════════╦════════╦════════╦════════╦════════╗
201+
║ Scalar value ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
202+
╠════════════════════╬════════╬════════╬════════╬════════╣
203+
║ U+0000..U+007F ║ 00..7F ║ ║ ║ ║
204+
║ U+0080..U+07FF ║ C2..DF ║ Contin ║ ║ ║
205+
║ U+0800..U+0FFF ║ E0 ║ A0..BF ║ Contin ║ ║
206+
║ U+1000..U+CFFF ║ E1..EC ║ Contin ║ Contin ║ ║
207+
║ U+D000..U+D7FF ║ ED ║ 80..9F ║ Contin ║ ║
208+
║ U+E000..U+FFFF ║ EE..EF ║ Contin ║ Contin ║ ║
209+
║ U+10000..U+3FFFF ║ F0 ║ 90..BF ║ Contin ║ Contin ║
210+
║ U+40000..U+FFFFF ║ F1..F3 ║ Contin ║ Contin ║ Contin ║
211+
║ U+100000..U+10FFFF ║ F4 ║ 80..8F ║ Contin ║ Contin ║
212+
╚════════════════════╩════════╩════════╩════════╩════════╝
213+
214+
"Contin" is any continuation byte, i.e. 80..BF or 10xxxxxx
215+
*/
121216
var isASCII = true
122217
while let cu = unsafe iter.next() {
123218
if UTF8.isASCII(cu) { lastValidIndex &+= 1; continue }
124219
isASCII = false
125220
if _slowPath(!_isUTF8MultiByteLeading(cu)) {
126-
func fail() throws(UTF8ValidationError) { throw UTF8ValidationError() }
127-
try fail()
221+
throw _diagnoseInvalidUTF8MultiByteLeading(cu)
128222
}
129223
switch cu {
130224
case 0xC2...0xDF:
131225
try guaranteeContinuation()
132226
lastValidIndex &+= 2
133227
case 0xE0:
134-
try guaranteeIn(_isNotOverlong_E0)
228+
try guarantee(_isNotOverlong_E0, .overlongEncodingByte)
135229
try guaranteeContinuation()
136230
lastValidIndex &+= 3
137231
case 0xE1...0xEC:
138232
try guaranteeContinuation()
139233
try guaranteeContinuation()
140234
lastValidIndex &+= 3
141235
case 0xED:
142-
try guaranteeIn(_isNotOverlong_ED)
236+
try guarantee(_isNotInvalid_ED, .surrogateCodePointByte)
143237
try guaranteeContinuation()
144238
lastValidIndex &+= 3
145239
case 0xEE...0xEF:
146240
try guaranteeContinuation()
147241
try guaranteeContinuation()
148242
lastValidIndex &+= 3
149243
case 0xF0:
150-
try guaranteeIn(_isNotOverlong_F0)
244+
try guarantee(_isNotOverlong_F0, .overlongEncodingByte)
151245
try guaranteeContinuation()
152246
try guaranteeContinuation()
153247
lastValidIndex &+= 4
@@ -157,7 +251,8 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
157251
try guaranteeContinuation()
158252
lastValidIndex &+= 4
159253
case 0xF4:
160-
try guaranteeIn(_isNotOverlong_F4)
254+
try guarantee(
255+
_isNotInvalid_F4, .invalidNonSurrogateCodePointByte)
161256
try guaranteeContinuation()
162257
try guaranteeContinuation()
163258
lastValidIndex &+= 4
@@ -167,7 +262,9 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
167262
}
168263
return .success(UTF8ExtraInfo(isASCII: isASCII))
169264
} catch {
170-
return unsafe .error(toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
265+
return unsafe .error(
266+
kind: error,
267+
toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
171268
}
172269
}
173270

@@ -214,7 +311,7 @@ internal func repairUTF8(_ input: UnsafeBufferPointer<UInt8>, firstKnownBrokenRa
214311
case .success:
215312
unsafe result.appendInPlace(remainingInput, isASCII: false)
216313
return String(result)
217-
case .error(let newBrokenRange):
314+
case .error(_, let newBrokenRange):
218315
brokenRange = newBrokenRange
219316
}
220317
} while !remainingInput.isEmpty

0 commit comments

Comments
 (0)