@@ -18,30 +18,97 @@ private func _isNotOverlong_F0(_ x: UInt8) -> Bool {
18
18
return ( 0x90 ... 0xBF ) . contains ( x)
19
19
}
20
20
21
/// Returns `true` if `x` is an acceptable second byte after a leading `0xF4`.
///
/// Only `0x80...0x8F` is accepted: a continuation byte no greater than `0x8F`.
private func _isNotInvalid_F4(_ x: UInt8) -> Bool {
  // Continuation bytes are 0x80...0xBF; capping at 0x8F leaves 0x80...0x8F.
  let allowed: ClosedRange<UInt8> = 0x80...0x8F
  return allowed.contains(x)
}
24
24
25
25
/// Returns `true` if `x` is an acceptable second byte after a leading `0xE0`.
///
/// Only `0xA0...0xBF` is accepted.
private func _isNotOverlong_E0(_ x: UInt8) -> Bool {
  let allowed: ClosedRange<UInt8> = 0xA0...0xBF
  return allowed ~= x
}
28
28
29
/// Returns `true` if `x` is an acceptable second byte after a leading `0xED`.
///
/// Only `0x80...0x9F` is accepted: a continuation byte no greater than `0x9F`.
private func _isNotInvalid_ED(_ x: UInt8) -> Bool {
  // Continuation bytes are 0x80...0xBF; capping at 0x9F leaves 0x80...0x9F.
  let allowed: ClosedRange<UInt8> = 0x80...0x9F
  return allowed.contains(x)
}
32
32
33
33
/// Extra information reported alongside a successful UTF-8 validation.
internal struct UTF8ExtraInfo: Equatable {
  /// Whether the validated contents were entirely ASCII.
  public var isASCII: Bool
}
36
36
37
/// Classifies a non-ASCII byte that cannot start a multi-byte sequence.
///
/// - Parameter x: A byte that is `>= 0x80` and for which
///   `_isUTF8MultiByteLeading` is `false`.
/// - Returns: `.unexpectedContinuationByte` for `0x80...0xBF`,
///   `.overlongEncodingByte` for `0xC0` and `0xC1`, and
///   `.invalidNonSurrogateCodePointByte` for everything else.
@inline(never) // slow-path
private func _diagnoseInvalidUTF8MultiByteLeading(
  _ x: UInt8
) -> _UTF8EncodingErrorKind {
  _internalInvariant(x >= 0x80)
  _internalInvariant(!_isUTF8MultiByteLeading(x))

  // 0x80...0xBF: a continuation byte where a leading byte was expected.
  if UTF8.isContinuation(x) {
    return .unexpectedContinuationByte
  }

  // 0xC0 and 0xC1 are reported as overlong-encoding bytes.
  if x == 0xC0 || x == 0xC1 {
    return .overlongEncodingByte
  }

  // Anything left is expected to be above 0xF4.
  _internalInvariant(x > 0xF4)
  return .invalidNonSurrogateCodePointByte
}
53
+
37
54
/// The outcome of validating a buffer as UTF-8.
internal enum UTF8ValidationResult {
  /// The input was valid; carries extra facts learned during validation.
  case success(UTF8ExtraInfo)

  /// The input was invalid; carries the kind of encoding error and the
  /// range of input to be replaced.
  case error(kind: _UTF8EncodingErrorKind, toBeReplaced: Range<Int>)
}
41
60
42
- extension UTF8ValidationResult : Equatable { }
61
// FIXME: refactor other parts of stdlib so this mirror type is not needed
//
// Internal mirror of `UTF8.EncodingError.Kind`; the public type (reachable
// through `_publicKind`) is only available on SwiftStdlib 6.1.
// TODO: embedded?, Codable
internal struct _UTF8EncodingErrorKind:
  Error, Sendable, Hashable, RawRepresentable {
  /// The numeric code identifying this kind of encoding error.
  internal var rawValue: UInt8

  @inlinable
  internal init(rawValue: UInt8) {
    self.rawValue = rawValue
  }

  /// The public error kind built from the same raw value.
  @available(SwiftStdlib 6.1, *)
  internal var _publicKind: UTF8.EncodingError.Kind {
    .init(rawValue: rawValue)
  }

  /// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
  @_alwaysEmitIntoClient
  internal static var unexpectedContinuationByte: Self {
    Self(rawValue: 0)
  }

  /// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
  @_alwaysEmitIntoClient
  internal static var surrogateCodePointByte: Self {
    Self(rawValue: 1)
  }

  /// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
  @_alwaysEmitIntoClient
  internal static var invalidNonSurrogateCodePointByte: Self {
    Self(rawValue: 2)
  }

  /// A byte in an overlong encoding sequence
  @_alwaysEmitIntoClient
  internal static var overlongEncodingByte: Self {
    Self(rawValue: 3)
  }

  /// A multi-byte sequence that is the start of a valid multi-byte scalar
  /// but is cut off before ending correctly
  @_alwaysEmitIntoClient
  internal static var truncatedScalar: Self {
    Self(rawValue: 4)
  }
}
110
+
111
// Equatable conformance is declared here with an empty body.
extension UTF8ValidationResult: Equatable {}
45
112
46
113
internal func validateUTF8( _ buf: UnsafeBufferPointer < UInt8 > ) -> UTF8ValidationResult {
47
114
if unsafe _allASCII( buf) {
@@ -51,12 +118,20 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
51
118
var iter = unsafe buf. makeIterator ( )
52
119
var lastValidIndex = buf. startIndex
53
120
54
// Pulls the next code unit from `iter` and checks it with `isAcceptable`.
// Throws `.truncatedScalar` when `iter` has no more code units, and
// `rejection` when the code unit fails the check.
@inline(__always) func guarantee(
  _ isAcceptable: (UInt8) -> Bool,
  _ rejection: _UTF8EncodingErrorKind
) throws(_UTF8EncodingErrorKind) {
  guard let next = unsafe iter.next() else { throw .truncatedScalar }
  if !isAcceptable(next) { throw rejection }
}
58
// Requires the next code unit to be a continuation byte. Both a missing
// code unit and a non-continuation code unit throw `.truncatedScalar`.
@inline(__always) func guaranteeContinuation() throws(_UTF8EncodingErrorKind) {
  try guarantee(UTF8.isContinuation, .truncatedScalar)
}
61
136
62
137
func _legacyInvalidLengthCalculation( _ _buffer: ( _storage: UInt32 , ( ) ) ) -> Int {
@@ -117,37 +192,56 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
117
192
return unsafe _legacyNarrowIllegalRange( buf: buf [ illegalRange] )
118
193
}
119
194
120
- do {
195
+ do throws( _UTF8EncodingErrorKind) {
196
+
197
+ /*
198
+ The table of valid UTF-8 is:
199
+
200
+ ╔════════════════════╦════════╦════════╦════════╦════════╗
201
+ ║ Scalar value ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
202
+ ╠════════════════════╬════════╬════════╬════════╬════════╣
203
+ ║ U+0000..U+007F ║ 00..7F ║ ║ ║ ║
204
+ ║ U+0080..U+07FF ║ C2..DF ║ Contin ║ ║ ║
205
+ ║ U+0800..U+0FFF ║ E0 ║ A0..BF ║ Contin ║ ║
206
+ ║ U+1000..U+CFFF ║ E1..EC ║ Contin ║ Contin ║ ║
207
+ ║ U+D000..U+D7FF ║ ED ║ 80..9F ║ Contin ║ ║
208
+ ║ U+E000..U+FFFF ║ EE..EF ║ Contin ║ Contin ║ ║
209
+ ║ U+10000..U+3FFFF ║ F0 ║ 90..BF ║ Contin ║ Contin ║
210
+ ║ U+40000..U+FFFFF ║ F1..F3 ║ Contin ║ Contin ║ Contin ║
211
+ ║ U+100000..U+10FFFF ║ F4 ║ 80..8F ║ Contin ║ Contin ║
212
+ ╚════════════════════╩════════╩════════╩════════╩════════╝
213
+
214
+ "Contin" is any continuation byte, i.e. 80..BF or 10xxxxxx
215
+ */
121
216
var isASCII = true
122
217
while let cu = unsafe iter. next ( ) {
123
218
if UTF8 . isASCII ( cu) { lastValidIndex &+= 1 ; continue }
124
219
isASCII = false
125
220
if _slowPath ( !_isUTF8MultiByteLeading( cu) ) {
126
- func fail( ) throws ( UTF8ValidationError) { throw UTF8ValidationError ( ) }
127
- try fail ( )
221
+ throw _diagnoseInvalidUTF8MultiByteLeading ( cu)
128
222
}
129
223
switch cu {
130
224
case 0xC2 ... 0xDF :
131
225
try guaranteeContinuation ( )
132
226
lastValidIndex &+= 2
133
227
case 0xE0 :
134
- try guaranteeIn ( _isNotOverlong_E0)
228
+ try guarantee ( _isNotOverlong_E0, . overlongEncodingByte )
135
229
try guaranteeContinuation ( )
136
230
lastValidIndex &+= 3
137
231
case 0xE1 ... 0xEC :
138
232
try guaranteeContinuation ( )
139
233
try guaranteeContinuation ( )
140
234
lastValidIndex &+= 3
141
235
case 0xED :
142
- try guaranteeIn ( _isNotOverlong_ED )
236
+ try guarantee ( _isNotInvalid_ED , . surrogateCodePointByte )
143
237
try guaranteeContinuation ( )
144
238
lastValidIndex &+= 3
145
239
case 0xEE ... 0xEF :
146
240
try guaranteeContinuation ( )
147
241
try guaranteeContinuation ( )
148
242
lastValidIndex &+= 3
149
243
case 0xF0 :
150
- try guaranteeIn ( _isNotOverlong_F0)
244
+ try guarantee ( _isNotOverlong_F0, . overlongEncodingByte )
151
245
try guaranteeContinuation ( )
152
246
try guaranteeContinuation ( )
153
247
lastValidIndex &+= 4
@@ -157,7 +251,8 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
157
251
try guaranteeContinuation ( )
158
252
lastValidIndex &+= 4
159
253
case 0xF4 :
160
- try guaranteeIn ( _isNotOverlong_F4)
254
+ try guarantee (
255
+ _isNotInvalid_F4, . invalidNonSurrogateCodePointByte)
161
256
try guaranteeContinuation ( )
162
257
try guaranteeContinuation ( )
163
258
lastValidIndex &+= 4
@@ -167,7 +262,9 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
167
262
}
168
263
return . success( UTF8ExtraInfo ( isASCII: isASCII) )
169
264
} catch {
170
- return unsafe . error ( toBeReplaced: findInvalidRange ( buf [ lastValidIndex... ] ) )
265
+ return unsafe . error (
266
+ kind: error,
267
+ toBeReplaced: findInvalidRange ( buf [ lastValidIndex... ] ) )
171
268
}
172
269
}
173
270
@@ -214,7 +311,7 @@ internal func repairUTF8(_ input: UnsafeBufferPointer<UInt8>, firstKnownBrokenRa
214
311
case . success:
215
312
unsafe result. appendInPlace ( remainingInput, isASCII: false )
216
313
return String ( result)
217
- case . error( let newBrokenRange) :
314
+ case . error( _ , let newBrokenRange) :
218
315
brokenRange = newBrokenRange
219
316
}
220
317
} while !remainingInput. isEmpty
0 commit comments