|
extension Unicode.UTF8 {
  /**

   The kind and location of a UTF-8 encoding error.

   Valid UTF-8 is represented by this table:

   ```
   ╔════════════════════╦════════╦════════╦════════╦════════╗
   ║    Scalar value    ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
   ╠════════════════════╬════════╬════════╬════════╬════════╣
   ║ U+0000..U+007F     ║ 00..7F ║        ║        ║        ║
   ║ U+0080..U+07FF     ║ C2..DF ║ 80..BF ║        ║        ║
   ║ U+0800..U+0FFF     ║ E0     ║ A0..BF ║ 80..BF ║        ║
   ║ U+1000..U+CFFF     ║ E1..EC ║ 80..BF ║ 80..BF ║        ║
   ║ U+D000..U+D7FF     ║ ED     ║ 80..9F ║ 80..BF ║        ║
   ║ U+E000..U+FFFF     ║ EE..EF ║ 80..BF ║ 80..BF ║        ║
   ║ U+10000..U+3FFFF   ║ F0     ║ 90..BF ║ 80..BF ║ 80..BF ║
   ║ U+40000..U+FFFFF   ║ F1..F3 ║ 80..BF ║ 80..BF ║ 80..BF ║
   ║ U+100000..U+10FFFF ║ F4     ║ 80..8F ║ 80..BF ║ 80..BF ║
   ╚════════════════════╩════════╩════════╩════════╩════════╝
   ```

   ### Classifying errors

   An *unexpected continuation* is when a continuation byte (`10xxxxxx`) occurs
   in a position that should be the start of a new scalar value. Unexpected
   continuations can often occur when the input contains arbitrary data
   instead of textual content. An unexpected continuation at the start of
   input might mean that the input was not correctly sliced along scalar
   boundaries or that it does not contain UTF-8.

   A *truncated scalar* is a multi-byte sequence that is the start of a valid
   multi-byte scalar but is cut off before ending correctly. A truncated
   scalar at the end of the input might mean that only part of the entire
   input was received.

   A *surrogate code point* (`U+D800..U+DFFF`) is invalid UTF-8. Surrogate
   code points are used by UTF-16 to encode scalars in the supplementary
   planes. Their presence may mean the input was encoded in a different 8-bit
   encoding, such as CESU-8, WTF-8, or Java's Modified UTF-8.

   An *invalid non-surrogate code point* is any code point higher than
   `U+10FFFF`. This can often occur when the input is arbitrary data instead
   of textual content.

   An *overlong encoding* occurs when a scalar value that could have been
   encoded using fewer bytes is encoded in a longer byte sequence. Overlong
   encodings are invalid UTF-8 and can lead to security issues if not
   correctly detected:

   - https://nvd.nist.gov/vuln/detail/CVE-2008-2938
   - https://nvd.nist.gov/vuln/detail/CVE-2000-0884

   An overlong encoding of `NUL`, `0xC0 0x80`, is used in Java's Modified
   UTF-8 but is invalid UTF-8. Overlong encoding errors often catch attempts
   to bypass security measures.

   ### Reporting the range of the error

   The range of the error reported follows the *Maximal subpart of an
   ill-formed subsequence* algorithm in which each error is either one byte
   long or ends before the first byte that is disallowed. See "U+FFFD
   Substitution of Maximal Subparts" in the Unicode Standard. Unicode started
   recommending this algorithm in version 6, and it is also adopted by the
   W3C.

   The maximal subpart algorithm will produce a single multi-byte range for a
   truncated scalar (a multi-byte sequence that is the start of a valid
   multi-byte scalar but is cut off before ending correctly). For all other
   errors (including overlong encodings, surrogates, and invalid code
   points), it will produce an error per byte.

   Since overlong encodings, surrogates, and invalid code points are erroneous
   by the second byte (at the latest), the above definition produces the same
   ranges as defining such a sequence as a truncated scalar error followed by
   unexpected continuation byte errors. The more semantically-rich
   classification is reported.

   For example, a surrogate code point sequence `ED A0 80` will be reported
   as three `.surrogateCodePointByte` errors rather than a `.truncatedScalar`
   followed by two `.unexpectedContinuationByte` errors.

   Other commonly reported error ranges can be constructed from this result.
   For example, PEP 383's error-per-byte can be constructed by mapping over
   the reported range. Similarly, a single error for the longest invalid byte
   range can be constructed by joining adjacent error ranges.

   ```
   ╔═════════════════╦══════╦═════╦═════╦═════╦═════╦═════╦═════╦══════╗
   ║                 ║  61  ║ F1  ║ 80  ║ 80  ║ E1  ║ 80  ║ C2  ║  62  ║
   ╠═════════════════╬══════╬═════╬═════╬═════╬═════╬═════╬═════╬══════╣
   ║ Longest range   ║ U+61 ║ err ║     ║     ║     ║     ║     ║ U+62 ║
   ║ Maximal subpart ║ U+61 ║ err ║     ║     ║ err ║     ║ err ║ U+62 ║
   ║ Error per byte  ║ U+61 ║ err ║ err ║ err ║ err ║ err ║ err ║ U+62 ║
   ╚═════════════════╩══════╩═════╩═════╩═════╩═════╩═════╩═════╩══════╝
   ```

   */
  @available(SwiftStdlib 6.1, *)
  @frozen
  public struct EncodingError: Error, Sendable, Hashable
    // TODO: embedded? , Codable
  {
    /// The classification of this encoding error.
    public var kind: Unicode.UTF8.EncodingError.Kind

    /// The offsets into the validated input that this error covers.
    public var range: Range<Int>

    /// Creates an error of the given kind covering `range`.
    ///
    /// - Parameters:
    ///   - kind: The classification of the error.
    ///   - range: The offsets covered by the error. Any range expression is
    ///     accepted; it is resolved against `Int.min..<Int.max`.
    @_alwaysEmitIntoClient
    public init(
      _ kind: Unicode.UTF8.EncodingError.Kind,
      _ range: some RangeExpression<Int>
    ) {
      // Resolve partial and closed ranges against the widest half-open span
      // of `Int` so that the stored value is always a plain `Range<Int>`.
      let allOffsets = Int.min..<Int.max
      self.kind = kind
      self.range = range.relative(to: allOffsets)
    }

    /// Creates an error of the given kind covering the single offset `at`.
    ///
    /// - Parameters:
    ///   - kind: The classification of the error.
    ///   - at: The offset of the one byte covered by the error.
    @_alwaysEmitIntoClient
    public init(_ kind: Unicode.UTF8.EncodingError.Kind, at offset: Int) {
      self.init(kind, offset...offset)
    }
  }
}
| 125 | + |
| 126 | + |
@available(SwiftStdlib 6.1, *)
extension UTF8.EncodingError {
  /// The classification of an encoding error encountered during validation.
  ///
  /// A `Kind` wraps a `UInt8` raw value. The named kinds below are exposed as
  /// static properties whose raw values are `0` through `4`.
  @frozen
  public struct Kind: Error, Sendable, Hashable, RawRepresentable
    // FIXME: error unavailable in embedded swift, Codable
  {
    /// The underlying value that identifies this kind.
    public var rawValue: UInt8

    /// Creates a kind that wraps `rawValue` as-is.
    @inlinable
    public init(rawValue: UInt8) {
      self.rawValue = rawValue
    }

    /// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence.
    @_alwaysEmitIntoClient
    public static var unexpectedContinuationByte: Self {
      Self(rawValue: 0)
    }

    /// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence.
    @_alwaysEmitIntoClient
    public static var surrogateCodePointByte: Self {
      Self(rawValue: 1)
    }

    /// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence.
    @_alwaysEmitIntoClient
    public static var invalidNonSurrogateCodePointByte: Self {
      Self(rawValue: 2)
    }

    /// A byte in an overlong encoding sequence.
    @_alwaysEmitIntoClient
    public static var overlongEncodingByte: Self {
      Self(rawValue: 3)
    }

    /// A multi-byte sequence that is the start of a valid multi-byte scalar
    /// but is cut off before ending correctly.
    @_alwaysEmitIntoClient
    public static var truncatedScalar: Self {
      Self(rawValue: 4)
    }
  }
}
| 173 | + |
@_unavailableInEmbedded
@available(SwiftStdlib 6.1, *)
extension UTF8.EncodingError.Kind: CustomStringConvertible {
  /// A textual representation of this kind.
  ///
  /// The five named kinds are rendered as their leading-dot member name
  /// (for example `".truncatedScalar"`). Any other raw value is rendered as
  /// `"UTF8.EncodingError.Kind(rawValue: N)"`.
  public var description: String {
    switch self {
    case .invalidNonSurrogateCodePointByte:
      ".invalidNonSurrogateCodePointByte"
    case .overlongEncodingByte:
      ".overlongEncodingByte"
    case .surrogateCodePointByte:
      ".surrogateCodePointByte"
    case .truncatedScalar:
      ".truncatedScalar"
    case .unexpectedContinuationByte:
      ".unexpectedContinuationByte"
    default:
      // `init(rawValue:)` is public and accepts every `UInt8`, so values
      // outside the named kinds are constructible. Describing one must not
      // trap, since errors are routinely printed and logged.
      "UTF8.EncodingError.Kind(rawValue: \(rawValue))"
    }
  }
}
| 194 | + |
@_unavailableInEmbedded
@available(SwiftStdlib 6.1, *)
extension UTF8.EncodingError: CustomStringConvertible {
  /// A textual representation of this error, combining its `kind` and the
  /// `range` of offsets it covers.
  public var description: String {
    return "UTF8.EncodingError(\(kind), \(range))"
  }
}
| 202 | + |
@available(SwiftStdlib 6.1, *)
extension UTF8 {
  /// Validates `s` as UTF-8 and collects every encoding error encountered.
  ///
  /// The input is copied into an array and validated repeatedly: after each
  /// reported error, validation resumes at the byte following that error's
  /// range. Reported ranges are offsets into the whole input.
  ///
  /// An `.unexpectedContinuationByte` error that starts exactly where the
  /// previously reported error ended is reported with the previous error's
  /// kind instead.
  ///
  /// - Parameter s: The bytes to validate.
  /// - Returns: The errors found, in the order they were encountered. The
  ///   result is empty when validation reports no error.
  public // For demo purposes
  static func _checkAllErrors(
    _ s: some Sequence<UInt8>
  ) -> some Sequence<UTF8.EncodingError> {
    // TODO: Span fast path
    // TODO: Fixed size buffer for non-contig inputs
    // TODO: Lifetime-dependent result variant
    let cus = Array(s)
    return cus.withUnsafeBytes { bytes -> [UTF8.EncodingError] in
      // The not-yet-validated tail of the input.
      var bufPtr = bytes
      // Offset of `bufPtr`'s first byte within the whole input.
      var start = 0
      var errors: [UTF8.EncodingError] = []

      // Remember the previous error, so that we can
      // apply it to subsequent bytes instead of reporting
      // just `.unexpectedContinuationByte`.
      var priorError: UTF8.EncodingError? = nil
      while true {
        // An empty buffer is allowed to have a nil base address. There is
        // nothing left to validate in that case, so finish instead of
        // force-unwrapping.
        guard let base = bufPtr.baseAddress else {
          return errors
        }
        do throws(UTF8.EncodingError) {
          _ = try base._validateUTF8(limitedBy: bufPtr.count)
          return errors
        } catch {
          // The thrown range is treated as relative to the start of
          // `bufPtr`; shift it so it indexes into the whole input.
          let adjustedRange =
            error.range.lowerBound + start ..< error.range.upperBound + start

          let kind: UTF8.EncodingError.Kind
          if let prior = priorError,
             prior.range.upperBound == adjustedRange.lowerBound,
             error.kind == .unexpectedContinuationByte
          {
            // A continuation byte directly after an error belongs to the
            // same ill-formed sequence, so keep the richer classification.
            kind = prior.kind
          } else {
            kind = error.kind
          }
          let adjustedErr = UTF8.EncodingError(kind, adjustedRange)
          priorError = adjustedErr

          // Resume validation at the first byte after the reported error.
          let errEnd = error.range.upperBound
          start += errEnd
          bufPtr = .init(rebasing: bufPtr[errEnd...])
          errors.append(adjustedErr)
        }
      }
    }
  }
}
0 commit comments