Skip to content

Commit f28eaa2

Browse files
committed
WIP: UTF8Span
1 parent 68546c7 commit f28eaa2

12 files changed

+2447
-0
lines changed

stdlib/public/core/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,12 @@ split_embedded_sources(
212212
EMBEDDED UTF8.swift
213213
EMBEDDED UTF16.swift
214214
EMBEDDED UTF32.swift
215+
EMBEDDED UTF8Span.swift
216+
EMBEDDED UTF8SpanBits.swift
217+
EMBEDDED UTF8SpanFundamentals.swift
218+
EMBEDDED UTF8SpanInternalHelpers.swift
219+
EMBEDDED UTF8SpanIterators.swift
220+
EMBEDDED UTF8EncodingError.swift
215221
EMBEDDED Unicode.swift # ORDER DEPENDENCY: must follow new unicode support
216222
EMBEDDED StringGraphemeBreaking.swift # ORDER DEPENDENCY: Must follow UTF16.swift
217223
EMBEDDED ValidUTF8Buffer.swift

stdlib/public/core/GroupInfo.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,14 @@
202202
"RawSpan.swift",
203203
"Span.swift"
204204
],
205+
"UTF8Span": [
206+
"UTF8EncodingError.swift",
207+
"UTF8Span.swift",
208+
"UTF8SpanBits.swift",
209+
"UTF8SpanFundamentals.swift",
210+
"UTF8SpanInternalHelpers.swift",
211+
"UTF8SpanIterators.swift"
212+
],
205213
"Protocols": [
206214
"CompilerProtocols.swift",
207215
"ShadowProtocols.swift"

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,19 @@ extension Unicode.Scalar {
463463
}
464464
}
465465

466+
extension Unicode {
  /// **QUESTION**: Hashable? Codable?
  /// **QUESTION**: Can we be frozen? What's the cost of resiliency here?
  ///
  /// A public `Sendable`, `Equatable` wrapper around the internal
  /// `_GraphemeBreakingState` value.
  public struct GraphemeBreakingState: Sendable, Equatable {
    // The wrapped internal state; only visible inside the module.
    internal var _state: _GraphemeBreakingState

    /// Creates a state whose wrapped `_GraphemeBreakingState` is
    /// default-initialized.
    public init() {
      _state = _GraphemeBreakingState()
    }
  }
}
478+
466479
internal struct _GraphemeBreakingState: Sendable, Equatable {
467480
// When we're looking through an indic sequence, one of the requirements is
468481
// that there is at LEAST 1 InCB=Linker present between two InCB=Consonant.
@@ -520,6 +533,8 @@ extension Unicode {
520533
internal var _previous: Unicode.Scalar
521534
internal var _state: _GraphemeBreakingState
522535

536+
/// Refactoring TODO: should we use a quick check result?
537+
///
523538
/// Returns a non-nil value if it can be determined whether there is a
524539
/// grapheme break between `scalar1` and `scalar2` without knowing anything
525540
/// about the scalars that precede `scalar1`. This can optionally be used as
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
extension Unicode.UTF8 {
2+
/**
3+
4+
The kind and location of a UTF-8 encoding error.
5+
6+
Valid UTF-8 is represented by this table:
7+
8+
```
9+
╔════════════════════╦════════╦════════╦════════╦════════╗
10+
║ Scalar value ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
11+
╠════════════════════╬════════╬════════╬════════╬════════╣
12+
║ U+0000..U+007F ║ 00..7F ║ ║ ║ ║
13+
║ U+0080..U+07FF ║ C2..DF ║ 80..BF ║ ║ ║
14+
║ U+0800..U+0FFF ║ E0 ║ A0..BF ║ 80..BF ║ ║
15+
║ U+1000..U+CFFF ║ E1..EC ║ 80..BF ║ 80..BF ║ ║
16+
║ U+D000..U+D7FF ║ ED ║ 80..9F ║ 80..BF ║ ║
17+
║ U+E000..U+FFFF ║ EE..EF ║ 80..BF ║ 80..BF ║ ║
18+
║ U+10000..U+3FFFF ║ F0 ║ 90..BF ║ 80..BF ║ 80..BF ║
19+
║ U+40000..U+FFFFF ║ F1..F3 ║ 80..BF ║ 80..BF ║ 80..BF ║
20+
║ U+100000..U+10FFFF ║ F4 ║ 80..8F ║ 80..BF ║ 80..BF ║
21+
╚════════════════════╩════════╩════════╩════════╩════════╝
22+
```
23+
24+
### Classifying errors
25+
26+
An *unexpected continuation* is when a continuation byte (`10xxxxxx`) occurs
27+
in a position that should be the start of a new scalar value. Unexpected
28+
continuations can often occur when the input contains arbitrary data
29+
instead of textual content. An unexpected continuation at the start of
30+
input might mean that the input was not correctly sliced along scalar
31+
boundaries or that it does not contain UTF-8.
32+
33+
A *truncated scalar* is a multi-byte sequence that is the start of a valid
34+
multi-byte scalar but is cut off before ending correctly. A truncated
35+
scalar at the end of the input might mean that only part of the entire
36+
input was received.
37+
38+
A *surrogate code point* (`U+D800..U+DFFF`) is invalid UTF-8. Surrogate
39+
code points are used by UTF-16 to encode scalars in the supplementary
40+
planes. Their presence may mean the input was encoded in a different 8-bit
41+
encoding, such as CESU-8, WTF-8, or Java's Modified UTF-8.
42+
43+
An *invalid non-surrogate code point* is any code point higher than
44+
`U+10FFFF`. This can often occur when the input is arbitrary data instead
45+
of textual content.
46+
47+
An *overlong encoding* occurs when a scalar value that could have been
48+
encoded using fewer bytes is encoded in a longer byte sequence. Overlong
49+
encodings are invalid UTF-8 and can lead to security issues if not
50+
correctly detected:
51+
52+
- https://nvd.nist.gov/vuln/detail/CVE-2008-2938
53+
- https://nvd.nist.gov/vuln/detail/CVE-2000-0884
54+
55+
An overlong encoding of `NUL`, `0xC0 0x80`, is used in Java's Modified
56+
UTF-8 but is invalid UTF-8. Overlong encoding errors often catch attempts
57+
to bypass security measures.
58+
59+
### Reporting the range of the error
60+
61+
The range of the error reported follows the *Maximal subpart of an
62+
ill-formed subsequence* algorithm in which each error is either one byte
63+
long or ends before the first byte that is disallowed. See "U+FFFD
64+
Substitution of Maximal Subparts" in the Unicode Standard. Unicode has
65+
recommended this algorithm since version 6, and the W3C has adopted it.
66+
67+
The maximal subpart algorithm will produce a single multi-byte range for a
68+
truncated scalar (a multi-byte sequence that is the start of a valid
69+
multi-byte scalar but is cut off before ending correctly). For all other
70+
errors (including overlong encodings, surrogates, and invalid code
71+
points), it will produce an error per byte.
72+
73+
Since overlong encodings, surrogates, and invalid code points are erroneous
74+
by the second byte (at the latest), the above definition produces the same
75+
ranges as defining such a sequence as a truncated scalar error followed by
76+
unexpected continuation byte errors. The more semantically-rich
77+
classification is reported.
78+
79+
For example, a surrogate code point sequence `ED A0 80` will be reported
80+
as three `.surrogateCodePointByte` errors rather than a `.truncatedScalar`
81+
followed by two `.unexpectedContinuationByte` errors.
82+
83+
Other commonly reported error ranges can be constructed from this result.
84+
For example, PEP 383's error-per-byte can be constructed by mapping over
85+
the reported range. Similarly, a single error for the longest invalid byte
86+
range can be constructed by joining adjacent error ranges.
87+
88+
```
89+
╔═════════════════╦══════╦═════╦═════╦═════╦═════╦═════╦═════╦══════╗
90+
║ ║ 61 ║ F1 ║ 80 ║ 80 ║ E1 ║ 80 ║ C2 ║ 62 ║
91+
╠═════════════════╬══════╬═════╬═════╬═════╬═════╬═════╬═════╬══════╣
92+
║ Longest range ║ U+61 ║ err ║ ║ ║ ║ ║ ║ U+62 ║
93+
║ Maximal subpart ║ U+61 ║ err ║ ║ ║ err ║ ║ err ║ U+62 ║
94+
║ Error per byte ║ U+61 ║ err ║ err ║ err ║ err ║ err ║ err ║ U+62 ║
95+
╚═════════════════╩══════╩═════╩═════╩═════╩═════╩═════╩═════╩══════╝
96+
```
97+
98+
*/
99+
@available(SwiftStdlib 6.1, *)
@frozen
public struct EncodingError: Error, Sendable, Hashable
  // TODO: embedded? , Codable
{
  /// The kind of encoding error
  public var kind: Unicode.UTF8.EncodingError.Kind

  /// The range of offsets into our input containing the error
  public var range: Range<Int>

  /// Creates an error of the given kind covering `range`.
  ///
  /// `range` may be any range expression over `Int`. It is resolved against
  /// `Int.min..<Int.max` and stored as a half-open `Range<Int>`.
  @_alwaysEmitIntoClient
  public init(
    _ kind: Unicode.UTF8.EncodingError.Kind,
    _ range: some RangeExpression<Int>
  ) {
    // Resolve against the widest possible `Int` range so that partial and
    // closed range expressions become a concrete `Range<Int>`.
    let allOffsets = Int.min..<Int.max
    self.range = range.relative(to: allOffsets)
    self.kind = kind
  }

  /// Creates an error of the given kind covering the single offset `at`.
  @_alwaysEmitIntoClient
  public init(_ kind: Unicode.UTF8.EncodingError.Kind, at: Int) {
    let singleOffset = at...at
    self.init(kind, singleOffset)
  }
}
124+
}
125+
126+
127+
@available(SwiftStdlib 6.1, *)
extension UTF8.EncodingError {
  /// The kind of encoding error encountered during validation
  ///
  /// A kind is identified by its `UInt8` raw value. The named kinds declared
  /// here use the raw values 0 through 4.
  @frozen
  public struct Kind: Error, Sendable, Hashable, RawRepresentable
    // FIXME: error unavailable in embedded swift, Codable
  {
    /// The raw value identifying this kind.
    public var rawValue: UInt8

    /// Creates a kind from a raw value. Every `UInt8` is accepted.
    @inlinable
    public init(rawValue: UInt8) {
      self.rawValue = rawValue
    }

    /// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
    @_alwaysEmitIntoClient
    public static var unexpectedContinuationByte: Self {
      return Self(rawValue: 0)
    }

    /// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
    @_alwaysEmitIntoClient
    public static var surrogateCodePointByte: Self {
      return Self(rawValue: 1)
    }

    /// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
    @_alwaysEmitIntoClient
    public static var invalidNonSurrogateCodePointByte: Self {
      return Self(rawValue: 2)
    }

    /// A byte in an overlong encoding sequence
    @_alwaysEmitIntoClient
    public static var overlongEncodingByte: Self {
      return Self(rawValue: 3)
    }

    /// A multi-byte sequence that is the start of a valid multi-byte scalar
    /// but is cut off before ending correctly
    @_alwaysEmitIntoClient
    public static var truncatedScalar: Self {
      return Self(rawValue: 4)
    }
  }
}
173+
174+
@_unavailableInEmbedded
@available(SwiftStdlib 6.1, *)
extension UTF8.EncodingError.Kind: CustomStringConvertible {
  /// A textual name for this kind.
  ///
  /// The named kinds are rendered as their member name with a leading dot
  /// (for example `.truncatedScalar`). Any other raw value is rendered as
  /// `.init(rawValue: N)`.
  public var description: String {
    switch self {
    case .invalidNonSurrogateCodePointByte:
      ".invalidNonSurrogateCodePointByte"
    case .overlongEncodingByte:
      ".overlongEncodingByte"
    case .surrogateCodePointByte:
      ".surrogateCodePointByte"
    case .truncatedScalar:
      ".truncatedScalar"
    case .unexpectedContinuationByte:
      ".unexpectedContinuationByte"
    default:
      // `init(rawValue:)` is public and accepts every `UInt8`, so this
      // branch is reachable. Describe the value instead of trapping, so
      // that printing an error never crashes the process.
      ".init(rawValue: \(rawValue))"
    }
  }
}
194+
195+
@_unavailableInEmbedded
@available(SwiftStdlib 6.1, *)
extension UTF8.EncodingError: CustomStringConvertible {
  /// A textual form of the error: `UTF8.EncodingError(<kind>, <range>)`,
  /// built by interpolating `kind` and `range`.
  public var description: String {
    return "UTF8.EncodingError(\(kind), \(range))"
  }
}
202+
203+
@available(SwiftStdlib 6.1, *)
extension UTF8 {
  /// Collects every UTF-8 encoding error found in `s`.
  ///
  /// The input is copied into an array, then validated repeatedly with
  /// `_validateUTF8(limitedBy:)`. Each thrown error is rebased to offsets
  /// into the original input and recorded, and validation resumes just past
  /// the error's upper bound until the remaining bytes validate cleanly.
  ///
  /// - Parameter s: The bytes to check.
  /// - Returns: The errors found, in the order they were encountered. The
  ///   result is empty when no error is thrown.
  public // For demo purposes
  static func _checkAllErrors(
    _ s: some Sequence<UInt8>
  ) -> some Sequence<UTF8.EncodingError> {
    // TODO: Span fast path
    // TODO: Fixed size buffer for non-contig inputs
    // TODO: Lifetime-dependent result variant
    let cus = Array(s)
    return cus.withUnsafeBytes {
      // The not-yet-validated tail of the input.
      var bufPtr = $0
      // Offset of `bufPtr`'s first byte within the original input.
      var start = 0
      var errors: [UTF8.EncodingError] = []

      // Remember the previous error, so that we can
      // apply it to subsequent bytes instead of reporting
      // just `.unexpectedContinuationByte`.
      var priorError: UTF8.EncodingError? = nil
      while true {
        // A buffer with no base address is empty, so there is nothing left
        // to validate. Avoid force-unwrapping in that case.
        guard let base = bufPtr.baseAddress else {
          return errors
        }
        do throws(UTF8.EncodingError) {
          _ = try base._validateUTF8(limitedBy: bufPtr.count)
          return errors
        } catch {
          // The thrown range is treated as relative to the current
          // `bufPtr`; shift it by `start` to get offsets into the input.
          let adjustedRange =
            error.range.lowerBound + start ..< error.range.upperBound + start

          // An unexpected continuation byte directly after the prior error
          // inherits that error's (more specific) kind.
          let kind: UTF8.EncodingError.Kind
          if let prior = priorError,
             prior.range.upperBound == adjustedRange.lowerBound,
             error.kind == .unexpectedContinuationByte
          {
            kind = prior.kind
          } else {
            kind = error.kind
          }
          let adjustedErr = UTF8.EncodingError(kind, adjustedRange)
          priorError = adjustedErr

          // Resume validation immediately after the erroneous bytes.
          let errEnd = error.range.upperBound
          start += errEnd
          bufPtr = .init(rebasing: bufPtr[errEnd...])
          errors.append(adjustedErr)
        }
      }
    }
  }
}

0 commit comments

Comments
 (0)