-
Notifications
You must be signed in to change notification settings - Fork 10.5k
[SE-0456] Span properties over utf8 views #80116
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3015bf5
a061425
af0aa94
9ae58c9
03208a6
85a9be6
6059be4
e3234f8
17412a9
9842a79
ea44ff9
0976b61
c37e152
0bbec0c
bde7a18
f3a0166
5eab446
278f76b
7539366
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
// | ||
// This source file is part of the Swift.org open source project | ||
// | ||
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors | ||
// Copyright (c) 2014 - 2025 Apple Inc. and the Swift project authors | ||
// Licensed under Apache License v2.0 with Runtime Library Exception | ||
// | ||
// See https://swift.org/LICENSE.txt for license information | ||
|
@@ -317,6 +317,49 @@ extension String.UTF8View { | |
} | ||
} | ||
|
||
extension String.UTF8View { | ||
|
||
/// A span over the UTF8 code units that make up this string. | ||
/// | ||
/// - Note: In the case of bridged UTF16 String instances (on Apple | ||
/// platforms,) this property transcodes the code units the first time | ||
/// it is called. The transcoded buffer is cached, and subsequent calls | ||
/// to `span` can reuse the buffer. | ||
/// | ||
/// Returns: a `Span` over the UTF8 code units of this String. | ||
/// | ||
/// Complexity: O(1) for native UTF8 Strings, | ||
/// amortized O(1) for bridged UTF16 Strings. | ||
@available(SwiftStdlib 6.2, *) | ||
public var span: Span<UTF8.CodeUnit> { | ||
@lifetime(borrow self) | ||
glessard marked this conversation as resolved.
Show resolved
Hide resolved
|
||
borrowing get { | ||
#if _runtime(_ObjC) | ||
// handle non-UTF8 Objective-C bridging cases here | ||
if !_guts.isFastUTF8 && _guts._object.hasObjCBridgeableObject { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It occurred to me today that we may be able to do better here. If we're careful we should be able to handle that last case without using associated objects, which will make "NoCopy" bridged ASCII NSStrings fall into the happy path here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here's the relevant bit of StringBridge.swift, we'd want a path that passes 0 instead of 1 as the argument
|
||
let storage = _guts._getOrAllocateAssociatedStorage() | ||
let (start, count) = unsafe (storage.start, storage.count) | ||
let span = unsafe Span(_unsafeStart: start, count: count) | ||
return unsafe _overrideLifetime(span, borrowing: self) | ||
} | ||
#endif | ||
let count = _guts.count | ||
if _guts.isSmall { | ||
let a = Builtin.addressOfBorrow(self) | ||
let address = unsafe UnsafePointer<UTF8.CodeUnit>(a) | ||
let span = unsafe Span(_unsafeStart: address, count: count) | ||
fatalError("Span over the small string form is not supported yet.") | ||
return unsafe _overrideLifetime(span, borrowing: self) | ||
} | ||
_precondition(_guts.isFastUTF8) | ||
let buffer = unsafe _guts._object.fastUTF8 | ||
_internalInvariant(count == buffer.count) | ||
let span = unsafe Span(_unsafeElements: buffer) | ||
return unsafe _overrideLifetime(span, borrowing: self) | ||
} | ||
} | ||
} | ||
|
||
// Index conversions | ||
extension String.UTF8View.Index { | ||
/// Creates an index in the given UTF-8 view that corresponds exactly to the | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
// | ||
// This source file is part of the Swift.org open source project | ||
// | ||
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors | ||
// Copyright (c) 2014 - 2025 Apple Inc. and the Swift project authors | ||
// Licensed under Apache License v2.0 with Runtime Library Exception | ||
// | ||
// See https://swift.org/LICENSE.txt for license information | ||
|
@@ -749,6 +749,61 @@ extension Substring.UTF8View: BidirectionalCollection { | |
} | ||
} | ||
|
||
extension Substring.UTF8View { | ||
|
||
/// A span over the UTF8 code units that make up this substring. | ||
/// | ||
/// - Note: In the case of bridged UTF16 String instances (on Apple | ||
/// platforms,) this property needs to transcode the code units every time | ||
/// it is called. | ||
/// For example, if `string` has the bridged UTF16 representation, | ||
/// for word in string.split(separator: " ") { | ||
/// useSpan(word.span) | ||
/// } | ||
/// is accidentally quadratic because of this issue. A workaround is to | ||
/// explicitly convert the string into its native UTF8 representation: | ||
/// var nativeString = consume string | ||
/// nativeString.makeContiguousUTF8() | ||
/// for word in nativeString.split(separator: " ") { | ||
/// useSpan(word.span) | ||
/// } | ||
/// This second option has linear time complexity, as expected. | ||
/// | ||
/// Returns: a `Span` over the UTF8 code units of this Substring. | ||
/// | ||
/// Complexity: O(1) for native UTF8 Strings, O(n) for bridged UTF16 Strings. | ||
@available(SwiftStdlib 6.2, *) | ||
public var span: Span<UTF8.CodeUnit> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think in this particular case it is extremely important for this property to have a doc comment that explains its complexity. Unlike the other |
||
@lifetime(borrow self) | ||
borrowing get { | ||
#if _runtime(_ObjC) | ||
// handle non-UTF8 Objective-C bridging cases here | ||
if !_wholeGuts.isFastUTF8 && _wholeGuts._object.hasObjCBridgeableObject { | ||
let base: String.UTF8View = self._base | ||
let first = base._foreignDistance(from: base.startIndex, to: startIndex) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is fine and I'm not suggesting changing it now, but I wonder if we can avoid a bunch of -characterAtIndex: calls by making a version of this that operates on the associated buffer directly There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case the indices are to the UTF16 buffer, so a translation is needed. I'd like @lorentey's eyes on this one. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The implementation looks correct to me, but I think David is right, performance is going to be a real issue. 😫 The point of these
The one-time transcoding of the base string is often amortized away to near-zero with repeated calls to Unfortunately the only way to avoid this footgun I can see is to not provide this property on this type at all; we should consider doing that! If we do end up wanting to keep this property, then I think the doc comment needs to loudly explain this major gotcha, and it should recommend strategies to avoid hitting it. (E.g., by first copying the substring into a standalone string instance.)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's worth noting that we're steadily reducing the number of cases this will apply to, but yes, we don't currently have a strategy for getting that number to zero |
||
let count = base._foreignDistance(from: startIndex, to: endIndex) | ||
let span = unsafe base.span._extracting(first..<(first &+ count)) | ||
return unsafe _overrideLifetime(span, borrowing: self) | ||
} | ||
#endif | ||
let first = _slice._startIndex._encodedOffset | ||
let end = _slice._endIndex._encodedOffset | ||
if _wholeGuts.isSmall { | ||
let a = Builtin.addressOfBorrow(self) | ||
let offset = first &+ (2 &* MemoryLayout<String.Index>.stride) | ||
let start = unsafe UnsafePointer<UTF8.CodeUnit>(a).advanced(by: offset) | ||
let span = unsafe Span(_unsafeStart: start, count: end &- first) | ||
fatalError("Span over the small string form is not supported yet.") | ||
return unsafe _overrideLifetime(span, borrowing: self) | ||
} | ||
_internalInvariant(_wholeGuts.isFastUTF8) | ||
var span = unsafe Span(_unsafeElements: _wholeGuts._object.fastUTF8) | ||
span = span._extracting(first..<end) | ||
return unsafe _overrideLifetime(span, borrowing: self) | ||
} | ||
} | ||
} | ||
|
||
extension Substring { | ||
@inlinable | ||
public var utf8: UTF8View { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
//===--- BridgedStringSpanTests.swift -------------------------------------===// | ||
// | ||
// This source file is part of the Swift.org open source project | ||
// | ||
// Copyright (c) 2025 Apple Inc. and the Swift project authors | ||
// Licensed under Apache License v2.0 with Runtime Library Exception | ||
// | ||
// See https://swift.org/LICENSE.txt for license information | ||
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
// RUN: %target-run-stdlib-swift -enable-experimental-feature LifetimeDependence | ||
|
||
// REQUIRES: executable_test | ||
// REQUIRES: objc_interop | ||
// REQUIRES: swift_feature_LifetimeDependence | ||
|
||
import StdlibUnittest | ||
|
||
import Foundation | ||
|
||
var suite = TestSuite("EagerLazyBridging String Tests") | ||
defer { runAllTests() } | ||
|
||
let strings = [ | ||
"Hello, World!", | ||
"A long ASCII string exceeding 16 code units.", | ||
"🇯🇵", | ||
"🏂☃❅❆❄︎⛄️❄️", | ||
glessard marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Enable the following once the small native string form is supported | ||
// "z", | ||
// "", | ||
] | ||
|
||
strings.forEach { expected in | ||
suite.test("Span from Bridged String Stability: \(expected)") | ||
.require(.stdlib_6_2).code { | ||
guard #available(SwiftStdlib 6.2, *) else { return } | ||
|
||
let string = NSString(utf8String: expected) | ||
guard let string else { expectNotNil(string); return } | ||
|
||
let bridged = String(string).utf8 | ||
var p: ObjectIdentifier? = nil | ||
for (i, j) in zip(0..<3, bridged.indices) { | ||
let span = bridged.span | ||
let c = span.withUnsafeBufferPointer { | ||
let o = unsafeBitCast($0.baseAddress, to: ObjectIdentifier.self) | ||
if p == nil { | ||
p = o | ||
} else { | ||
expectEqual(p, o) | ||
} | ||
return $0[i] | ||
} | ||
expectEqual(c, bridged[j]) | ||
} | ||
} | ||
} | ||
|
||
strings.forEach { expected in | ||
suite.test("Span from Bridged String: \(expected)") | ||
.require(.stdlib_6_2).code { | ||
guard #available(SwiftStdlib 6.2, *) else { return } | ||
|
||
let string = NSString(utf8String: expected) | ||
guard let string else { expectNotNil(string); return } | ||
|
||
let bridged = String(string) | ||
let utf8 = bridged.utf8 | ||
let span = utf8.span | ||
expectEqual(span.count, expected.utf8.count) | ||
for (i,j) in zip(span.indices, expected.utf8.indices) { | ||
expectEqual(span[i], expected.utf8[j]) | ||
} | ||
} | ||
} | ||
|
||
strings.forEach { expected in | ||
suite.test("Span from Bridged String Substring: \(expected)") | ||
.require(.stdlib_6_2).code { | ||
guard #available(SwiftStdlib 6.2, *) else { return } | ||
|
||
let string = NSString(utf8String: expected) | ||
guard let string else { expectNotNil(string); return } | ||
|
||
let bridged = String(string).dropFirst() | ||
let utf8 = bridged.utf8 | ||
let span = utf8.span | ||
let expected = expected.dropFirst() | ||
expectEqual(span.count, expected.utf8.count) | ||
for (i,j) in zip(span.indices, expected.utf8.indices) { | ||
expectEqual(span[i], expected.utf8[j]) | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we're missing an
#if !(os(watchOS) && _pointerBitWidth(_32))
around this. 😭There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, and we'll need a doc comment, too -- the complexity (amortized over many calls) is particularly relevant.