Skip to content

[SE-0456] Span properties over utf8 views #80116

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Apr 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 88 additions & 1 deletion stdlib/public/core/StringGuts.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
// Copyright (c) 2014 - 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
Expand Down Expand Up @@ -417,6 +417,93 @@ extension _StringGuts {
}
}

#if _runtime(_ObjC)
extension _StringGuts {

private static var _associationKey: UnsafeRawPointer {
struct AssociationKey {}
// We never dereference this, we only use this address as a unique key
return unsafe unsafeBitCast(
ObjectIdentifier(AssociationKey.self),
to: UnsafeRawPointer.self
)
}

private func _getAssociatedStorage() -> __StringStorage? {
_internalInvariant(_object.hasObjCBridgeableObject)
let getter = unsafe unsafeBitCast(
getGetAssociatedObjectPtr(),
to: (@convention(c)(
AnyObject,
UnsafeRawPointer
) -> UnsafeRawPointer?).self
)

if let assocPtr = unsafe getter(
_object.objCBridgeableObject,
Self._associationKey
) {
let storage: __StringStorage
storage = unsafe Unmanaged.fromOpaque(assocPtr).takeUnretainedValue()
return storage
}
return nil
}

private func _setAssociatedStorage(_ storage: __StringStorage) {
_internalInvariant(_object.hasObjCBridgeableObject)
let setter = unsafe unsafeBitCast(
getSetAssociatedObjectPtr(),
to: (@convention(c)(
AnyObject,
UnsafeRawPointer,
AnyObject?,
UInt
) -> Void).self
)

unsafe setter(
_object.objCBridgeableObject,
Self._associationKey,
storage,
1 //OBJC_ASSOCIATION_RETAIN_NONATOMIC
)
}

internal func _getOrAllocateAssociatedStorage() -> __StringStorage {
_internalInvariant(_object.hasObjCBridgeableObject)
let unwrapped: __StringStorage
// libobjc already provides the necessary memory barriers for
// double checked locking to be safe, per comments on
// https://github.com/swiftlang/swift/pull/75148
if let storage = _getAssociatedStorage() {
unwrapped = storage
} else {
let lock = _object.objCBridgeableObject
objc_sync_enter(lock)
if let storage = _getAssociatedStorage() {
unwrapped = storage
} else {
var contents = String.UnicodeScalarView()
// always reserve a capacity larger than a small string
contents.reserveCapacity(
Swift.max(_SmallString.capacity + 1, count + count >> 1)
)
for c in String.UnicodeScalarView(self) {
contents.append(c)
}
_precondition(contents._guts._object.hasNativeStorage)
unwrapped = (consume contents)._guts._object.nativeStorage
_setAssociatedStorage(unwrapped)
}
defer { _fixLifetime(unwrapped) }
objc_sync_exit(lock)
}
return unwrapped
}
}
#endif

// Old SPI(corelibs-foundation)
extension _StringGuts {
public // SPI(corelibs-foundation)
Expand Down
45 changes: 44 additions & 1 deletion stdlib/public/core/StringUTF8View.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Copyright (c) 2014 - 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
Expand Down Expand Up @@ -317,6 +317,49 @@ extension String.UTF8View {
}
}

extension String.UTF8View {

/// A span over the UTF8 code units that make up this string.
///
/// - Note: In the case of bridged UTF16 String instances (on Apple
/// platforms,) this property transcodes the code units the first time
/// it is called. The transcoded buffer is cached, and subsequent calls
/// to `span` can reuse the buffer.
///
/// Returns: a `Span` over the UTF8 code units of this String.
///
/// Complexity: O(1) for native UTF8 Strings,
/// amortized O(1) for bridged UTF16 Strings.
@available(SwiftStdlib 6.2, *)
public var span: Span<UTF8.CodeUnit> {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we're missing an #if !(os(watchOS) && _pointerBitWidth(_32)) around this. 😭

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, and we'll need a doc comment, too -- the complexity (amortized over many calls) is particularly relevant.

@lifetime(borrow self)
borrowing get {
#if _runtime(_ObjC)
// handle non-UTF8 Objective-C bridging cases here
if !_guts.isFastUTF8 && _guts._object.hasObjCBridgeableObject {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It occurred to me today that we may be able to do better here. isFastUTF8 is not actually a binary property, even though most uses of it in practice can treat it that way. It's actually a 3-state thing: yes, no, yes-but-not-terminated.

If we're careful we should be able to handle that last case without using associated objects, which will make "NoCopy" bridged ASCII NSStrings fall into the happy path here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's the relevant bit of StringBridge.swift, we'd want a path that passes 0 instead of 1 as the argument

@_effects(readonly)
private func _NSStringASCIIPointer(_ str: _StringSelectorHolder) -> UnsafePointer<UInt8>? {
  //TODO(String bridging): Unconditionally asking for nul-terminated contents is
  // overly conservative and hurts perf with some NSStrings
  return unsafe str._fastCStringContents(1)?._asUInt8
}

let storage = _guts._getOrAllocateAssociatedStorage()
let (start, count) = unsafe (storage.start, storage.count)
let span = unsafe Span(_unsafeStart: start, count: count)
return unsafe _overrideLifetime(span, borrowing: self)
}
#endif
let count = _guts.count
if _guts.isSmall {
let a = Builtin.addressOfBorrow(self)
let address = unsafe UnsafePointer<UTF8.CodeUnit>(a)
let span = unsafe Span(_unsafeStart: address, count: count)
fatalError("Span over the small string form is not supported yet.")
return unsafe _overrideLifetime(span, borrowing: self)
}
_precondition(_guts.isFastUTF8)
let buffer = unsafe _guts._object.fastUTF8
_internalInvariant(count == buffer.count)
let span = unsafe Span(_unsafeElements: buffer)
return unsafe _overrideLifetime(span, borrowing: self)
}
}
}

// Index conversions
extension String.UTF8View.Index {
/// Creates an index in the given UTF-8 view that corresponds exactly to the
Expand Down
57 changes: 56 additions & 1 deletion stdlib/public/core/Substring.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Copyright (c) 2014 - 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
Expand Down Expand Up @@ -749,6 +749,61 @@ extension Substring.UTF8View: BidirectionalCollection {
}
}

extension Substring.UTF8View {

/// A span over the UTF8 code units that make up this substring.
///
/// - Note: In the case of bridged UTF16 String instances (on Apple
/// platforms,) this property needs to transcode the code units every time
/// it is called.
/// For example, if `string` has the bridged UTF16 representation,
/// for word in string.split(separator: " ") {
/// useSpan(word.span)
/// }
/// is accidentally quadratic because of this issue. A workaround is to
/// explicitly convert the string into its native UTF8 representation:
/// var nativeString = consume string
/// nativeString.makeContiguousUTF8()
/// for word in nativeString.split(separator: " ") {
/// useSpan(word.span)
/// }
/// This second option has linear time complexity, as expected.
///
/// Returns: a `Span` over the UTF8 code units of this Substring.
///
/// Complexity: O(1) for native UTF8 Strings, O(n) for bridged UTF16 Strings.
@available(SwiftStdlib 6.2, *)
public var span: Span<UTF8.CodeUnit> {
Copy link
Member

@lorentey lorentey Mar 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in this particular case it is extremely important for this property to have a doc comment that explains its complexity.

Unlike the other span properties, this one has linear time complexity for bridged strings, and that requires developers to be careful to avoid calling it in a loop. (E.g., most Substring.UTF8View operations in the stdlib (and elsewhere) must not use this property, as they have no way to avoid getting called in a loop.)

@lifetime(borrow self)
borrowing get {
#if _runtime(_ObjC)
// handle non-UTF8 Objective-C bridging cases here
if !_wholeGuts.isFastUTF8 && _wholeGuts._object.hasObjCBridgeableObject {
let base: String.UTF8View = self._base
let first = base._foreignDistance(from: base.startIndex, to: startIndex)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fine and I'm not suggesting changing it now, but I wonder if we can avoid a bunch of -characterAtIndex: calls by making a version of this that operates on the associated buffer directly

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case the indices are to the UTF16 buffer, so a translation is needed. I'd like @lorentey's eyes on this one.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The implementation looks correct to me, but I think David is right, performance is going to be a real issue. 😫

The point of these span properties is that they provide fast direct access to in-memory storage; doing a full UTF-16 → UTF-8 transcoding of an arbitrarily large prefix in the base string (plus the entire contents of the substring itself) every time span is accessed is violating this goal. We want to encourage developers to think of these properties as cheap enough to call whenever and wherever they need them; loops like the one below should not raise any eyebrows:

for word in string.split(separator: / /) {
  let span = word.span // oops
  ...
}

The one-time transcoding of the base string is often amortized away to near-zero with repeated calls to span, but the two index conversions in the substring's implementation will be repeated from scratch on every iteration. Straightforward-looking loops will become accidentally quadratic.

Unfortunately the only way to avoid this footgun I can see is to not provide this property on this type at all; we should consider doing that!

If we do end up wanting to keep this property, then I think the doc comment needs to loudly explain this major gotcha, and it should recommend strategies to avoid hitting it. (E.g., by first copying the substring into a standalone string instance.)

for word in string.split(separator: / /) {
  let str = String(word)
  let span = str.span // OK
  ...
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's worth noting that we're steadily reducing the number of cases this will apply to, but yes, we don't currently have a strategy for getting that number to zero

let count = base._foreignDistance(from: startIndex, to: endIndex)
let span = unsafe base.span._extracting(first..<(first &+ count))
return unsafe _overrideLifetime(span, borrowing: self)
}
#endif
let first = _slice._startIndex._encodedOffset
let end = _slice._endIndex._encodedOffset
if _wholeGuts.isSmall {
let a = Builtin.addressOfBorrow(self)
let offset = first &+ (2 &* MemoryLayout<String.Index>.stride)
let start = unsafe UnsafePointer<UTF8.CodeUnit>(a).advanced(by: offset)
let span = unsafe Span(_unsafeStart: start, count: end &- first)
fatalError("Span over the small string form is not supported yet.")
return unsafe _overrideLifetime(span, borrowing: self)
}
_internalInvariant(_wholeGuts.isFastUTF8)
var span = unsafe Span(_unsafeElements: _wholeGuts._object.fastUTF8)
span = span._extracting(first..<end)
return unsafe _overrideLifetime(span, borrowing: self)
}
}
}

extension Substring {
@inlinable
public var utf8: UTF8View {
Expand Down
4 changes: 4 additions & 0 deletions test/abi/macOS/arm64/stdlib.swift
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,10 @@ Added: _$ss13KeyValuePairsV4spans4SpanVyx3key_q_5valuetGvpMV
Added: _$ss15CollectionOfOneV4spans4SpanVyxGvpMV
Added: _$ss15ContiguousArrayV4spans4SpanVyxGvpMV
Added: _$ss4SpanVss15BitwiseCopyableRzlE5bytess03RawA0VvpMV
Added: _$sSS8UTF8ViewV4spans4SpanVys5UInt8VGvg
Added: _$sSS8UTF8ViewV4spans4SpanVys5UInt8VGvpMV
Added: _$sSs8UTF8ViewV4spans4SpanVys5UInt8VGvg
Added: _$sSs8UTF8ViewV4spans4SpanVys5UInt8VGvpMV

// SE-0467 mutableSpan properties
Added: _$sSa11mutableSpans07MutableB0VyxGvr
Expand Down
4 changes: 4 additions & 0 deletions test/abi/macOS/x86_64/stdlib.swift
Original file line number Diff line number Diff line change
Expand Up @@ -838,6 +838,10 @@ Added: _$ss13KeyValuePairsV4spans4SpanVyx3key_q_5valuetGvpMV
Added: _$ss15CollectionOfOneV4spans4SpanVyxGvpMV
Added: _$ss15ContiguousArrayV4spans4SpanVyxGvpMV
Added: _$ss4SpanVss15BitwiseCopyableRzlE5bytess03RawA0VvpMV
Added: _$sSS8UTF8ViewV4spans4SpanVys5UInt8VGvg
Added: _$sSS8UTF8ViewV4spans4SpanVys5UInt8VGvpMV
Added: _$sSs8UTF8ViewV4spans4SpanVys5UInt8VGvg
Added: _$sSs8UTF8ViewV4spans4SpanVys5UInt8VGvpMV

// SE-0467 mutableSpan properties
Added: _$sSa11mutableSpans07MutableB0VyxGvr
Expand Down
97 changes: 97 additions & 0 deletions test/stdlib/Span/BridgedStringUTF8ViewSpanTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
//===--- BridgedStringSpanTests.swift -------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// RUN: %target-run-stdlib-swift -enable-experimental-feature LifetimeDependence

// REQUIRES: executable_test
// REQUIRES: objc_interop
// REQUIRES: swift_feature_LifetimeDependence

import StdlibUnittest

import Foundation

var suite = TestSuite("EagerLazyBridging String Tests")
defer { runAllTests() }

let strings = [
"Hello, World!",
"A long ASCII string exceeding 16 code units.",
"🇯🇵",
"🏂☃❅❆❄︎⛄️❄️",
// Enable the following once the small native string form is supported
// "z",
// "",
]

strings.forEach { expected in
suite.test("Span from Bridged String Stability: \(expected)")
.require(.stdlib_6_2).code {
guard #available(SwiftStdlib 6.2, *) else { return }

let string = NSString(utf8String: expected)
guard let string else { expectNotNil(string); return }

let bridged = String(string).utf8
var p: ObjectIdentifier? = nil
for (i, j) in zip(0..<3, bridged.indices) {
let span = bridged.span
let c = span.withUnsafeBufferPointer {
let o = unsafeBitCast($0.baseAddress, to: ObjectIdentifier.self)
if p == nil {
p = o
} else {
expectEqual(p, o)
}
return $0[i]
}
expectEqual(c, bridged[j])
}
}
}

strings.forEach { expected in
suite.test("Span from Bridged String: \(expected)")
.require(.stdlib_6_2).code {
guard #available(SwiftStdlib 6.2, *) else { return }

let string = NSString(utf8String: expected)
guard let string else { expectNotNil(string); return }

let bridged = String(string)
let utf8 = bridged.utf8
let span = utf8.span
expectEqual(span.count, expected.utf8.count)
for (i,j) in zip(span.indices, expected.utf8.indices) {
expectEqual(span[i], expected.utf8[j])
}
}
}

strings.forEach { expected in
suite.test("Span from Bridged String Substring: \(expected)")
.require(.stdlib_6_2).code {
guard #available(SwiftStdlib 6.2, *) else { return }

let string = NSString(utf8String: expected)
guard let string else { expectNotNil(string); return }

let bridged = String(string).dropFirst()
let utf8 = bridged.utf8
let span = utf8.span
let expected = expected.dropFirst()
expectEqual(span.count, expected.utf8.count)
for (i,j) in zip(span.indices, expected.utf8.indices) {
expectEqual(span[i], expected.utf8[j])
}
}
}
Loading