Skip to content

Add in ASCII fast-path for anyNonNewline #654

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion Sources/_StringProcessing/Engine/MEBuiltins.swift
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,69 @@ extension Processor {
}
}

// MARK: Built-in character class matching
// MARK: Matching `.`
extension String {

func _matchAnyNonNewline(
at currentPosition: String.Index,
isScalarSemantics: Bool
) -> String.Index? {
guard currentPosition < endIndex else {
return nil
}
if case .definite(let result) = _quickMatchAnyNonNewline(
at: currentPosition,
isScalarSemantics: isScalarSemantics
) {
assert(result == _thoroughMatchAnyNonNewline(
at: currentPosition,
isScalarSemantics: isScalarSemantics))
return result
}
return _thoroughMatchAnyNonNewline(
at: currentPosition,
isScalarSemantics: isScalarSemantics)
}

@inline(__always)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How much of the improvement do you think is from this check getting inlined vs the actual code difference? Also, what's the larger impact here? Does inlining this function make QuickResult and _quickASCIICharacter ABI?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no ABI here. The main point of the inline annotation pattern is to eliminate the generic struct in the middle, and because this is the fast/common path.

func _quickMatchAnyNonNewline(
at currentPosition: String.Index,
isScalarSemantics: Bool
) -> QuickResult<String.Index?> {
assert(currentPosition < endIndex)
guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
at: currentPosition
) else {
return .unknown
}
switch asciiValue {
case ._lineFeed, ._carriageReturn:
return .definite(nil)
default:
assert(!isCRLF)
return .definite(next)
}
}

@inline(never)
func _thoroughMatchAnyNonNewline(
at currentPosition: String.Index,
isScalarSemantics: Bool
) -> String.Index? {
assert(currentPosition < endIndex)
if isScalarSemantics {
let scalar = unicodeScalars[currentPosition]
guard !scalar.isNewline else { return nil }
return unicodeScalars.index(after: currentPosition)
}

let char = self[currentPosition]
guard !char.isNewline else { return nil }
return index(after: currentPosition)
}
}

// MARK: - Built-in character class matching
extension String {

// Mentioned in ProgrammersManual.md, update docs if redesigned
Expand Down
33 changes: 11 additions & 22 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -346,23 +346,18 @@ extension Processor {
return true
}

// Matches the next character if it is not a newline
mutating func matchAnyNonNewline() -> Bool {
guard let c = load(), !c.isNewline else {
signalFailure()
return false
}
_uncheckedForcedConsumeOne()
return true
}

// Matches the next scalar if it is not a newline
mutating func matchAnyNonNewlineScalar() -> Bool {
guard let s = loadScalar(), !s.isNewline else {
// Matches the next character/scalar if it is not a newline
mutating func matchAnyNonNewline(
isScalarSemantics: Bool
) -> Bool {
guard let next = input._matchAnyNonNewline(
at: currentPosition,
isScalarSemantics: isScalarSemantics
) else {
signalFailure()
return false
}
input.unicodeScalars.formIndex(after: &currentPosition)
currentPosition = next
return true
}

Expand Down Expand Up @@ -535,14 +530,8 @@ extension Processor {
}
}
case .matchAnyNonNewline:
if payload.isScalar {
if matchAnyNonNewlineScalar() {
controller.step()
}
} else {
if matchAnyNonNewline() {
controller.step()
}
if matchAnyNonNewline(isScalarSemantics: payload.isScalar) {
controller.step()
}
case .match:
let (isCaseInsensitive, reg) = payload.elementPayload
Expand Down
34 changes: 18 additions & 16 deletions Sources/_StringProcessing/Unicode/ASCII.swift
Original file line number Diff line number Diff line change
Expand Up @@ -9,26 +9,25 @@
//
//===----------------------------------------------------------------------===//

private var _lineFeed: UInt8 { 0x0A }
private var _carriageReturn: UInt8 { 0x0D }
private var _lineTab: UInt8 { 0x0B }
private var _formFeed: UInt8 { 0x0C }
private var _space: UInt8 { 0x20 }
private var _tab: UInt8 { 0x09 }
extension UInt8 {
static var _lineFeed: UInt8 { 0x0A }
static var _carriageReturn: UInt8 { 0x0D }
static var _lineTab: UInt8 { 0x0B }
static var _formFeed: UInt8 { 0x0C }
static var _space: UInt8 { 0x20 }
static var _tab: UInt8 { 0x09 }

static var _underscore: UInt8 { 0x5F }
}

private var _0: UInt8 { 0x30 }
private var _9: UInt8 { 0x39 }
private func _isASCIINumber(_ x: UInt8) -> Bool {
return (_0..._9).contains(x)
}

private var _a: UInt8 { 0x61 }
private var _z: UInt8 { 0x7A }
private var _A: UInt8 { 0x41 }
private var _Z: UInt8 { 0x5A }

private var _underscore: UInt8 { 0x5F }

extension UInt8 {
var _isASCII: Bool { self < 0x80 }

Expand All @@ -43,14 +42,14 @@ extension UInt8 {
/// Assuming we're ASCII, whether we match `\h`
var _asciiIsHorizontalWhitespace: Bool {
assert(_isASCII)
return self == _space || self == _tab
return self == ._space || self == ._tab
}

/// Assuming we're ASCII, whether we match `\v`
var _asciiIsVerticalWhitespace: Bool {
assert(_isASCII)
switch self {
case _lineFeed, _carriageReturn, _lineTab, _formFeed:
case ._lineFeed, ._carriageReturn, ._lineTab, ._formFeed:
return true
default:
return false
Expand All @@ -61,7 +60,7 @@ extension UInt8 {
var _asciiIsWhitespace: Bool {
assert(_isASCII)
switch self {
case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn:
case ._space, ._tab, ._lineFeed, ._lineTab, ._formFeed, ._carriageReturn:
return true
default:
return false
Expand All @@ -77,11 +76,13 @@ extension UInt8 {
/// Assuming we're ASCII, whether we match `\w`
var _asciiIsWord: Bool {
assert(_isASCII)
return _asciiIsDigit || _asciiIsLetter || self == _underscore
return _asciiIsDigit || _asciiIsLetter || self == ._underscore
}
}

extension String {
/// TODO: better to take isScalarSemantics parameter, we can return more results
/// and we can give the right `next` index, not requiring the caller to re-adjust it
/// TODO: detailed description of nuanced semantics
func _quickASCIICharacter(
at idx: Index
Expand All @@ -107,7 +108,7 @@ extension String {
guard tail._isSub300StartingByte else { return nil }

// Handle CR-LF:
if base == _carriageReturn && tail == _lineFeed {
if base == ._carriageReturn && tail == ._lineFeed {
utf8.formIndex(after: &next)
guard next == endIndex || utf8[next]._isSub300StartingByte else {
return nil
Expand Down Expand Up @@ -165,5 +166,6 @@ extension String {
return (next, asciiValue._asciiIsWord)
}
}

}