-
Notifications
You must be signed in to change notification settings - Fork 49
Handle boundaries when matching in substrings #675
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5b157cf
e694693
9024254
6160220
b331bc0
3cd121a
5114ea4
656c388
0f4f005
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,9 +15,10 @@ extension Processor { | |
isStrictASCII: Bool, | ||
isScalarSemantics: Bool | ||
) -> Bool { | ||
guard let next = input.matchBuiltinCC( | ||
guard currentPosition < end, let next = input.matchBuiltinCC( | ||
cc, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isInverted: isInverted, | ||
isStrictASCII: isStrictASCII, | ||
isScalarSemantics: isScalarSemantics | ||
|
@@ -102,56 +103,96 @@ extension Processor { | |
|
||
case .wordBoundary: | ||
if payload.usesSimpleUnicodeBoundaries { | ||
// TODO: How should we handle bounds? | ||
return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) | ||
} else { | ||
return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) | ||
return input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex) | ||
} | ||
|
||
case .notWordBoundary: | ||
if payload.usesSimpleUnicodeBoundaries { | ||
// TODO: How should we handle bounds? | ||
return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) | ||
} else { | ||
return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) | ||
return !input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex) | ||
} | ||
} | ||
} | ||
} | ||
|
||
// MARK: Matching `.` | ||
extension String { | ||
// TODO: Should the below have a `limitedBy` parameter? | ||
/// Returns the character at `pos`, bounded by `end`, as well as the upper | ||
/// boundary of the returned character. | ||
/// | ||
/// This function handles loading a character from a string while respecting | ||
/// an end boundary, even if that end boundary is sub-character or sub-scalar. | ||
/// | ||
/// - If `pos` is at or past `end`, this function returns `nil`. | ||
/// - If `end` is between `pos` and the next grapheme cluster boundary (i.e., | ||
/// `end` is before `self.index(after: pos)`, then the returned character | ||
/// is smaller than the one that would be produced by `self[pos]` and the | ||
/// returned index is at the end of that character. | ||
/// - If `end` is between `pos` and the next grapheme cluster boundary, and | ||
/// is not on a Unicode scalar boundary, the partial scalar is dropped. This | ||
/// can result in a `nil` return or a character that includes only part of | ||
/// the `self[pos]` character. | ||
/// | ||
/// - Parameters: | ||
/// - pos: The position to load a character from. | ||
/// - end: The limit for the character at `pos`. | ||
/// - Returns: The character at `pos`, bounded by `end`, if it exists, along | ||
/// with the upper bound of that character. The upper bound is always | ||
/// scalar-aligned. | ||
func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? { | ||
// FIXME: Sink into the stdlib to avoid multiple boundary calculations | ||
guard pos < end else { return nil } | ||
let next = index(after: pos) | ||
if next <= end { | ||
return (self[pos], next) | ||
} | ||
|
||
// `end` must be a sub-character position that is between `pos` and the | ||
// next grapheme boundary. This is okay if `end` is on a Unicode scalar | ||
// boundary, but if it's in the middle of a scalar's code units, there | ||
// may not be a character to return at all after rounding down. Use | ||
// `Substring`'s rounding to determine what we can return. | ||
let substr = self[pos..<end] | ||
return substr.isEmpty | ||
? nil | ||
: (substr.first!, substr.endIndex) | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you run the benchmark suite over this approach vs just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Post-merge performance tweak. We should hand-outline the rare and slow path, no need to pollute the i-cache and confuse the optimizer. |
||
|
||
func matchAnyNonNewline( | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isScalarSemantics: Bool | ||
) -> String.Index? { | ||
guard currentPosition < endIndex else { | ||
return nil | ||
} | ||
guard currentPosition < end else { return nil } | ||
if case .definite(let result) = _quickMatchAnyNonNewline( | ||
at: currentPosition, | ||
limitedBy: end, | ||
isScalarSemantics: isScalarSemantics | ||
) { | ||
assert(result == _thoroughMatchAnyNonNewline( | ||
at: currentPosition, | ||
limitedBy: end, | ||
isScalarSemantics: isScalarSemantics)) | ||
return result | ||
} | ||
return _thoroughMatchAnyNonNewline( | ||
at: currentPosition, | ||
limitedBy: end, | ||
isScalarSemantics: isScalarSemantics) | ||
} | ||
|
||
@inline(__always) | ||
private func _quickMatchAnyNonNewline( | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isScalarSemantics: Bool | ||
) -> QuickResult<String.Index?> { | ||
assert(currentPosition < endIndex) | ||
assert(currentPosition < end) | ||
guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( | ||
at: currentPosition | ||
at: currentPosition, limitedBy: end | ||
) else { | ||
return .unknown | ||
} | ||
|
@@ -167,46 +208,47 @@ extension String { | |
@inline(never) | ||
private func _thoroughMatchAnyNonNewline( | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isScalarSemantics: Bool | ||
) -> String.Index? { | ||
assert(currentPosition < endIndex) | ||
if isScalarSemantics { | ||
guard currentPosition < end else { return nil } | ||
let scalar = unicodeScalars[currentPosition] | ||
guard !scalar.isNewline else { return nil } | ||
return unicodeScalars.index(after: currentPosition) | ||
} | ||
|
||
let char = self[currentPosition] | ||
guard !char.isNewline else { return nil } | ||
return index(after: currentPosition) | ||
guard let (char, next) = characterAndEnd(at: currentPosition, limitedBy: end), | ||
!char.isNewline | ||
else { return nil } | ||
return next | ||
} | ||
} | ||
|
||
// MARK: - Built-in character class matching | ||
extension String { | ||
// TODO: Should the below have a `limitedBy` parameter? | ||
|
||
// Mentioned in ProgrammersManual.md, update docs if redesigned | ||
func matchBuiltinCC( | ||
_ cc: _CharacterClassModel.Representation, | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isInverted: Bool, | ||
isStrictASCII: Bool, | ||
isScalarSemantics: Bool | ||
) -> String.Index? { | ||
guard currentPosition < endIndex else { | ||
return nil | ||
} | ||
guard currentPosition < end else { return nil } | ||
if case .definite(let result) = _quickMatchBuiltinCC( | ||
cc, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isInverted: isInverted, | ||
isStrictASCII: isStrictASCII, | ||
isScalarSemantics: isScalarSemantics | ||
) { | ||
assert(result == _thoroughMatchBuiltinCC( | ||
cc, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isInverted: isInverted, | ||
isStrictASCII: isStrictASCII, | ||
isScalarSemantics: isScalarSemantics)) | ||
|
@@ -215,6 +257,7 @@ extension String { | |
return _thoroughMatchBuiltinCC( | ||
cc, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isInverted: isInverted, | ||
isStrictASCII: isStrictASCII, | ||
isScalarSemantics: isScalarSemantics) | ||
|
@@ -225,13 +268,17 @@ extension String { | |
private func _quickMatchBuiltinCC( | ||
_ cc: _CharacterClassModel.Representation, | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isInverted: Bool, | ||
isStrictASCII: Bool, | ||
isScalarSemantics: Bool | ||
) -> QuickResult<String.Index?> { | ||
assert(currentPosition < endIndex) | ||
assert(currentPosition < end) | ||
guard let (next, result) = _quickMatch( | ||
cc, at: currentPosition, isScalarSemantics: isScalarSemantics | ||
cc, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isScalarSemantics: isScalarSemantics | ||
) else { | ||
return .unknown | ||
} | ||
|
@@ -243,27 +290,25 @@ extension String { | |
private func _thoroughMatchBuiltinCC( | ||
_ cc: _CharacterClassModel.Representation, | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isInverted: Bool, | ||
isStrictASCII: Bool, | ||
isScalarSemantics: Bool | ||
) -> String.Index? { | ||
assert(currentPosition < endIndex) | ||
let char = self[currentPosition] | ||
// TODO: Branch here on scalar semantics | ||
// Don't want to pay character cost if unnecessary | ||
guard var (char, next) = | ||
characterAndEnd(at: currentPosition, limitedBy: end) | ||
else { return nil } | ||
let scalar = unicodeScalars[currentPosition] | ||
|
||
let asciiCheck = !isStrictASCII | ||
|| (scalar.isASCII && isScalarSemantics) | ||
|| char.isASCII | ||
|
||
var matched: Bool | ||
var next: String.Index | ||
switch (isScalarSemantics, cc) { | ||
case (_, .anyGrapheme): | ||
next = index(after: currentPosition) | ||
case (true, _): | ||
if isScalarSemantics && cc != .anyGrapheme { | ||
next = unicodeScalars.index(after: currentPosition) | ||
case (false, _): | ||
next = index(after: currentPosition) | ||
} | ||
|
||
switch cc { | ||
|
@@ -291,7 +336,7 @@ extension String { | |
if isScalarSemantics { | ||
matched = scalar.isNewline && asciiCheck | ||
if matched && scalar == "\r" | ||
&& next != endIndex && unicodeScalars[next] == "\n" { | ||
&& next < end && unicodeScalars[next] == "\n" { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. E.g. here is a place where we implicitly rely on |
||
// Match a full CR-LF sequence even in scalar semantics | ||
unicodeScalars.formIndex(after: &next) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,18 +17,18 @@ extension Processor { | |
boundaryCheck: !isScalarSemantics, | ||
isCaseInsensitive: false) | ||
case .builtin: | ||
// FIXME: bounds check? endIndex or end? | ||
guard currentPosition < end else { return nil } | ||
|
||
// We only emit .quantify if it consumes a single character | ||
return input.matchBuiltinCC( | ||
payload.builtin, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isInverted: payload.builtinIsInverted, | ||
isStrictASCII: payload.builtinIsStrict, | ||
isScalarSemantics: isScalarSemantics) | ||
case .any: | ||
// FIXME: endIndex or end? | ||
guard currentPosition < input.endIndex else { return nil } | ||
guard currentPosition < end else { return nil } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Were you able to write a test case for this one? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is hit by the existing "match any" test cases when you pass a substring instead of a string. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we hoist this bounds check to the top of the method, or else leave it as a concern for the String methods? Either way, we want a consistent place and the body of the switch would ideally just be doing dispatch. |
||
|
||
if payload.anyMatchesNewline { | ||
if isScalarSemantics { | ||
|
@@ -38,7 +38,9 @@ extension Processor { | |
} | ||
|
||
return input.matchAnyNonNewline( | ||
at: currentPosition, isScalarSemantics: isScalarSemantics) | ||
at: currentPosition, | ||
limitedBy: end, | ||
isScalarSemantics: isScalarSemantics) | ||
} | ||
} | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.