-
Notifications
You must be signed in to change notification settings - Fork 49
Optimize matching to match on scalar values when possible #525
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 15 commits
e518bdf
eb64036
1237f9a
fdf2c23
eeb38e9
6f76f36
3a2b324
df8919e
c2ee8cc
809b085
06c77c7
e3d7ad7
3e1e088
b4c7c8c
5359e31
6e4c2bd
1a359b4
097ffeb
76667a2
6a1f6e9
fce8f9a
0860368
c6bc811
bca1d2b
79e60ac
22cc9d5
7a9923d
47f8e66
7aa98d1
113cfe3
9949b8e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,6 +74,77 @@ fileprivate extension Compiler.ByteCodeGen { | |
} | ||
} | ||
|
||
mutating func emitQuotedLiteral(_ s: String) { | ||
if options.semanticLevel == .graphemeCluster { | ||
if options.isCaseInsensitive { | ||
// future work: if all ascii, emit matchBitset instructions with | ||
// case insensitive bitsets | ||
|
||
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative | ||
builder.buildConsume { input, bounds in | ||
var iterator = s.makeIterator() | ||
var currentIndex = bounds.lowerBound | ||
while let ch = iterator.next() { | ||
guard currentIndex < bounds.upperBound, | ||
ch.lowercased() == input[currentIndex].lowercased() | ||
else { return nil } | ||
input.formIndex(after: ¤tIndex) | ||
} | ||
return currentIndex | ||
} | ||
} else { | ||
rctcwyvrn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if optimizationsEnabled && s.allSatisfy({char in char.isASCII}) { | ||
rctcwyvrn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for char in s.dropLast(1) { | ||
// Note: only cr-lf is multiple scalars | ||
for scalar in char.unicodeScalars { | ||
builder.buildMatchScalar(scalar, boundaryCheck: false) | ||
} | ||
} | ||
let lastChar = s.last! | ||
for scalar in lastChar.unicodeScalars { | ||
// Only boundary check if we are the last scalar in the last character | ||
// to make sure that there isn't a combining scalar after the quoted literal | ||
let boundaryCheck = scalar == lastChar.unicodeScalars.last! | ||
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck) | ||
} | ||
rctcwyvrn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} else { | ||
builder.buildMatchSequence(s) | ||
} | ||
} | ||
} else { | ||
if optimizationsEnabled && !options.isCaseInsensitive { | ||
// Match all scalars exactly, never boundary check because we're in | ||
// unicode scalars mode | ||
for char in s { | ||
for scalar in char.unicodeScalars { | ||
builder.buildMatchScalar(scalar, boundaryCheck: false) | ||
rctcwyvrn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
} else { | ||
builder.buildConsume { | ||
[caseInsensitive = options.isCaseInsensitive] input, bounds in | ||
// TODO: Case folding | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you double check our testing story here? If we're missing something, let's try to add XFAIL tests. |
||
var iterator = s.unicodeScalars.makeIterator() | ||
var currentIndex = bounds.lowerBound | ||
while let scalar = iterator.next() { | ||
guard currentIndex < bounds.upperBound else { return nil } | ||
if caseInsensitive { | ||
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping { | ||
return nil | ||
} | ||
} else { | ||
if scalar != input.unicodeScalars[currentIndex] { | ||
return nil | ||
} | ||
} | ||
input.unicodeScalars.formIndex(after: ¤tIndex) | ||
} | ||
return currentIndex | ||
} | ||
} | ||
} | ||
} | ||
|
||
mutating func emitBackreference( | ||
_ ref: AST.Reference | ||
) throws { | ||
|
@@ -216,6 +287,11 @@ fileprivate extension Compiler.ByteCodeGen { | |
builder.buildConsume(by: consumeScalar { | ||
$0.properties.lowercaseMapping == s.properties.lowercaseMapping | ||
}) | ||
return | ||
} | ||
|
||
if optimizationsEnabled { // should we just do this unconditionally? | ||
rctcwyvrn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
builder.buildMatchScalar(s, boundaryCheck: false) | ||
rctcwyvrn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} else { | ||
builder.buildConsume(by: consumeScalar { | ||
$0 == s | ||
|
@@ -241,9 +317,17 @@ fileprivate extension Compiler.ByteCodeGen { | |
? input.index(after: bounds.lowerBound) | ||
: nil | ||
} | ||
} else { | ||
builder.buildMatch(c) | ||
} | ||
|
||
if optimizationsEnabled && c.isASCII { | ||
for scalar in c.unicodeScalars { | ||
let boundaryCheck = scalar == c.unicodeScalars.last! | ||
rctcwyvrn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another idea for the builder is to have a convenience function that adds a grapheme boundary check at the current position. It could either update the most recent instruction if that instruction is capable of supporting it or insert the anchor otherwise. |
||
} | ||
return | ||
} | ||
|
||
builder.buildMatch(c) | ||
} | ||
|
||
mutating func emitAny() { | ||
|
@@ -652,11 +736,12 @@ fileprivate extension Compiler.ByteCodeGen { | |
_ ccc: DSLTree.CustomCharacterClass | ||
) throws { | ||
if let asciiBitset = ccc.asAsciiBitset(options), | ||
options.semanticLevel == .graphemeCluster, | ||
optimizationsEnabled { | ||
// future work: add a bit to .matchBitset to consume either a character | ||
// or a scalar so we can have this optimization in scalar mode | ||
builder.buildMatchAsciiBitset(asciiBitset) | ||
if options.semanticLevel == .unicodeScalar { | ||
builder.buildScalarMatchAsciiBitset(asciiBitset) | ||
} else { | ||
builder.buildMatchAsciiBitset(asciiBitset) | ||
} | ||
} else { | ||
let consumer = try ccc.generateConsumer(options) | ||
builder.buildConsume(by: consumer) | ||
|
@@ -733,45 +818,7 @@ fileprivate extension Compiler.ByteCodeGen { | |
try emitAtom(a) | ||
|
||
case let .quotedLiteral(s): | ||
if options.semanticLevel == .graphemeCluster { | ||
if options.isCaseInsensitive { | ||
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative | ||
builder.buildConsume { input, bounds in | ||
var iterator = s.makeIterator() | ||
var currentIndex = bounds.lowerBound | ||
while let ch = iterator.next() { | ||
guard currentIndex < bounds.upperBound, | ||
ch.lowercased() == input[currentIndex].lowercased() | ||
else { return nil } | ||
input.formIndex(after: ¤tIndex) | ||
} | ||
return currentIndex | ||
} | ||
} else { | ||
builder.buildMatchSequence(s) | ||
} | ||
} else { | ||
builder.buildConsume { | ||
[caseInsensitive = options.isCaseInsensitive] input, bounds in | ||
// TODO: Case folding | ||
var iterator = s.unicodeScalars.makeIterator() | ||
var currentIndex = bounds.lowerBound | ||
while let scalar = iterator.next() { | ||
guard currentIndex < bounds.upperBound else { return nil } | ||
if caseInsensitive { | ||
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping { | ||
return nil | ||
} | ||
} else { | ||
if scalar != input.unicodeScalars[currentIndex] { | ||
return nil | ||
} | ||
} | ||
input.unicodeScalars.formIndex(after: ¤tIndex) | ||
} | ||
return currentIndex | ||
} | ||
} | ||
emitQuotedLiteral(s) | ||
|
||
case let .regexLiteral(l): | ||
return try emitNode(l.ast.dslTreeNode) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -227,6 +227,30 @@ extension Processor { | |
return true | ||
} | ||
|
||
func loadScalar() -> Unicode.Scalar? { | ||
currentPosition < end ? input.unicodeScalars[currentPosition] : nil | ||
} | ||
|
||
func nextScalarIndex(offsetBy n: Int, boundaryCheck: Bool) -> Input.Index? { | ||
rctcwyvrn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end), | ||
(!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { | ||
return idx | ||
} | ||
return nil | ||
} | ||
|
||
mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { | ||
guard let curScalar = loadScalar(), | ||
curScalar == s, | ||
let idx = nextScalarIndex(offsetBy: 1, boundaryCheck: boundaryCheck) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might make more sense to drop the helper function and just do this directly:
Longer term we will want to use a lower-level String interface that returns both the current scalar and the next index. E.g. a nicer version of https://github.com/apple/swift/blob/main/stdlib/public/core/UnicodeHelpers.swift#L107. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or you could consider adding that now if you like, something like the below and we can micro-optimize it later:
|
||
else { | ||
signalFailure() | ||
return false | ||
} | ||
currentPosition = idx | ||
return true | ||
} | ||
|
||
// If we have a bitset we know that the CharacterClass only matches against | ||
// ascii characters, so check if the current input element is ascii then | ||
// check if it is set in the bitset | ||
|
@@ -241,6 +265,20 @@ extension Processor { | |
return true | ||
} | ||
|
||
// Equivalent of matchBitset but emitted when in unicode scalar semantic mode | ||
mutating func matchBitsetScalar( | ||
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset | ||
) -> Bool { | ||
guard let curScalar = loadScalar(), | ||
bitset.matches(scalar: curScalar), | ||
let idx = nextScalarIndex(offsetBy: 1, boundaryCheck: false) else { | ||
signalFailure() | ||
return false | ||
} | ||
currentPosition = idx | ||
return true | ||
} | ||
|
||
mutating func signalFailure() { | ||
guard let (pc, pos, stackEnd, capEnds, intRegisters) = | ||
savePoints.popLast()?.destructure | ||
|
@@ -378,12 +416,29 @@ extension Processor { | |
controller.step() | ||
} | ||
|
||
case .matchScalar: | ||
let scalar = payload.scalar | ||
if matchScalar(scalar, boundaryCheck: true) { | ||
controller.step() | ||
} | ||
case .matchScalarUnchecked: | ||
let scalar = payload.scalar | ||
if matchScalar(scalar, boundaryCheck: false) { | ||
controller.step() | ||
} | ||
|
||
case .matchBitset: | ||
let reg = payload.bitset | ||
let bitset = registers[reg] | ||
if matchBitset(bitset) { | ||
controller.step() | ||
} | ||
case .matchBitsetScalar: | ||
let reg = payload.bitset | ||
let bitset = registers[reg] | ||
if matchBitsetScalar(bitset) { | ||
controller.step() | ||
} | ||
|
||
case .consumeBy: | ||
let reg = payload.consumer | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Other ideas are to have instructions that say to do case-insensitive matching, that is we have variants of match-character/scalar and similarly for match-sequence.
A special case for ASCII could be a dedicated instruction for the parts of ASCII that are case-sensitive. It could store both scalars to match, or just store the lowercase and the engine would do the arithmetic when matching.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I.e. we'll want to get all of this off of using consumer interfaces at some point, but doesn't have to be this PR.