-
Notifications
You must be signed in to change notification settings - Fork 49
Use a bitset for ascii-only character classes #511
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 34 commits
5fd8840
03fe8d6
5667705
bde259b
bf95e81
243ec7b
eeb0852
49efd67
b3a61a7
926d208
752ea76
7327e74
7a900b6
50e8e6d
3c7f62c
b71b177
e2a011c
f7900e5
e9d1902
cf59091
f4019d4
ed82cb0
cc1ac9d
ccf6ade
7b83e0c
5121076
3607b65
291a974
ddcf40f
f87b325
2d8ac2d
fd66693
22c8213
0781b93
ffff944
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,19 @@ | |
extension Compiler { | ||
struct ByteCodeGen { | ||
var options: MatchingOptions | ||
private let compileOptions: CompileOptions | ||
var builder = MEProgram.Builder() | ||
/// A Boolean indicating whether the first matchable atom has been emitted. | ||
/// This is used to determine whether to apply initial options. | ||
var hasEmittedFirstMatchableAtom = false | ||
|
||
init(options: MatchingOptions, captureList: CaptureList) { | ||
init( | ||
options: MatchingOptions, | ||
compileOptions: CompileOptions, | ||
captureList: CaptureList | ||
) { | ||
self.options = options | ||
self.compileOptions = compileOptions | ||
self.builder.captureList = captureList | ||
} | ||
} | ||
|
@@ -643,8 +649,16 @@ fileprivate extension Compiler.ByteCodeGen { | |
mutating func emitCustomCharacterClass( | ||
_ ccc: DSLTree.CustomCharacterClass | ||
) throws { | ||
let consumer = try ccc.generateConsumer(options) | ||
builder.buildConsume(by: consumer) | ||
if let asciiBitset = ccc.asAsciiBitset(options), | ||
options.semanticLevel == .graphemeCluster, | ||
!compileOptions.contains(.unoptimized) { | ||
// future work: add a bit to .matchBitset to consume either a character | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you make sure we do this soon? I want to have as much of a unified performance story between grapheme semantic and scalar semantic as possible. Ideally a lot of perf analysis will be downgrading grapheme to scalar operations as permitted. Having two different paths also complicates testing, as now many tests that were exhaustively testing the engine are now only testing one path in the engine. We'll need to meet to discuss testing and validation as we add special-case optimizations. |
||
// or a scalar so we can have this optimization in scalar mode | ||
builder.buildMatchAsciiBitset(asciiBitset) | ||
} else { | ||
let consumer = try ccc.generateConsumer(options) | ||
builder.buildConsume(by: consumer) | ||
} | ||
} | ||
|
||
@discardableResult | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ class Compiler { | |
|
||
// TODO: Or are these stored on the tree? | ||
var options = MatchingOptions() | ||
private var compileOptions: CompileOptions = .default | ||
|
||
init(ast: AST) { | ||
self.tree = ast.dslTree | ||
|
@@ -25,23 +26,22 @@ class Compiler { | |
self.tree = tree | ||
} | ||
|
||
init(tree: DSLTree, compileOptions: CompileOptions) { | ||
self.tree = tree | ||
self.compileOptions = compileOptions | ||
} | ||
|
||
__consuming func emit() throws -> MEProgram { | ||
// TODO: Handle global options | ||
var codegen = ByteCodeGen( | ||
options: options, captureList: tree.captureList | ||
) | ||
options: options, | ||
compileOptions: | ||
compileOptions, | ||
captureList: tree.captureList) | ||
return try codegen.emitRoot(tree.root) | ||
} | ||
} | ||
|
||
func _compileRegex( | ||
_ regex: String, _ syntax: SyntaxOptions = .traditional | ||
) throws -> Executor { | ||
let ast = try parse(regex, .semantic, syntax) | ||
let program = try Compiler(ast: ast).emit() | ||
return Executor(program: program) | ||
} | ||
|
||
// An error produced when compiling a regular expression. | ||
enum RegexCompilationError: Error, CustomStringConvertible { | ||
// TODO: Source location? | ||
|
@@ -54,3 +54,35 @@ enum RegexCompilationError: Error, CustomStringConvertible { | |
} | ||
} | ||
} | ||
|
||
// Testing support | ||
@available(SwiftStdlib 5.7, *) | ||
func _compileRegex( | ||
_ regex: String, | ||
_ syntax: SyntaxOptions = .traditional, | ||
_ semanticLevel: RegexSemanticLevel? = nil | ||
) throws -> Executor { | ||
let ast = try parse(regex, .semantic, syntax) | ||
let dsl: DSLTree | ||
|
||
switch semanticLevel?.base { | ||
case .graphemeCluster: | ||
let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)]) | ||
dsl = DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Future: we'll want a DSLTree node for changing options that's not via groups |
||
case .unicodeScalar: | ||
let sequence = AST.MatchingOptionSequence(adding: [.init(.unicodeScalarSemantics, location: .fake)]) | ||
dsl = DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root)) | ||
case .none: | ||
dsl = ast.dslTree | ||
} | ||
let program = try Compiler(tree: dsl).emit() | ||
return Executor(program: program) | ||
} | ||
|
||
extension Compiler { | ||
struct CompileOptions: OptionSet { | ||
let rawValue: Int | ||
static let unoptimized = CompileOptions(rawValue: 1) | ||
rctcwyvrn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
static let `default`: CompileOptions = [] | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,7 +51,26 @@ extension DSLTree.Node { | |
} | ||
} | ||
|
||
extension DSLTree._AST.Atom { | ||
var singleScalarASCIIValue: UInt8? { | ||
return ast.singleScalarASCIIValue | ||
} | ||
} | ||
|
||
extension DSLTree.Atom { | ||
var singleScalarASCIIValue: UInt8? { | ||
switch self { | ||
case let .char(c) where c != "\r\n": | ||
return c.asciiValue | ||
case let .scalar(s) where s.isASCII: | ||
return UInt8(ascii: s) | ||
case let .unconverted(atom): | ||
return atom.singleScalarASCIIValue | ||
default: | ||
return nil | ||
} | ||
} | ||
|
||
rctcwyvrn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// TODO: If ByteCodeGen switches first, then this is unnecessary for | ||
// top-level nodes, but it's also invoked for `.atom` members of a custom CC | ||
func generateConsumer( | ||
|
@@ -61,17 +80,32 @@ extension DSLTree.Atom { | |
|
||
switch self { | ||
case let .char(c): | ||
// TODO: Match level? | ||
return { input, bounds in | ||
let low = bounds.lowerBound | ||
if isCaseInsensitive && c.isCased { | ||
return input[low].lowercased() == c.lowercased() | ||
? input.index(after: low) | ||
: nil | ||
} else { | ||
return input[low] == c | ||
? input.index(after: low) | ||
: nil | ||
if opts.semanticLevel == .graphemeCluster { | ||
return { input, bounds in | ||
let low = bounds.lowerBound | ||
if isCaseInsensitive && c.isCased { | ||
return input[low].lowercased() == c.lowercased() | ||
? input.index(after: low) | ||
: nil | ||
} else { | ||
return input[low] == c | ||
? input.index(after: low) | ||
: nil | ||
} | ||
} | ||
} else { | ||
let consumers = c.unicodeScalars.map { s in consumeScalar { | ||
isCaseInsensitive | ||
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping | ||
: $0 == s | ||
}} | ||
return { input, bounds in | ||
for fn in consumers { | ||
if let idx = fn(input, bounds) { | ||
return idx | ||
} | ||
} | ||
return nil | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks ok to me for now. Future: we seriously need to kill this code. The FIXME above states that it's only called for character classes, yet it is emitting atom consumers, so there's some technical debt here. |
||
} | ||
} | ||
case let .scalar(s): | ||
|
@@ -177,7 +211,18 @@ extension AST.Atom { | |
default: return nil | ||
} | ||
} | ||
|
||
|
||
var singleScalarASCIIValue: UInt8? { | ||
switch kind { | ||
case let .char(c) where c != "\r\n": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Future cleanup: something like the below to consolidate logic
|
||
return c.asciiValue | ||
case let .scalar(s) where s.value.isASCII: | ||
return UInt8(ascii: s.value) | ||
default: | ||
return nil | ||
} | ||
} | ||
|
||
func generateConsumer( | ||
_ opts: MatchingOptions | ||
) throws -> MEProgram.ConsumeFunction? { | ||
|
@@ -235,6 +280,34 @@ extension AST.Atom { | |
} | ||
|
||
extension DSLTree.CustomCharacterClass.Member { | ||
func asAsciiBitset( | ||
_ opts: MatchingOptions, | ||
_ isInverted: Bool | ||
) -> DSLTree.CustomCharacterClass.AsciiBitset? { | ||
switch self { | ||
case let .atom(a): | ||
if let val = a.singleScalarASCIIValue { | ||
return DSLTree.CustomCharacterClass.AsciiBitset( | ||
val, | ||
isInverted, | ||
opts.isCaseInsensitive | ||
) | ||
} | ||
case let .range(low, high): | ||
if let lowVal = low.singleScalarASCIIValue, let highVal = high.singleScalarASCIIValue { | ||
return DSLTree.CustomCharacterClass.AsciiBitset( | ||
low: lowVal, | ||
high: highVal, | ||
isInverted: isInverted, | ||
isCaseInsensitive: opts.isCaseInsensitive | ||
) | ||
} | ||
default: | ||
return nil | ||
} | ||
return nil | ||
} | ||
|
||
func generateConsumer( | ||
_ opts: MatchingOptions | ||
) throws -> MEProgram.ConsumeFunction { | ||
|
@@ -342,6 +415,19 @@ extension DSLTree.CustomCharacterClass.Member { | |
} | ||
|
||
extension DSLTree.CustomCharacterClass { | ||
func asAsciiBitset(_ opts: MatchingOptions) -> AsciiBitset? { | ||
return members.reduce( | ||
.init(isInverted: isInverted), | ||
{result, member in | ||
if let next = member.asAsciiBitset(opts, isInverted) { | ||
return result?.union(next) | ||
} else { | ||
return nil | ||
} | ||
} | ||
) | ||
} | ||
|
||
func generateConsumer( | ||
_ opts: MatchingOptions | ||
) throws -> MEProgram.ConsumeFunction { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -226,6 +226,20 @@ extension Processor { | |
} | ||
return true | ||
} | ||
|
||
// If we have a bitset we know that the CharacterClass only matches against | ||
// ascii characters, so check if the current input element is ascii then | ||
// check if it is set in the bitset | ||
mutating func matchBitset( | ||
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset | ||
) -> Bool { | ||
guard let cur = load(), bitset.matches(char: cur) else { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Future work: we can implement this on the UTF-8 view, but we'd have to handle grapheme breaking ourselves. BTW, @natecook1000 what is the model for semantic mode processing around here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you want to try a fast check, IIRC you could have this at the top:
That'd be for measuring or approximating the potential benefit. I think we'd want to have a more consistent series of helper functions surrounding sub-grapheme cluster processing though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When matching with grapheme cluster semantics:
When matching with Unicode scalar semantics:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but what is the model for the engine? The engine isn't querying options on every loop. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So IIUC, this optimization only applies to grapheme-semantic mode right now, which is an unfortunate limitation. Lily, can you make sure to write a test for this somehow? We may need to revise our compilation testing approach. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think another approach is to have a bit in some instructions or payloads (whether that is really a dedicated bit or a virtual bit because we expand opcodes around it) that signals whether it should end in a grapheme break check. That would allow us to have a specialized There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would make sense to do optimizations for scalar mode (new instructions, new processor functions) as a future PR, for now I just made it not generate the bitset when in scalar mode. I also added some support to |
||
signalFailure() | ||
return false | ||
} | ||
_uncheckedForcedConsumeOne() | ||
return true | ||
} | ||
|
||
mutating func signalFailure() { | ||
guard let (pc, pos, stackEnd, capEnds, intRegisters) = | ||
|
@@ -364,6 +378,13 @@ extension Processor { | |
controller.step() | ||
} | ||
|
||
case .matchBitset: | ||
let reg = payload.bitset | ||
let bitset = registers[reg] | ||
if matchBitset(bitset) { | ||
controller.step() | ||
} | ||
|
||
case .consumeBy: | ||
let reg = payload.consumer | ||
guard currentPosition < searchBounds.upperBound, | ||
|
Uh oh!
There was an error while loading. Please reload this page.