-
Notifications
You must be signed in to change notification settings - Fork 49
Remove most consumer functions #660
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
60b4d78
ded0d1b
b7db5e5
4b382cc
99591ef
55340fc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -256,9 +256,11 @@ fileprivate extension Compiler.ByteCodeGen { | |
} | ||
} | ||
|
||
mutating func emitAlternation( | ||
_ children: [DSLTree.Node] | ||
) throws { | ||
mutating func emitAlternationGen<C: BidirectionalCollection>( | ||
_ elements: C, | ||
withBacktracking: Bool, | ||
_ body: (inout Compiler.ByteCodeGen, C.Element) throws -> Void | ||
) rethrows { | ||
// Alternation: p0 | p1 | ... | pn | ||
// save next_p1 | ||
// <code for p0> | ||
|
@@ -276,16 +278,27 @@ fileprivate extension Compiler.ByteCodeGen { | |
// <code for pn> | ||
// done: | ||
let done = builder.makeAddress() | ||
for component in children.dropLast() { | ||
for element in elements.dropLast() { | ||
let next = builder.makeAddress() | ||
builder.buildSave(next) | ||
try emitNode(component) | ||
try body(&self, element) | ||
if !withBacktracking { | ||
builder.buildClear() | ||
} | ||
builder.buildBranch(to: done) | ||
builder.label(next) | ||
} | ||
try emitNode(children.last!) | ||
try body(&self, elements.last!) | ||
builder.label(done) | ||
} | ||
|
||
/// Emits code for an alternation over `children`. | ||
/// | ||
/// This is a thin wrapper that forwards to `emitAlternationGen` with | ||
/// `withBacktracking: true`, emitting each branch with `emitNode`. | ||
/// | ||
/// - Parameter children: The branches of the alternation, tried in order. | ||
///   `emitAlternationGen` force-unwraps the last element, so this must not | ||
///   be empty. | ||
/// - Throws: Any error thrown by `emitNode` for one of the branches. | ||
mutating func emitAlternation( | ||
_ children: [DSLTree.Node] | ||
) throws { | ||
try emitAlternationGen(children, withBacktracking: true) { | ||
try $0.emitNode($1) | ||
} | ||
} | ||
|
||
mutating func emitConcatenationComponent( | ||
_ node: DSLTree.Node | ||
|
@@ -872,19 +885,187 @@ fileprivate extension Compiler.ByteCodeGen { | |
} | ||
} | ||
|
||
/// Flatten quoted strings into sequences of atoms, so that the standard | ||
/// CCC codegen will handle them. | ||
/// | ||
/// `.char` atoms, `.scalar` atoms, and the characters of every | ||
/// `.quotedLiteral` are collected into sets, which also removes duplicates. | ||
/// All other members are kept as-is, in their original relative order. | ||
/// | ||
/// - Parameter members: The members of a custom character class. | ||
/// - Returns: The untouched members first, followed by one `.atom(.char)` | ||
///   per distinct character and then one `.atom(.scalar)` per distinct | ||
///   scalar. Because the characters and scalars come out of a `Set`, their | ||
///   relative order in the result is unspecified. | ||
func flatteningCustomCharacterClassMembers( | ||
_ members: [DSLTree.CustomCharacterClass.Member] | ||
) -> [DSLTree.CustomCharacterClass.Member] { | ||
var characters: Set<Character> = [] | ||
var scalars: Set<UnicodeScalar> = [] | ||
var result: [DSLTree.CustomCharacterClass.Member] = [] | ||
for member in members { | ||
switch member { | ||
case .atom(let atom): | ||
switch atom { | ||
case let .char(char): | ||
characters.insert(char) | ||
case let .scalar(scalar): | ||
scalars.insert(scalar) | ||
default: | ||
// Any other kind of atom passes through unchanged. | ||
result.append(member) | ||
} | ||
case let .quotedLiteral(str): | ||
// A quoted literal contributes each of its characters individually. | ||
characters.formUnion(str) | ||
default: | ||
result.append(member) | ||
} | ||
} | ||
// Re-wrap the collected characters and scalars as atom members. | ||
result.append(contentsOf: characters.map { .atom(.char($0)) }) | ||
result.append(contentsOf: scalars.map { .atom(.scalar($0)) }) | ||
return result | ||
} | ||
|
||
func coalescingCustomCharacterClass( | ||
_ ccc: DSLTree.CustomCharacterClass | ||
) -> DSLTree.CustomCharacterClass { | ||
// This only needs to be done in grapheme semantic mode. In scalar semantic | ||
// mode, we don't want to coalesce any scalars into a grapheme. This | ||
// means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and | ||
// U+302. | ||
guard options.semanticLevel == .graphemeCluster else { return ccc } | ||
|
||
let members = coalescingCustomCharacterClassMembers(ccc.members) | ||
return .init(members: members, isInverted: ccc.isInverted) | ||
let members = options.semanticLevel == .graphemeCluster | ||
? coalescingCustomCharacterClassMembers(ccc.members) | ||
: ccc.members | ||
return .init( | ||
members: flatteningCustomCharacterClassMembers(members), | ||
isInverted: ccc.isInverted) | ||
} | ||
|
||
/// Emits code that matches `c` as a member of a custom character class, | ||
/// according to the current semantic level. | ||
/// | ||
/// - In `.graphemeCluster` mode the character is emitted with | ||
///   `emitCharacter`. | ||
/// - In `.unicodeScalar` mode an alternation over the scalars of `c` is | ||
///   emitted via `emitAlternationGen` with `withBacktracking: false`, each | ||
///   branch matching one scalar with `emitMatchScalar`. | ||
/// | ||
/// - Parameter c: The character to match. | ||
mutating func emitCharacterInCCC(_ c: Character) { | ||
switch options.semanticLevel { | ||
case .graphemeCluster: | ||
emitCharacter(c) | ||
case .unicodeScalar: | ||
// When in scalar mode, act like an alternation of the individual scalars | ||
// that comprise a character. | ||
emitAlternationGen(c.unicodeScalars, withBacktracking: false) { | ||
$0.emitMatchScalar($1) | ||
} | ||
} | ||
} | ||
|
||
/// Emits code for a single member of a custom character class. | ||
/// | ||
/// - `.atom`: `.char` and `.scalar` atoms go through `emitCharacterInCCC` | ||
///   (a scalar is first wrapped in a `Character`); any other atom is | ||
///   emitted with `emitAtom`. | ||
/// - `.custom`: recurses through `emitCustomCharacterClass`. | ||
/// - `.quotedLiteral`: traps with `fatalError`; the message indicates such | ||
///   members are expected to have been removed by | ||
///   `flatteningCustomCharacterClassMembers` before this point. | ||
/// - `.range`: emitted as a consumer obtained from | ||
///   `member.generateConsumer(options)`. | ||
/// - `.trivia`: emits nothing. | ||
/// - `.intersection`, `.subtraction`, `.symmetricDifference`: emitted | ||
///   inline using position registers and save points, as described in the | ||
///   comments on each case below. | ||
/// | ||
/// - Parameter member: The member to emit code for. | ||
/// - Throws: Any error thrown by `emitAtom`, `emitCustomCharacterClass`, | ||
///   or `generateConsumer`. | ||
mutating func emitCCCMember( | ||
_ member: DSLTree.CustomCharacterClass.Member | ||
) throws { | ||
switch member { | ||
case .atom(let atom): | ||
switch atom { | ||
case .char(let c): | ||
emitCharacterInCCC(c) | ||
case .scalar(let s): | ||
emitCharacterInCCC(Character(s)) | ||
default: | ||
try emitAtom(atom) | ||
} | ||
case .custom(let ccc): | ||
try emitCustomCharacterClass(ccc) | ||
case .quotedLiteral: | ||
fatalError("Removed in 'flatteningCustomCharacterClassMembers'") | ||
case .range: | ||
let consumer = try member.generateConsumer(options) | ||
builder.buildConsume(by: consumer) | ||
case .trivia: | ||
return | ||
|
||
// TODO: Can we decide when it's better to try `rhs` first? | ||
// Intersection is trivial, since failure on either side propagates: | ||
// - store current position | ||
// - lhs | ||
// - restore current position | ||
// - rhs | ||
case let .intersection(lhs, rhs): | ||
let r = builder.makePositionRegister() | ||
builder.buildMoveCurrentPosition(into: r) | ||
try emitCustomCharacterClass(lhs) | ||
builder.buildRestorePosition(from: r) | ||
try emitCustomCharacterClass(rhs) | ||
|
||
// TODO: Can we decide when it's better to try `rhs` first? | ||
// For subtraction, failure in `lhs` propagates, while failure in `rhs` is | ||
// swallowed/reversed: | ||
// - store current position | ||
// - lhs | ||
// - save to end | ||
// - restore current position | ||
// - rhs | ||
// - clear, fail (since both succeeded) | ||
// - end: ... | ||
case let .subtraction(lhs, rhs): | ||
let r = builder.makePositionRegister() | ||
let end = builder.makeAddress() | ||
builder.buildMoveCurrentPosition(into: r) | ||
try emitCustomCharacterClass(lhs) // no match here = failure, propagates | ||
builder.buildSave(end) | ||
builder.buildRestorePosition(from: r) | ||
try emitCustomCharacterClass(rhs) // no match here = success, resumes at 'end' | ||
builder.buildClear() // clears 'end' | ||
builder.buildFail() // this failure propagates outward | ||
builder.label(end) | ||
|
||
// Symmetric difference always requires executing both `rhs` and `lhs`. | ||
// Execute each, ignoring failure and storing the resulting position in a | ||
// register. If those results are equal, fail. If they're different, use | ||
// the position that is different from the starting position: | ||
// - store current position as r0 | ||
// - save to lhsFail | ||
// - lhs | ||
// - clear lhsFail (and continue) | ||
// - lhsFail: save position as r1 | ||
// | ||
// - restore current position | ||
// - save to rhsFail | ||
// - rhs | ||
// - clear rhsFail (and continue) | ||
// - rhsFail: save position as r2 | ||
// | ||
// - restore to resulting position from lhs (r1) | ||
// - if equal to r2, goto fail (both sides had same result) | ||
// - if equal to r0, goto advance (lhs failed) | ||
// - goto end | ||
// - advance: restore to resulting position from rhs (r2) | ||
// - goto end | ||
// - fail: fail | ||
// - end: ... | ||
case let .symmetricDifference(lhs, rhs): | ||
let r0 = builder.makePositionRegister() | ||
let r1 = builder.makePositionRegister() | ||
let r2 = builder.makePositionRegister() | ||
let lhsFail = builder.makeAddress() | ||
let rhsFail = builder.makeAddress() | ||
let advance = builder.makeAddress() | ||
let fail = builder.makeAddress() | ||
let end = builder.makeAddress() | ||
|
||
builder.buildMoveCurrentPosition(into: r0) | ||
builder.buildSave(lhsFail) | ||
try emitCustomCharacterClass(lhs) | ||
builder.buildClear() | ||
builder.label(lhsFail) | ||
builder.buildMoveCurrentPosition(into: r1) | ||
|
||
builder.buildRestorePosition(from: r0) | ||
builder.buildSave(rhsFail) | ||
try emitCustomCharacterClass(rhs) | ||
builder.buildClear() | ||
builder.label(rhsFail) | ||
builder.buildMoveCurrentPosition(into: r2) | ||
|
||
// If r1 == r2, then fail | ||
builder.buildRestorePosition(from: r1) | ||
builder.buildCondBranch(to: fail, ifSamePositionAs: r2) | ||
|
||
// If r1 == r0, then move to r2 before ending | ||
builder.buildCondBranch(to: advance, ifSamePositionAs: r0) | ||
builder.buildBranch(to: end) | ||
builder.label(advance) | ||
builder.buildRestorePosition(from: r2) | ||
builder.buildBranch(to: end) | ||
|
||
builder.label(fail) | ||
builder.buildFail() | ||
builder.label(end) | ||
} | ||
} | ||
|
||
mutating func emitCustomCharacterClass( | ||
_ ccc: DSLTree.CustomCharacterClass | ||
) throws { | ||
|
@@ -902,8 +1083,67 @@ fileprivate extension Compiler.ByteCodeGen { | |
} | ||
return | ||
} | ||
let consumer = try ccc.generateConsumer(options) | ||
builder.buildConsume(by: consumer) | ||
|
||
let updatedCCC: DSLTree.CustomCharacterClass | ||
if optimizationsEnabled { | ||
updatedCCC = ccc.coalescingASCIIMembers(options) | ||
} else { | ||
updatedCCC = ccc | ||
} | ||
let filteredMembers = updatedCCC.members.filter({!$0.isOnlyTrivia}) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Quick note: I think it makes sense for this to be a single operation that recursively traverses and produces a code-gen-relevant representation. That would also help with testing, since we could check that the flattened output is properly associated and has the members we expect. It would also help separate linearizing the set structure into the instruction stream from removing the consumer functions themselves. E.g.
|
||
|
||
if updatedCCC.isInverted { | ||
// inverted | ||
// custom character class: p0 | p1 | ... | pn | ||
// Try each member to make sure they all fail | ||
// save next_p1 | ||
// <code for p0> | ||
// clear, fail | ||
// next_p1: | ||
// save next_p2 | ||
// <code for p1> | ||
// clear fail | ||
// next_p2: | ||
// save next_p... | ||
// <code for p2> | ||
// clear fail | ||
// ... | ||
// next_pn: | ||
// save done | ||
// <code for pn> | ||
// clear fail | ||
// done: | ||
// step forward by 1 | ||
let done = builder.makeAddress() | ||
for member in filteredMembers.dropLast() { | ||
let next = builder.makeAddress() | ||
builder.buildSave(next) | ||
try emitCCCMember(member) | ||
builder.buildClear() | ||
builder.buildFail() | ||
builder.label(next) | ||
} | ||
builder.buildSave(done) | ||
try emitCCCMember(filteredMembers.last!) | ||
builder.buildClear() | ||
builder.buildFail() | ||
builder.label(done) | ||
Comment on lines
+1117
to
+1130
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could add an |
||
|
||
// Consume a single unit for the inverted ccc | ||
switch options.semanticLevel { | ||
case .graphemeCluster: | ||
builder.buildAdvance(1) | ||
case .unicodeScalar: | ||
builder.buildAdvanceUnicodeScalar(1) | ||
} | ||
return | ||
} | ||
// non inverted CCC | ||
// Custom character class: p0 | p1 | ... | pn | ||
// Very similar to alternation, but we don't keep backtracking save points | ||
try emitAlternationGen(filteredMembers, withBacktracking: false) { | ||
try $0.emitCCCMember($1) | ||
} | ||
} | ||
|
||
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { | ||
|
@@ -1040,6 +1280,12 @@ fileprivate extension Compiler.ByteCodeGen { | |
} | ||
|
||
extension DSLTree.Node { | ||
/// A Boolean value indicating whether this node advances the match position | ||
/// on a successful match. | ||
/// | ||
/// For example, an alternation like `(a|b|c)` always advances the position | ||
/// by a character, but `(a|b|)` has an empty branch, which matches without | ||
/// advancing. | ||
var guaranteesForwardProgress: Bool { | ||
switch self { | ||
case .orderedChoice(let children): | ||
|
@@ -1070,12 +1316,34 @@ extension DSLTree.Node { | |
case .consumer, .matcher: | ||
// Allow zero width consumers and matchers | ||
return false | ||
case .customCharacterClass: | ||
return true | ||
case .customCharacterClass(let ccc): | ||
return ccc.guaranteesForwardProgress | ||
case .quantification(let amount, _, let child): | ||
let (atLeast, _) = amount.ast.bounds | ||
return atLeast ?? 0 > 0 && child.guaranteesForwardProgress | ||
default: return false | ||
} | ||
} | ||
} | ||
|
||
extension DSLTree.CustomCharacterClass { | ||
/// A Boolean value indicating whether this custom character class is | ||
/// treated as advancing the match position on a successful match. | ||
/// | ||
/// We allow trivia into CustomCharacterClass, which could result in a CCC | ||
/// that matches nothing, i.e. `(?x)[ ]`. | ||
/// | ||
/// The answer is decided by the first non-trivia member: every case other | ||
/// than `.trivia` returns immediately, so later members are never | ||
/// inspected. If there are no members, or all of them are trivia, the | ||
/// result is `false`. | ||
var guaranteesForwardProgress: Bool { | ||
for m in members { | ||
switch m { | ||
case .trivia: | ||
continue | ||
case let .intersection(lhs, rhs): | ||
return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress | ||
case let .subtraction(lhs, _): | ||
// Only `lhs` is consulted here; `rhs` is ignored. | ||
return lhs.guaranteesForwardProgress | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if the LHS's forward progress member is subtracted by the RHS? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I actually haven't been able to get a trivia-only CCC to parse. Do you have a way of constructing one? If not, CCCs might just always guarantee forward progress. Note that this is "guarantees forward progress" only upon matching, so even if we continue to allow empty CCCs (like |
||
case let .symmetricDifference(lhs, rhs): | ||
return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similarly, what if the symmetric difference only contains trivia? |
||
default: | ||
return true | ||
} | ||
} | ||
// No non-trivia member was found. | ||
return false | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just to double check, do we need to preserve ordering?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to preserve this ordering — at this level, everything is a union.