Skip to content

Remove most consumer functions #660

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
296 changes: 282 additions & 14 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,11 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

mutating func emitAlternation(
_ children: [DSLTree.Node]
) throws {
mutating func emitAlternationGen<C: BidirectionalCollection>(
_ elements: C,
withBacktracking: Bool,
_ body: (inout Compiler.ByteCodeGen, C.Element) throws -> Void
) rethrows {
// Alternation: p0 | p1 | ... | pn
// save next_p1
// <code for p0>
Expand All @@ -276,16 +278,27 @@ fileprivate extension Compiler.ByteCodeGen {
// <code for pn>
// done:
let done = builder.makeAddress()
for component in children.dropLast() {
for element in elements.dropLast() {
let next = builder.makeAddress()
builder.buildSave(next)
try emitNode(component)
try body(&self, element)
if !withBacktracking {
builder.buildClear()
}
builder.buildBranch(to: done)
builder.label(next)
}
try emitNode(children.last!)
try body(&self, elements.last!)
builder.label(done)
}

/// Emits code for an alternation whose branches are the given DSL nodes.
///
/// This forwards to `emitAlternationGen` with `withBacktracking: true`,
/// emitting every branch through `emitNode`.
///
/// - Parameter children: The alternation's branches, in the order they
///   should be tried.
/// - Throws: Any error thrown by `emitNode` while emitting a branch.
mutating func emitAlternation(
  _ children: [DSLTree.Node]
) throws {
  try emitAlternationGen(children, withBacktracking: true) { generator, child in
    try generator.emitNode(child)
  }
}

mutating func emitConcatenationComponent(
_ node: DSLTree.Node
Expand Down Expand Up @@ -872,19 +885,187 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

/// Flatten quoted strings into sequences of atoms, so that the standard
/// CCC codegen will handle them.
func flatteningCustomCharacterClassMembers(
_ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
var characters: Set<Character> = []
var scalars: Set<UnicodeScalar> = []
var result: [DSLTree.CustomCharacterClass.Member] = []
for member in members {
switch member {
case .atom(let atom):
switch atom {
case let .char(char):
characters.insert(char)
case let .scalar(scalar):
scalars.insert(scalar)
default:
result.append(member)
}
case let .quotedLiteral(str):
characters.formUnion(str)
default:
result.append(member)
}
}
result.append(contentsOf: characters.map { .atom(.char($0)) })
result.append(contentsOf: scalars.map { .atom(.scalar($0)) })
return result
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to double check, do we need to preserve ordering?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to preserve this ordering — at this level, everything is a union.

}

func coalescingCustomCharacterClass(
_ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
// This only needs to be done in grapheme semantic mode. In scalar semantic
// mode, we don't want to coalesce any scalars into a grapheme. This
// means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and
// U+302.
guard options.semanticLevel == .graphemeCluster else { return ccc }

let members = coalescingCustomCharacterClassMembers(ccc.members)
return .init(members: members, isInverted: ccc.isInverted)
let members = options.semanticLevel == .graphemeCluster
? coalescingCustomCharacterClassMembers(ccc.members)
: ccc.members
return .init(
members: flatteningCustomCharacterClassMembers(members),
isInverted: ccc.isInverted)
}

/// Emits matching code for a character that is a member of a custom
/// character class, honoring the current semantic level.
///
/// - Parameter c: The character to match.
mutating func emitCharacterInCCC(_ c: Character) {
  switch options.semanticLevel {
  case .unicodeScalar:
    // Under scalar semantics the character behaves like an alternation of
    // its constituent scalars, emitted without backtracking.
    let scalars = c.unicodeScalars
    emitAlternationGen(scalars, withBacktracking: false) { generator, scalar in
      generator.emitMatchScalar(scalar)
    }
  case .graphemeCluster:
    emitCharacter(c)
  }
}

/// Emits the instructions that match a single member of a custom character
/// class.
///
/// Each kind of member is handled as follows:
/// - `.atom`: characters and scalars go through `emitCharacterInCCC`; every
///   other atom is emitted with `emitAtom`.
/// - `.custom`: the nested class is emitted with `emitCustomCharacterClass`.
/// - `.quotedLiteral`: traps with `fatalError`.
/// - `.range`: a consumer is generated from the member and emitted with
///   `buildConsume(by:)`.
/// - `.trivia`: nothing is emitted.
/// - `.intersection`, `.subtraction`, `.symmetricDifference`: both operands
///   are emitted with `emitCustomCharacterClass` and combined using position
///   registers and save points, as described at each case below.
///
/// - Parameter member: The custom character class member to emit.
/// - Throws: Any error thrown by `emitAtom`, `emitCustomCharacterClass`, or
///   `generateConsumer`.
mutating func emitCCCMember(
_ member: DSLTree.CustomCharacterClass.Member
) throws {
switch member {
case .atom(let atom):
switch atom {
case .char(let c):
emitCharacterInCCC(c)
case .scalar(let s):
// A lone scalar is wrapped in a `Character` so that it shares the
// semantic-level handling in `emitCharacterInCCC`.
emitCharacterInCCC(Character(s))
default:
try emitAtom(atom)
}
case .custom(let ccc):
try emitCustomCharacterClass(ccc)
case .quotedLiteral:
// Per the trap message, quoted literals are expected to have been removed
// by `flatteningCustomCharacterClassMembers` before this point.
fatalError("Removed in 'flatteningCustomCharacterClassMembers'")
case .range:
let consumer = try member.generateConsumer(options)
builder.buildConsume(by: consumer)
case .trivia:
return

// TODO: Can we decide when it's better to try `rhs` first?
// Intersection is trivial, since failure on either side propagates:
// - store current position
// - lhs
// - restore current position
// - rhs
case let .intersection(lhs, rhs):
let r = builder.makePositionRegister()
builder.buildMoveCurrentPosition(into: r)
try emitCustomCharacterClass(lhs)
builder.buildRestorePosition(from: r)
try emitCustomCharacterClass(rhs)

// TODO: Can we decide when it's better to try `rhs` first?
// For subtraction, failure in `lhs` propagates, while failure in `rhs` is
// swallowed/reversed:
// - store current position
// - lhs
// - save to end
// - restore current position
// - rhs
// - clear, fail (since both succeeded)
// - end: ...
case let .subtraction(lhs, rhs):
let r = builder.makePositionRegister()
let end = builder.makeAddress()
builder.buildMoveCurrentPosition(into: r)
try emitCustomCharacterClass(lhs) // no match here = failure, propagates
builder.buildSave(end)
builder.buildRestorePosition(from: r)
try emitCustomCharacterClass(rhs) // no match here = success, resumes at 'end'
builder.buildClear() // clears 'end'
builder.buildFail() // this failure propagates outward
builder.label(end)

// Symmetric difference always requires executing both `rhs` and `lhs`.
// Execute each, ignoring failure and storing the resulting position in a
// register. If those results are equal, fail. If they're different, use
// the position that is different from the starting position:
// - store current position as r0
// - save to lhsFail
// - lhs
// - clear lhsFail (and continue)
// - lhsFail: save position as r1
//
// - restore current position
// - save to rhsFail
// - rhs
// - clear rhsFail (and continue)
// - rhsFail: save position as r2
//
// - restore to resulting position from lhs (r1)
// - if equal to r2, goto fail (both sides had same result)
// - if equal to r0, goto advance (lhs failed)
// - goto end
// - advance: restore to resulting position from rhs (r2)
// - goto end
// - fail: fail
// - end: ...
case let .symmetricDifference(lhs, rhs):
let r0 = builder.makePositionRegister()
let r1 = builder.makePositionRegister()
let r2 = builder.makePositionRegister()
let lhsFail = builder.makeAddress()
let rhsFail = builder.makeAddress()
let advance = builder.makeAddress()
let fail = builder.makeAddress()
let end = builder.makeAddress()

builder.buildMoveCurrentPosition(into: r0)
builder.buildSave(lhsFail)
try emitCustomCharacterClass(lhs)
builder.buildClear()
builder.label(lhsFail)
builder.buildMoveCurrentPosition(into: r1)

builder.buildRestorePosition(from: r0)
builder.buildSave(rhsFail)
try emitCustomCharacterClass(rhs)
builder.buildClear()
builder.label(rhsFail)
builder.buildMoveCurrentPosition(into: r2)

// If r1 == r2, then fail
builder.buildRestorePosition(from: r1)
builder.buildCondBranch(to: fail, ifSamePositionAs: r2)

// If r1 == r0, then move to r2 before ending
builder.buildCondBranch(to: advance, ifSamePositionAs: r0)
builder.buildBranch(to: end)
builder.label(advance)
builder.buildRestorePosition(from: r2)
builder.buildBranch(to: end)

builder.label(fail)
builder.buildFail()
builder.label(end)
}
}

mutating func emitCustomCharacterClass(
_ ccc: DSLTree.CustomCharacterClass
) throws {
Expand All @@ -902,8 +1083,67 @@ fileprivate extension Compiler.ByteCodeGen {
}
return
}
let consumer = try ccc.generateConsumer(options)
builder.buildConsume(by: consumer)

let updatedCCC: DSLTree.CustomCharacterClass
if optimizationsEnabled {
updatedCCC = ccc.coalescingASCIIMembers(options)
} else {
updatedCCC = ccc
}
let filteredMembers = updatedCCC.members.filter({!$0.isOnlyTrivia})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quick note: I think it makes sense for this to be a single operation that will recursively traverse and produce a code-gen-relevant representation. That will also help with testing as we can test that the flattened output is properly associated and has the members we expect. It will also help divide out linearizing the set structure into the instruction stream from removing consumer functions themselves.

E.g.

extension DSLTree.CustomCharacterClass {
  enum CodeGenClassMember {
    case bitset: ASCIIBitset
    case consumerFunction: () -> () ...
    case range: ...
    case character: Character
    case scalar: Unicode.Scalar
  }

  enum Operation {
    case union, intersection, ...
    case endOfOps
  }

  func codeGenRepresentation() -> [(CodeGenClassMember, Operation)]
}


if updatedCCC.isInverted {
// inverted
// custom character class: p0 | p1 | ... | pn
// Try each member to make sure they all fail
// save next_p1
// <code for p0>
// clear, fail
// next_p1:
// save next_p2
// <code for p1>
// clear, fail
// next_p2:
// save next_p...
// <code for p2>
// clear, fail
// ...
// next_pn:
// save done
// <code for pn>
// clear, fail
// done:
// step forward by 1
let done = builder.makeAddress()
for member in filteredMembers.dropLast() {
let next = builder.makeAddress()
builder.buildSave(next)
try emitCCCMember(member)
builder.buildClear()
builder.buildFail()
builder.label(next)
}
builder.buildSave(done)
try emitCCCMember(filteredMembers.last!)
builder.buildClear()
builder.buildFail()
builder.label(done)
Comment on lines +1117 to +1130
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could add an inverted parameter to emitAlternationGen to cover this case as well...


// Consume a single unit for the inverted ccc
switch options.semanticLevel {
case .graphemeCluster:
builder.buildAdvance(1)
case .unicodeScalar:
builder.buildAdvanceUnicodeScalar(1)
}
return
}
// non inverted CCC
// Custom character class: p0 | p1 | ... | pn
// Very similar to alternation, but we don't keep backtracking save points
try emitAlternationGen(filteredMembers, withBacktracking: false) {
try $0.emitCCCMember($1)
}
}

mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
Expand Down Expand Up @@ -1040,6 +1280,12 @@ fileprivate extension Compiler.ByteCodeGen {
}

extension DSLTree.Node {
/// A Boolean value indicating whether this node advances the match position
/// on a successful match.
///
/// For example, an alternation like `(a|b|c)` always advances the position
/// by a character, but `(a|b|)` has an empty branch, which matches without
/// advancing.
var guaranteesForwardProgress: Bool {
switch self {
case .orderedChoice(let children):
Expand Down Expand Up @@ -1070,12 +1316,34 @@ extension DSLTree.Node {
case .consumer, .matcher:
// Allow zero width consumers and matchers
return false
case .customCharacterClass:
return true
case .customCharacterClass(let ccc):
return ccc.guaranteesForwardProgress
case .quantification(let amount, _, let child):
let (atLeast, _) = amount.ast.bounds
return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
default: return false
}
}
}

extension DSLTree.CustomCharacterClass {
/// We allow trivia into CustomCharacterClass, which could result in a CCC
/// that matches nothing, i.e., `(?x)[ ]`.
var guaranteesForwardProgress: Bool {
for m in members {
switch m {
case .trivia:
continue
case let .intersection(lhs, rhs):
return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
case let .subtraction(lhs, _):
return lhs.guaranteesForwardProgress
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if the LHS's forward progress member is subtracted by the RHS?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually haven't been able to get a trivia-only CCC to parse. Do you have a way of constructing one? If not, CCCs might just always guarantee forward progress.

Note that this is "guarantees forward progress" only upon matching, so even if we continue to allow empty CCCs (like [[a]--[a]]), that would just fail to match against any character, not match against a zero length position. Some regex engines (e.g. Rust) prohibit empty character classes, but others just treat them like something that's impossible to match (e.g. ICU/NSRE). Since we're currently doing the second, we could eventually eliminate branches that include these if we recognize them.

case let .symmetricDifference(lhs, rhs):
return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly, what if the symmetric difference only contains trivia?

default:
return true
}
}
return false
}
}
Loading