Skip to content

Implement instructions for matching builtin character classes and assertions #547

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Aug 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3b6b676
Copy over new ascii bitset
rctcwyvrn Jul 5, 2022
33caa79
Add matchBuiltin
rctcwyvrn Jul 5, 2022
139daa5
Remove debug prints
rctcwyvrn Jul 5, 2022
9abf4af
Remove bitset fast path
rctcwyvrn Jul 5, 2022
286f5d8
Fully remove remnants of the bitset fast path
rctcwyvrn Jul 6, 2022
9e915cd
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 7, 2022
e593ddb
Completely replace AssertionFunction with regexAssert(by:)
rctcwyvrn Jul 11, 2022
25dc277
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 12, 2022
3e38ac6
Cleanup
rctcwyvrn Jul 12, 2022
e5d8b4a
Move match builtin and assert + Add AssertionPayload
rctcwyvrn Jul 12, 2022
0466c25
Cleanup assertions
rctcwyvrn Jul 12, 2022
87078ad
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 12, 2022
f401e84
Fix tests
rctcwyvrn Jul 13, 2022
b09f45f
Update opcode description for assertBy
rctcwyvrn Jul 13, 2022
c581ea2
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 14, 2022
2a82231
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 15, 2022
fb1576a
Update branch to match main
rctcwyvrn Jul 15, 2022
3b9485e
Use the newly cleaned up _CharacterClassModel
rctcwyvrn Jul 16, 2022
64d1ed9
Add characterClass DSLTree node
rctcwyvrn Jul 16, 2022
2a6fe3c
Bugfixes
rctcwyvrn Jul 19, 2022
206bfc6
Add documentation for matchBuiltin
rctcwyvrn Jul 21, 2022
b53f524
Lots of cleanup
rctcwyvrn Jul 25, 2022
bb5245f
Move assertion payload
rctcwyvrn Jul 25, 2022
0746847
More minor cleanup
rctcwyvrn Jul 25, 2022
c718543
Perform boundary check for .anyScalar when in grapheme mode
rctcwyvrn Jul 25, 2022
d35b578
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 28, 2022
ca8acf2
Merge branch 'main' into speedy-builtins
rctcwyvrn Aug 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions Sources/RegexBuilder/CharacterClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -15,27 +15,39 @@
@available(SwiftStdlib 5.7, *)
public struct CharacterClass {
internal var ccc: DSLTree.CustomCharacterClass
/// The builtin character class, if this CharacterClass is representable by one
internal var builtin: DSLTree.Atom.CharacterClass?

init(_ ccc: DSLTree.CustomCharacterClass) {
self.ccc = ccc
self.builtin = nil
}

init(unconverted atom: DSLTree._AST.Atom) {
self.ccc = .init(members: [.atom(.unconverted(atom))])
init(builtin: DSLTree.Atom.CharacterClass) {
self.ccc = .init(members: [.atom(.characterClass(builtin))])
self.builtin = builtin
}
}

@available(SwiftStdlib 5.7, *)
extension CharacterClass: RegexComponent {
public var regex: Regex<Substring> {
_RegexFactory().customCharacterClass(ccc)
if let cc = builtin {
return _RegexFactory().characterClass(cc)
} else {
return _RegexFactory().customCharacterClass(ccc)
}
}
}

@available(SwiftStdlib 5.7, *)
extension CharacterClass {
public var inverted: CharacterClass {
CharacterClass(ccc.inverted)
if let inv = builtin?.inverted {
return CharacterClass(builtin: inv)
} else {
return CharacterClass(ccc.inverted)
}
}
}

Expand All @@ -50,15 +62,15 @@ extension RegexComponent where Self == CharacterClass {
}

public static var anyGraphemeCluster: CharacterClass {
.init(unconverted: ._anyGrapheme)
.init(builtin: .anyGrapheme)
}

public static var whitespace: CharacterClass {
.init(unconverted: ._whitespace)
.init(builtin: .whitespace)
}

public static var digit: CharacterClass {
.init(unconverted: ._digit)
.init(builtin: .digit)
}

public static var hexDigit: CharacterClass {
Expand All @@ -70,19 +82,19 @@ extension RegexComponent where Self == CharacterClass {
}

public static var horizontalWhitespace: CharacterClass {
.init(unconverted: ._horizontalWhitespace)
.init(builtin: .horizontalWhitespace)
}

public static var newlineSequence: CharacterClass {
.init(unconverted: ._newlineSequence)
.init(builtin: .newlineSequence)
}

public static var verticalWhitespace: CharacterClass {
.init(unconverted: ._verticalWhitespace)
.init(builtin: .verticalWhitespace)
}

public static var word: CharacterClass {
.init(unconverted: ._word)
.init(builtin: .word)
}
}

Expand Down
156 changes: 18 additions & 138 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ fileprivate extension Compiler.ByteCodeGen {
emitMatchScalar(s)
}

case let .characterClass(cc):
emitCharacterClass(cc)

case let .assertion(kind):
try emitAssertion(kind)

Expand Down Expand Up @@ -148,147 +151,24 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

mutating func emitStartOfLine() {
builder.buildAssert { [semanticLevel = options.semanticLevel]
(_, _, input, pos, subjectBounds) in
if pos == subjectBounds.lowerBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input[input.index(before: pos)].isNewline
case .unicodeScalar:
return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline
}
}
}

mutating func emitEndOfLine() {
builder.buildAssert { [semanticLevel = options.semanticLevel]
(_, _, input, pos, subjectBounds) in
if pos == subjectBounds.upperBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input[pos].isNewline
case .unicodeScalar:
return input.unicodeScalars[pos].isNewline
}
}
}

mutating func emitAssertion(
_ kind: DSLTree.Atom.Assertion
) throws {
// FIXME: Depends on API model we have... We may want to
// think through some of these with API interactions in mind
//
// This might break how we use `bounds` for both slicing
// and things like `firstIndex`, that is `firstIndex` may
// need to supply both a slice bounds and a per-search bounds.
switch kind {
case .startOfSubject:
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.lowerBound
}

case .endOfSubjectBeforeNewline:
builder.buildAssert { [semanticLevel = options.semanticLevel]
(_, _, input, pos, subjectBounds) in
if pos == subjectBounds.upperBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input.index(after: pos) == subjectBounds.upperBound
&& input[pos].isNewline
case .unicodeScalar:
return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound
&& input.unicodeScalars[pos].isNewline
}
}

case .endOfSubject:
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.upperBound
}

case .resetStartOfMatch:
// FIXME: Figure out how to communicate this out
if kind == .resetStartOfMatch {
throw Unsupported(#"\K (reset/keep assertion)"#)

case .firstMatchingPositionInSubject:
// TODO: We can probably build a nice model with API here

// FIXME: This needs to be based on `searchBounds`,
// not the `subjectBounds` given as an argument here
builder.buildAssert { (_, _, input, pos, subjectBounds) in false }

case .textSegment:
builder.buildAssert { (_, _, input, pos, _) in
// FIXME: Grapheme or word based on options
input.isOnGraphemeClusterBoundary(pos)
}

case .notTextSegment:
builder.buildAssert { (_, _, input, pos, _) in
// FIXME: Grapheme or word based on options
!input.isOnGraphemeClusterBoundary(pos)
}

case .startOfLine:
emitStartOfLine()

case .endOfLine:
emitEndOfLine()

case .caretAnchor:
if options.anchorsMatchNewlines {
emitStartOfLine()
} else {
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.lowerBound
}
}

case .dollarAnchor:
if options.anchorsMatchNewlines {
emitEndOfLine()
} else {
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.upperBound
}
}

case .wordBoundary:
builder.buildAssert { [options]
(cache, maxIndex, input, pos, subjectBounds) in
if options.usesSimpleUnicodeBoundaries {
// TODO: How should we handle bounds?
return _CharacterClassModel.word.isBoundary(
input,
at: pos,
bounds: subjectBounds,
with: options
)
} else {
return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
}
}

case .notWordBoundary:
builder.buildAssert { [options]
(cache, maxIndex, input, pos, subjectBounds) in
if options.usesSimpleUnicodeBoundaries {
// TODO: How should we handle bounds?
return !_CharacterClassModel.word.isBoundary(
input,
at: pos,
bounds: subjectBounds,
with: options
)
} else {
return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
}
}
}
builder.buildAssert(
by: kind,
options.anchorsMatchNewlines,
options.usesSimpleUnicodeBoundaries,
options.usesASCIIWord,
options.semanticLevel)
}


mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) {
builder.buildMatchBuiltin(model: cc.asRuntimeModel(options))
}

mutating func emitMatchScalar(_ s: UnicodeScalar) {
assert(options.semanticLevel == .unicodeScalar)
if options.isCaseInsensitive && s.properties.isCased {
Expand Down Expand Up @@ -907,10 +787,10 @@ fileprivate extension Compiler.ByteCodeGen {
} else {
builder.buildMatchAsciiBitset(asciiBitset)
}
} else {
let consumer = try ccc.generateConsumer(options)
builder.buildConsume(by: consumer)
return
}
let consumer = try ccc.generateConsumer(options)
builder.buildConsume(by: consumer)
}

mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
Expand Down
26 changes: 15 additions & 11 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,8 @@ extension DSLTree.Atom {
case .assertion:
// TODO: We could handle, should this be total?
return nil
case .characterClass(let cc):
return cc.generateConsumer(opts)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Future: get rid of all these consumers except custom ones like custom character classes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main reason why all these consumer functions + _CharacterClassModel.matches still exist is because we have to emit any non-ascii CustomCharacterClass as one big consumer so it needs to know how to create a consumer for each possible member type (which includes Atom).

Do you think we could emit it as an ordered choice over its members instead? I think the main thing to add would be instructions for handling the custom character class set operations, and if we had that then we could rip out all of the non character property related stuff in ConsumerInterface

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to avoid emitting backtracking traffic for custom character classes if we can. However, since there's a compilation boundary around each member anyways, that might not account for much. Let's keep this for now and try to come back to it again

Copy link
Contributor Author

@rctcwyvrn rctcwyvrn Jul 25, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another thing to consider is that if we keep non-ascii custom character classes as one big consumer we're unable to access any optimized matching for any of its members.

Also just no longer having all the matching logic exactly duplicated across ConsumerInterface and Processor seems like something pretty important for future extensibility/preventing bugs. We pay some cost for emitting backtracking traffic but this is mostly for uncommon cases, the most common ascii cases are already handled by the bitset.


case .backreference:
// TODO: Should we handle?
Expand All @@ -182,6 +184,15 @@ extension DSLTree.Atom {
}
}

extension DSLTree.Atom.CharacterClass {
func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction {
let model = asRuntimeModel(opts)
return { input, bounds in
model.matches(in: input, at: bounds.lowerBound)
}
}
}

extension String {
/// Compares this string to `other` using the loose matching rule UAX44-LM2,
/// which ignores case, whitespace, underscores, and nearly all medial
Expand Down Expand Up @@ -269,16 +280,6 @@ extension AST.Atom {
func generateConsumer(
_ opts: MatchingOptions
) throws -> MEProgram.ConsumeFunction? {
// TODO: Wean ourselves off of this type...
if let cc = self.characterClass?.withMatchLevel(
opts.matchLevel
) {
return { input, bounds in
// FIXME: should we worry about out of bounds?
cc.matches(in: input, at: bounds.lowerBound, with: opts)
}
}

switch kind {
case let .scalar(s):
assertionFailure(
Expand Down Expand Up @@ -312,8 +313,11 @@ extension AST.Atom {
case .caretAnchor, .dollarAnchor:
// handled in emitAssertion
return nil
case .escaped:
// handled in emitAssertion and emitCharacterClass
return nil

case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta,
case .scalarSequence, .keyboardControl, .keyboardMeta,
.keyboardMetaControl, .backreference, .subpattern, .callout,
.backtrackingDirective, .changeMatchingOptions, .invalid:
// FIXME: implement
Expand Down
Loading