Skip to content

Implement instructions for matching builtin character classes and assertions #547

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Aug 3, 2022
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3b6b676
Copy over new ascii bitset
rctcwyvrn Jul 5, 2022
33caa79
Add matchBuiltin
rctcwyvrn Jul 5, 2022
139daa5
Remove debug prints
rctcwyvrn Jul 5, 2022
9abf4af
Remove bitset fast path
rctcwyvrn Jul 5, 2022
286f5d8
Fully remove remnants of the bitset fast path
rctcwyvrn Jul 6, 2022
9e915cd
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 7, 2022
e593ddb
Completely replace AssertionFunction with regexAssert(by:)
rctcwyvrn Jul 11, 2022
25dc277
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 12, 2022
3e38ac6
Cleanup
rctcwyvrn Jul 12, 2022
e5d8b4a
Move match builtin and assert + Add AssertionPayload
rctcwyvrn Jul 12, 2022
0466c25
Cleanup assertions
rctcwyvrn Jul 12, 2022
87078ad
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 12, 2022
f401e84
Fix tests
rctcwyvrn Jul 13, 2022
b09f45f
Update opcode description for assertBy
rctcwyvrn Jul 13, 2022
c581ea2
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 14, 2022
2a82231
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 15, 2022
fb1576a
Update branch to match main
rctcwyvrn Jul 15, 2022
3b9485e
Use the newly cleaned up _CharacterClassModel
rctcwyvrn Jul 16, 2022
64d1ed9
Add characterClass DSLTree node
rctcwyvrn Jul 16, 2022
2a6fe3c
Bugfixes
rctcwyvrn Jul 19, 2022
206bfc6
Add documentation for matchBuiltin
rctcwyvrn Jul 21, 2022
b53f524
Lots of cleanup
rctcwyvrn Jul 25, 2022
bb5245f
Move assertion payload
rctcwyvrn Jul 25, 2022
0746847
More minor cleanup
rctcwyvrn Jul 25, 2022
c718543
Perform boundary check for .anyScalar when in grapheme mode
rctcwyvrn Jul 25, 2022
d35b578
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 28, 2022
ca8acf2
Merge branch 'main' into speedy-builtins
rctcwyvrn Aug 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 14 additions & 130 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ fileprivate extension Compiler.ByteCodeGen {
options.apply(optionSequence.ast)

case let .unconverted(astAtom):
if optimizationsEnabled,
let cc = astAtom.ast.characterClass?.builtinCC {
builder.buildMatchBuiltin(
cc,
cc.isStrict(options: options),
isScalar: options.semanticLevel == .unicodeScalar)
return
}
if let consumer = try astAtom.ast.generateConsumer(options) {
builder.buildConsume(by: consumer)
} else {
Expand Down Expand Up @@ -113,136 +121,12 @@ fileprivate extension Compiler.ByteCodeGen {
mutating func emitAssertion(
_ kind: AST.Atom.AssertionKind
) throws {
// FIXME: Depends on API model we have... We may want to
// think through some of these with API interactions in mind
//
// This might break how we use `bounds` for both slicing
// and things like `firstIndex`, that is `firstIndex` may
// need to supply both a slice bounds and a per-search bounds.
switch kind {
case .startOfSubject:
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.lowerBound
}

case .endOfSubjectBeforeNewline:
builder.buildAssert { [semanticLevel = options.semanticLevel]
(_, _, input, pos, subjectBounds) in
if pos == subjectBounds.upperBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input.index(after: pos) == subjectBounds.upperBound
&& input[pos].isNewline
case .unicodeScalar:
return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound
&& input.unicodeScalars[pos].isNewline
}
}

case .endOfSubject:
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.upperBound
}

case .resetStartOfMatch:
// FIXME: Figure out how to communicate this out
throw Unsupported(#"\K (reset/keep assertion)"#)

case .firstMatchingPositionInSubject:
// TODO: We can probably build a nice model with API here

// FIXME: This needs to be based on `searchBounds`,
// not the `subjectBounds` given as an argument here
builder.buildAssert { (_, _, input, pos, subjectBounds) in false }

case .textSegment:
builder.buildAssert { (_, _, input, pos, _) in
// FIXME: Grapheme or word based on options
input.isOnGraphemeClusterBoundary(pos)
}

case .notTextSegment:
builder.buildAssert { (_, _, input, pos, _) in
// FIXME: Grapheme or word based on options
!input.isOnGraphemeClusterBoundary(pos)
}

case .startOfLine:
// FIXME: Anchor.startOfLine must always use this first branch
// The behavior of `^` should depend on `anchorsMatchNewlines`, but
// the DSL-based `.startOfLine` anchor should always match the start
// of a line. Right now we don't distinguish between those anchors.
if options.anchorsMatchNewlines {
builder.buildAssert { [semanticLevel = options.semanticLevel]
(_, _, input, pos, subjectBounds) in
if pos == subjectBounds.lowerBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input[input.index(before: pos)].isNewline
case .unicodeScalar:
return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline
}
}
} else {
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.lowerBound
}
}

case .endOfLine:
// FIXME: Anchor.endOfLine must always use this first branch
// The behavior of `$` should depend on `anchorsMatchNewlines`, but
// the DSL-based `.endOfLine` anchor should always match the end
// of a line. Right now we don't distinguish between those anchors.
if options.anchorsMatchNewlines {
builder.buildAssert { [semanticLevel = options.semanticLevel]
(_, _, input, pos, subjectBounds) in
if pos == subjectBounds.upperBound { return true }
switch semanticLevel {
case .graphemeCluster:
return input[pos].isNewline
case .unicodeScalar:
return input.unicodeScalars[pos].isNewline
}
}
} else {
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.upperBound
}
}

case .wordBoundary:
builder.buildAssert { [options]
(cache, maxIndex, input, pos, subjectBounds) in
if options.usesSimpleUnicodeBoundaries {
// TODO: How should we handle bounds?
return _CharacterClassModel.word.isBoundary(
input,
at: pos,
bounds: subjectBounds,
with: options
)
} else {
return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
}
}

case .notWordBoundary:
builder.buildAssert { [options]
(cache, maxIndex, input, pos, subjectBounds) in
if options.usesSimpleUnicodeBoundaries {
// TODO: How should we handle bounds?
return !_CharacterClassModel.word.isBoundary(
input,
at: pos,
bounds: subjectBounds,
with: options
)
} else {
return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
}
}
}
builder.buildAssert(
by: kind,
options.anchorsMatchNewlines,
options.usesSimpleUnicodeBoundaries,
options.usesASCIIWord,
options.semanticLevel)
}

mutating func emitScalar(_ s: UnicodeScalar) throws {
Expand Down
7 changes: 7 additions & 0 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@

@_implementationOnly import _RegexParser

extension Character {
  /// The ASCII value of this character, excluding the CR-LF pair.
  ///
  /// Returns `nil` for `"\r\n"` and otherwise defers to `asciiValue`.
  var _singleScalarAsciiValue: UInt8? {
    if self == "\r\n" {
      return nil
    }
    return asciiValue
  }
}

extension DSLTree.Node {
/// Attempt to generate a consumer from this AST node
///
Expand Down
82 changes: 75 additions & 7 deletions Sources/_StringProcessing/Engine/InstPayload.swift
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

@_implementationOnly import _RegexParser // For AssertionKind

extension Instruction {
/// An instruction's payload packs operands and destination
Expand Down Expand Up @@ -51,7 +51,6 @@ extension Instruction.Payload {
case element(ElementRegister)
case consumer(ConsumeFunctionRegister)
case bitset(AsciiBitsetRegister)
case assertion(AssertionFunctionRegister)
case addr(InstructionAddress)
case capture(CaptureRegister)

Expand Down Expand Up @@ -203,6 +202,22 @@ extension Instruction.Payload {
var bitset: AsciiBitsetRegister {
interpret()
}

/// Packs a builtin character class and its two matching flags into a payload.
///
/// Bit 15 is set when `isStrict` is true and bit 14 when `isScalar` is true;
/// the raw value of `cc` fills the bits below them.
init(_ cc: BuiltinCC, _ isStrict: Bool, _ isScalar: Bool) {
  // The raw value must fit in the low 14 bits so that it cannot collide
  // with the two flag bits placed directly above it.
  assert(cc.rawValue <= 0x3F_FF)
  let strictFlag: UInt64 = isStrict ? 1 << 15 : 0
  let scalarFlag: UInt64 = isScalar ? 1 << 14 : 0
  self.init(cc.rawValue + strictFlag + scalarFlag)
}
/// Unpacks a payload into its builtin character class and matching flags.
///
/// The class is read from the low 14 bits, `isStrict` from bit 15 and
/// `isScalar` from bit 14. Traps if the low bits are not a valid
/// `BuiltinCC` raw value.
var builtinCCPayload: (cc: BuiltinCC, isStrict: Bool, isScalar: Bool) {
  let bits = self.rawValue
  return (
    cc: BuiltinCC(rawValue: bits & 0x3F_FF)!,
    isStrict: (bits >> 15) & 1 == 1,
    isScalar: (bits >> 14) & 1 == 1)
}

init(consumer: ConsumeFunctionRegister) {
self.init(consumer)
Expand All @@ -211,11 +226,64 @@ extension Instruction.Payload {
interpret()
}

init(assertion: AssertionFunctionRegister) {
self.init(assertion)
}
var assertion: AssertionFunctionRegister {
interpret()
/// Mask that keeps bits 0–51 of a raw value and clears the top 12 bits.
var _assertionKindMask: UInt64 { 0x000F_FFFF_FFFF_FFFF }
/// Encodes an assertion kind together with four matching options.
///
/// The options are stored as single-bit flags in bits 52–55 and the
/// assertion kind as a small integer (0–10) in the low bits.
init(assertion: AST.Atom.AssertionKind,
     _ anchorsMatchNewlines: Bool,
     _ usesSimpleUnicodeBoundaries: Bool,
     _ usesASCIIWord: Bool,
     _ semanticLevel: MatchingOptions.SemanticLevel
) {
  // Numeric code for the assertion kind.
  // Future work: Optimize this layout
  let kindCode: UInt64
  switch assertion {
  case .endOfLine:
    kindCode = 0
  case .endOfSubject:
    kindCode = 1
  case .endOfSubjectBeforeNewline:
    kindCode = 2
  case .firstMatchingPositionInSubject:
    kindCode = 3
  case .notTextSegment:
    kindCode = 4
  case .notWordBoundary:
    kindCode = 5
  case .resetStartOfMatch:
    kindCode = 6
  case .startOfLine:
    kindCode = 7
  case .startOfSubject:
    kindCode = 8
  case .textSegment:
    kindCode = 9
  case .wordBoundary:
    kindCode = 10
  }

  // Each option has its own bit, all well above the kind code, so OR-ing
  // the flags in yields the same value as adding them.
  var packed = kindCode
  if anchorsMatchNewlines { packed |= 1 << 55 }
  if usesSimpleUnicodeBoundaries { packed |= 1 << 54 }
  if usesASCIIWord { packed |= 1 << 53 }
  if semanticLevel == .unicodeScalar { packed |= 1 << 52 }
  self.init(rawValue: packed)
}
/// Decodes an assertion payload into its kind and matching options.
///
/// The tuple holds, in order: the assertion kind, `anchorsMatchNewlines`
/// (bit 55), `usesSimpleUnicodeBoundaries` (bit 54), `usesASCIIWord`
/// (bit 53) and the semantic level (bit 52 set means `.unicodeScalar`,
/// otherwise `.graphemeCluster`). Traps if the masked kind code is not
/// in 0...10.
var assertion: (AST.Atom.AssertionKind, Bool, Bool, Bool, MatchingOptions.SemanticLevel) {
  let bits = self.rawValue

  // Reports whether the single bit at `position` is set.
  func isSet(_ position: UInt64) -> Bool {
    (bits >> position) & 1 == 1
  }

  let level: MatchingOptions.SemanticLevel =
    isSet(52) ? .unicodeScalar : .graphemeCluster

  let decodedKind: AST.Atom.AssertionKind
  switch bits & _assertionKindMask {
  case 0:
    decodedKind = .endOfLine
  case 1:
    decodedKind = .endOfSubject
  case 2:
    decodedKind = .endOfSubjectBeforeNewline
  case 3:
    decodedKind = .firstMatchingPositionInSubject
  case 4:
    decodedKind = .notTextSegment
  case 5:
    decodedKind = .notWordBoundary
  case 6:
    decodedKind = .resetStartOfMatch
  case 7:
    decodedKind = .startOfLine
  case 8:
    decodedKind = .startOfSubject
  case 9:
    decodedKind = .textSegment
  case 10:
    decodedKind = .wordBoundary
  default:
    fatalError("Unreachable")
  }

  return (decodedKind, isSet(55), isSet(54), isSet(53), level)
}

init(addr: InstructionAddress) {
Expand Down
6 changes: 1 addition & 5 deletions Sources/_StringProcessing/Engine/Instruction.swift
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,7 @@ extension Instruction {
/// Operand: Ascii bitset register containing the bitset
case matchBitset

/// TODO: builtin assertions and anchors
case builtinAssertion

/// TODO: builtin character classes
case builtinCharacterClass
case matchBuiltin

// MARK: Extension points

Expand Down
32 changes: 21 additions & 11 deletions Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ extension MEProgram {

var asciiBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] = []
var consumeFunctions: [ConsumeFunction] = []
var assertionFunctions: [AssertionFunction] = []
var transformFunctions: [TransformFunction] = []
var matcherFunctions: [MatcherFunction] = []

Expand Down Expand Up @@ -163,6 +162,15 @@ extension MEProgram.Builder {
instructions.append(.init(
.matchBitset, .init(bitset: makeAsciiBitset(b))))
}

/// Appends a `matchBuiltin` instruction for the given builtin character class.
///
/// - Parameters:
///   - cc: The builtin character class to match.
///   - isStrict: Strictness flag stored in the instruction payload.
///   - isScalar: Scalar-mode flag stored in the instruction payload.
mutating func buildMatchBuiltin(
  _ cc: BuiltinCC,
  _ isStrict: Bool,
  isScalar: Bool
) {
  let payload = Instruction.Payload(cc, isStrict, isScalar)
  instructions.append(.init(.matchBuiltin, payload))
}

mutating func buildConsume(
by p: @escaping MEProgram.ConsumeFunction
Expand All @@ -172,10 +180,20 @@ extension MEProgram.Builder {
}

mutating func buildAssert(
by p: @escaping MEProgram.AssertionFunction
by kind: AST.Atom.AssertionKind,
_ anchorsMatchNewlines: Bool,
_ usesSimpleUnicodeBoundaries: Bool,
_ usesASCIIWord: Bool,
_ semanticLevel: MatchingOptions.SemanticLevel
) {
instructions.append(.init(
.assertBy, .init(assertion: makeAssertionFunction(p))))
.assertBy,
.init(
assertion: kind,
anchorsMatchNewlines,
usesSimpleUnicodeBoundaries,
usesASCIIWord,
semanticLevel)))
}

mutating func buildAccept() {
Expand Down Expand Up @@ -298,7 +316,6 @@ extension MEProgram.Builder {
regInfo.positions = nextPositionRegister.rawValue
regInfo.bitsets = asciiBitsets.count
regInfo.consumeFunctions = consumeFunctions.count
regInfo.assertionFunctions = assertionFunctions.count
regInfo.transformFunctions = transformFunctions.count
regInfo.matcherFunctions = matcherFunctions.count
regInfo.captures = nextCaptureRegister.rawValue
Expand All @@ -309,7 +326,6 @@ extension MEProgram.Builder {
staticSequences: sequences.stored,
staticBitsets: asciiBitsets,
staticConsumeFunctions: consumeFunctions,
staticAssertionFunctions: assertionFunctions,
staticTransformFunctions: transformFunctions,
staticMatcherFunctions: matcherFunctions,
registerInfo: regInfo,
Expand Down Expand Up @@ -458,12 +474,6 @@ extension MEProgram.Builder {
defer { consumeFunctions.append(f) }
return ConsumeFunctionRegister(consumeFunctions.count)
}
mutating func makeAssertionFunction(
_ f: @escaping MEProgram.AssertionFunction
) -> AssertionFunctionRegister {
defer { assertionFunctions.append(f) }
return AssertionFunctionRegister(assertionFunctions.count)
}
mutating func makeTransformFunction(
_ f: @escaping MEProgram.TransformFunction
) -> TransformRegister {
Expand Down
13 changes: 0 additions & 13 deletions Sources/_StringProcessing/Engine/MEBuiltins.swift

This file was deleted.

Loading