Skip to content

Use a bitset for ascii-only character classes #511

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 35 commits into from
Jun 29, 2022
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
5fd8840
[benchmark] Add no-capture version of grapheme breaking exercise
milseman Jun 19, 2022
03fe8d6
[benchmark] Add cross-engine benchmark helpers
milseman Jun 19, 2022
5667705
[benchmark] Hangul Syllable finding benchmark
milseman Jun 19, 2022
bde259b
Add debug mode
rctcwyvrn Jun 20, 2022
bf95e81
Fix typo in css regex
rctcwyvrn Jun 20, 2022
243ec7b
Add HTML benchmark
rctcwyvrn Jun 20, 2022
eeb0852
Add email regex benchmarks
rctcwyvrn Jun 20, 2022
49efd67
Add save/compare functionality to the benchmarker
rctcwyvrn Jun 20, 2022
b3a61a7
Clean up compare and add cli flags
rctcwyvrn Jun 20, 2022
926d208
Merge branch 'main' into more_more_benchmarks
milseman Jun 21, 2022
752ea76
Make fixes
rctcwyvrn Jun 21, 2022
7327e74
Merge branch 'more_more_benchmarks' of github.com:rctcwyvrn/swift-exp…
rctcwyvrn Jun 21, 2022
7a900b6
oops, remove some leftover code
rctcwyvrn Jun 21, 2022
50e8e6d
Fix linux build issue + add cli option for specifying compare file
rctcwyvrn Jun 21, 2022
3c7f62c
First ver of bitset character classes
rctcwyvrn Jun 22, 2022
b71b177
Did a dumb and didn't use the new api I had added...
rctcwyvrn Jun 22, 2022
e2a011c
Fix bug in inverted character sets
rctcwyvrn Jun 22, 2022
f7900e5
Remove nested character class cases
rctcwyvrn Jun 22, 2022
e9d1902
Remove comment
rctcwyvrn Jun 22, 2022
cf59091
Merge branch 'main' into many-closures-vs-one-bitset-boi
rctcwyvrn Jun 22, 2022
f4019d4
Cleanup handling of isInverted
rctcwyvrn Jun 23, 2022
ed82cb0
Cleanup
rctcwyvrn Jun 23, 2022
cc1ac9d
Remove isCaseInsensitive property
rctcwyvrn Jun 23, 2022
ccf6ade
Add tests for special cases
rctcwyvrn Jun 23, 2022
7b83e0c
Use switch on ranges instead of if
rctcwyvrn Jun 24, 2022
5121076
Rename asciivalue to singleScalarAsciiValue
rctcwyvrn Jun 27, 2022
3607b65
Properly handle unicode scalars mode in custom character classes
rctcwyvrn Jun 27, 2022
291a974
I most definitely did not forget to commit the tests
rctcwyvrn Jun 27, 2022
ddcf40f
Cleanup
rctcwyvrn Jun 27, 2022
f87b325
Add support for testing if compilation contains certain opcodes
rctcwyvrn Jun 27, 2022
2d8ac2d
Forgot the tests again, twice in one day...
rctcwyvrn Jun 27, 2022
fd66693
Spelling mistakes
rctcwyvrn Jun 27, 2022
22c8213
Make expectProgram take sets of opcodes
rctcwyvrn Jun 27, 2022
0781b93
Add compiler options + validation testing against unoptimized regexes
rctcwyvrn Jun 28, 2022
ffff944
Cleanup, clear cache of Regex.Program when setting new compile options
rctcwyvrn Jun 28, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -643,8 +643,12 @@ fileprivate extension Compiler.ByteCodeGen {
/// Emits matching code for a custom character class.
///
/// When the class can be expressed as an ASCII bitset (`asAsciiBitset`
/// returns non-nil), a bitset-matching instruction is emitted. Otherwise
/// the class is compiled to a consumer closure and emitted as a consume
/// instruction. Exactly one of the two is emitted per call.
///
/// - Parameter ccc: The custom character class to compile.
/// - Throws: Any error thrown by `generateConsumer` on the fallback path.
mutating func emitCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) throws {
  if let asciiBitset = ccc.asAsciiBitset(options) {
    builder.buildMatchAsciiBitset(asciiBitset)
  } else {
    // Only generate the closure when the bitset fast path is unavailable.
    let consumer = try ccc.generateConsumer(options)
    builder.buildConsume(by: consumer)
  }
}

@discardableResult
Expand Down
73 changes: 72 additions & 1 deletion Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,26 @@ extension DSLTree.Node {
}
}

extension DSLTree._AST.Atom {
  /// The ASCII value of the wrapped AST atom, forwarded from
  /// `ast.asciiValue`; `nil` when the underlying atom reports none.
  var asciiValue: UInt8? { ast.asciiValue }
}

extension DSLTree.Atom {
/// The ASCII value this atom stands for, if any.
///
/// - `.char`: the character's ASCII value, except that the CR-LF character
///   is rejected outright.
/// - `.scalar`: the scalar's value when it is ASCII.
/// - `.unconverted`: whatever the wrapped atom reports.
/// - Anything else: `nil`.
var asciiValue: UInt8? {
  if case let .char(character) = self, character != "\r\n" {
    return character.asciiValue
  }
  if case let .scalar(scalar) = self, scalar.isASCII {
    return UInt8(ascii: scalar)
  }
  if case let .unconverted(wrapped) = self {
    return wrapped.asciiValue
  }
  return nil
}

// TODO: If ByteCodeGen switches first, then this is unnecessary for
// top-level nodes, but it's also invoked for `.atom` members of a custom CC
func generateConsumer(
Expand Down Expand Up @@ -177,7 +196,18 @@ extension AST.Atom {
default: return nil
}
}


var asciiValue: UInt8? {
switch kind {
case let .char(c) where c != "\r\n":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Future cleanup: something like the below to consolidate logic

extension Character {
  var _singleScalarASCIIValue: UInt8? { ... }
}

return c.asciiValue
case let .scalar(s) where s.value.isASCII:
return UInt8(ascii: s.value)
default:
return nil
}
}

func generateConsumer(
_ opts: MatchingOptions
) throws -> MEProgram.ConsumeFunction? {
Expand Down Expand Up @@ -235,6 +265,34 @@ extension AST.Atom {
}

extension DSLTree.CustomCharacterClass.Member {
/// Converts this member into an ASCII bitset, if it is a single atom or a
/// range whose endpoints both have ASCII values.
///
/// - Parameters:
///   - opts: Matching options; only `isCaseInsensitive` is consulted here.
///   - isInverted: Inversion flag to record on the resulting bitset.
/// - Returns: The bitset for this member, or `nil` for any other member
///   kind or when an atom involved has no ASCII value.
func asAsciiBitset(
  _ opts: MatchingOptions,
  _ isInverted: Bool
) -> DSLTree.CustomCharacterClass.AsciiBitset? {
  typealias Bitset = DSLTree.CustomCharacterClass.AsciiBitset

  switch self {
  case let .atom(atom):
    guard let value = atom.asciiValue else { return nil }
    return Bitset(value, isInverted, opts.isCaseInsensitive)

  case let .range(lowerAtom, upperAtom):
    guard let lower = lowerAtom.asciiValue,
          let upper = upperAtom.asciiValue
    else { return nil }
    return Bitset(
      low: lower,
      high: upper,
      isInverted: isInverted,
      isCaseInsensitive: opts.isCaseInsensitive)

  default:
    return nil
  }
}

func generateConsumer(
_ opts: MatchingOptions
) throws -> MEProgram.ConsumeFunction {
Expand Down Expand Up @@ -342,6 +400,19 @@ extension DSLTree.CustomCharacterClass.Member {
}

extension DSLTree.CustomCharacterClass {
/// Builds one ASCII bitset covering every member of this character class.
///
/// Starts from an empty bitset carrying this class's `isInverted` flag and
/// unions in each member's bitset.
///
/// - Parameter opts: Matching options forwarded to each member.
/// - Returns: The combined bitset, or `nil` as soon as any member cannot be
///   represented as an ASCII bitset.
func asAsciiBitset(_ opts: MatchingOptions) -> AsciiBitset? {
  var combined = AsciiBitset(isInverted: isInverted)
  for member in members {
    // One unrepresentable member disqualifies the whole class, so stop
    // immediately instead of converting the remaining members for nothing.
    guard let next = member.asAsciiBitset(opts, isInverted) else {
      return nil
    }
    combined = combined.union(next)
  }
  return combined
}

func generateConsumer(
_ opts: MatchingOptions
) throws -> MEProgram.ConsumeFunction {
Expand Down
8 changes: 8 additions & 0 deletions Sources/_StringProcessing/Engine/InstPayload.swift
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ extension Instruction.Payload {
case bool(BoolRegister)
case element(ElementRegister)
case consumer(ConsumeFunctionRegister)
case bitset(AsciiBitsetRegister)
case assertion(AssertionFunctionRegister)
case addr(InstructionAddress)
case capture(CaptureRegister)
Expand Down Expand Up @@ -196,6 +197,13 @@ extension Instruction.Payload {
interpret()
}

/// Creates a payload carrying the given ASCII bitset register.
init(bitset: AsciiBitsetRegister) {
self.init(bitset)
}
/// This payload read back as an ASCII bitset register, via `interpret()`.
// NOTE(review): presumably only meaningful for payloads built with
// `init(bitset:)` — confirm against `interpret()`.
var bitset: AsciiBitsetRegister {
interpret()
}

init(consumer: ConsumeFunctionRegister) {
self.init(consumer)
}
Expand Down
4 changes: 4 additions & 0 deletions Sources/_StringProcessing/Engine/Instruction.swift
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ extension Instruction {
/// Operand: Sequence register to compare against.
case matchSequence

/// Match against a set of valid ascii values stored in a bitset
/// Operand: Ascii bitset register containing the bitset
case matchBitset

/// TODO: builtin assertions and anchors
case builtinAssertion

Expand Down
19 changes: 19 additions & 0 deletions Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ extension MEProgram {
var elements = TypedSetVector<Input.Element, _ElementRegister>()
var sequences = TypedSetVector<[Input.Element], _SequenceRegister>()

var asciiBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] = []
var consumeFunctions: [ConsumeFunction] = []
var assertionFunctions: [AssertionFunction] = []
var transformFunctions: [TransformFunction] = []
Expand Down Expand Up @@ -147,6 +148,15 @@ extension MEProgram.Builder {
.init(sequence: sequences.store(.init(s)))))
}

/// Appends a `matchBitset` instruction for the given ASCII bitset.
///
/// The bitset is first stored through `makeAsciiBitset`, and the register
/// it returns becomes the instruction's payload.
mutating func buildMatchAsciiBitset(
  _ b: DSLTree.CustomCharacterClass.AsciiBitset
) {
  let register = makeAsciiBitset(b)
  let payload = Instruction.Payload(bitset: register)
  instructions.append(Instruction(.matchBitset, payload))
}

mutating func buildConsume(
by p: @escaping MEProgram.ConsumeFunction
) {
Expand Down Expand Up @@ -273,6 +283,7 @@ extension MEProgram.Builder {
regInfo.sequences = sequences.count
regInfo.ints = nextIntRegister.rawValue
regInfo.values = nextValueRegister.rawValue
regInfo.bitsets = asciiBitsets.count
regInfo.consumeFunctions = consumeFunctions.count
regInfo.assertionFunctions = assertionFunctions.count
regInfo.transformFunctions = transformFunctions.count
Expand All @@ -283,6 +294,7 @@ extension MEProgram.Builder {
instructions: InstructionList(instructions),
staticElements: elements.stored,
staticSequences: sequences.stored,
staticBitsets: asciiBitsets,
staticConsumeFunctions: consumeFunctions,
staticAssertionFunctions: assertionFunctions,
staticTransformFunctions: transformFunctions,
Expand Down Expand Up @@ -414,6 +426,13 @@ extension MEProgram.Builder {
// TODO: A register-mapping helper struct, which could release
// registers without monotonicity required

/// Stores `b` in `asciiBitsets` and returns the register naming its slot.
///
/// The register value is the index the bitset occupies, i.e. the count of
/// `asciiBitsets` before the append.
mutating func makeAsciiBitset(
  _ b: DSLTree.CustomCharacterClass.AsciiBitset
) -> AsciiBitsetRegister {
  let register = AsciiBitsetRegister(asciiBitsets.count)
  asciiBitsets.append(b)
  return register
}

mutating func makeConsumeFunction(
_ f: @escaping MEProgram.ConsumeFunction
) -> ConsumeFunctionRegister {
Expand Down
1 change: 1 addition & 0 deletions Sources/_StringProcessing/Engine/MEProgram.swift
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ struct MEProgram {

var staticElements: [Input.Element]
var staticSequences: [[Input.Element]]
var staticBitsets: [DSLTree.CustomCharacterClass.AsciiBitset]
var staticConsumeFunctions: [ConsumeFunction]
var staticAssertionFunctions: [AssertionFunction]
var staticTransformFunctions: [TransformFunction]
Expand Down
21 changes: 21 additions & 0 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,20 @@ extension Processor {
}
return true
}

// If we have a bitset we know that the CharacterClass only matches against
// ascii characters, so check if the current input element is ascii then
// check if it is set in the bitset
mutating func matchBitset(
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset
) -> Bool {
guard let cur = load(), bitset.matches(char: cur) else {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Future work: we can implement this on the UTF-8 view, but we'd have to handle grapheme breaking ourselves.

BTW, @natecook1000 what is the model for semantic mode processing around here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want to try a fast check, IIRC you could have this at the top:

guard bitset.matches(input.utf8[currentPosition]),  
      input._isOnGraphemeBoundarySomething(input.utf8.index(after: currentPosition)) 
else {

That'd be for measuring or approximating the potential benefit. I think we'd want to have a more consistent series of helper functions surrounding sub-grapheme cluster processing though.

Copy link
Member

@natecook1000 natecook1000 Jun 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When matching with grapheme cluster semantics:

  • this should only match a single-scalar ASCII character (unless inverted)
  • it should advance to the next character after successfully matching

When matching with Unicode scalar semantics:

  • this should only check the current Unicode scalar value
  • it should advance to the next Unicode scalar value after successfully matching

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but what is the model for the engine? The engine isn't querying options on every loop.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So IIUC, this optimization only applies to grapheme-semantic mode right now, which is an unfortunate limitation. Lily, can you make sure to write a test for this somehow? We may need to revise our compilation testing approach.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think another approach is to have a bit in some instructions or payloads (whether that is really a dedicated bit or a virtual bit because we expand opcodes around it) that signals whether it should end in a grapheme break check. That would allow us to have a specialized matchScalar instruction, and we'd not bother to check grapheme boundaries for scalar sequences that we statically know are NFC invariant and don't need a check between every scalar.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would make sense to do optimizations for scalar mode (new instructions, new processor functions) as a future PR, for now I just made it not generate the bitset when in scalar mode.

I also added some support to Compiler and CompileTests to check for the existence of certain opcodes under different semantic levels

signalFailure()
return false
}
_uncheckedForcedConsumeOne()
return true
}

mutating func signalFailure() {
guard let (pc, pos, stackEnd, capEnds, intRegisters) =
Expand Down Expand Up @@ -364,6 +378,13 @@ extension Processor {
controller.step()
}

case .matchBitset:
let reg = payload.bitset
let bitset = registers[reg]
if matchBitset(bitset) {
controller.step()
}

case .consumeBy:
let reg = payload.consumer
guard currentPosition < searchBounds.upperBound,
Expand Down
11 changes: 11 additions & 0 deletions Sources/_StringProcessing/Engine/Registers.swift
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ extension Processor {
//
// TODO: Degenericize Processor and store Strings
var sequences: [[Element]] = []

var bitsets: [DSLTree.CustomCharacterClass.AsciiBitset]

var consumeFunctions: [MEProgram.ConsumeFunction]

Expand Down Expand Up @@ -67,6 +69,11 @@ extension Processor.Registers {
subscript(_ i: ElementRegister) -> Input.Element {
elements[i.rawValue]
}
/// Returns the ASCII bitset stored in `bitsets` at the register's raw index.
subscript(
_ i: AsciiBitsetRegister
) -> DSLTree.CustomCharacterClass.AsciiBitset {
bitsets[i.rawValue]
}
subscript(_ i: ConsumeFunctionRegister) -> MEProgram.ConsumeFunction {
consumeFunctions[i.rawValue]
}
Expand Down Expand Up @@ -94,6 +101,9 @@ extension Processor.Registers {
self.sequences = program.staticSequences
assert(sequences.count == info.sequences)

self.bitsets = program.staticBitsets
assert(bitsets.count == info.bitsets)

self.consumeFunctions = program.staticConsumeFunctions
assert(consumeFunctions.count == info.consumeFunctions)

Expand Down Expand Up @@ -133,6 +143,7 @@ extension MEProgram {
var sequences = 0
var bools = 0
var strings = 0
var bitsets = 0
var consumeFunctions = 0
var assertionFunctions = 0
var transformFunctions = 0
Expand Down
81 changes: 81 additions & 0 deletions Sources/_StringProcessing/Regex/DSLTree.swift
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,87 @@ extension DSLTree {
indirect case subtraction(CustomCharacterClass, CustomCharacterClass)
indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass)
}

/// A 128-bit membership set over the ASCII values 0...127.
///
/// Bits for values 0...63 are stored in `a` and bits for values 64...127 in
/// `b`. When `isInverted` is true, `matches(char:)` reports the complement
/// of the stored set, so non-ASCII characters match an inverted set.
public struct AsciiBitset {
  let isInverted: Bool
  var a: UInt64 = 0
  var b: UInt64 = 0

  /// Creates an empty bitset with the given inversion flag.
  init(isInverted: Bool) {
    self.isInverted = isInverted
  }

  /// Creates a bitset containing a single ASCII value (plus its other-case
  /// counterpart when `isCaseInsensitive` is true).
  init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) {
    self.isInverted = isInverted
    add(val, isCaseInsensitive)
  }

  /// Creates a bitset containing every value in `low...high`.
  ///
  /// `low` must not exceed `high`; forming the closed range traps otherwise.
  init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) {
    self.isInverted = isInverted
    for val in low...high {
      add(val, isCaseInsensitive)
    }
  }

  /// Creates a bitset directly from its two backing words.
  internal init(
    a: UInt64,
    b: UInt64,
    isInverted: Bool
  ) {
    self.isInverted = isInverted
    self.a = a
    self.b = b
  }

  /// Inserts `val`, and when `isCaseInsensitive` is true also inserts the
  /// opposite-case letter for ASCII letters.
  internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) {
    setBit(val)
    guard isCaseInsensitive else { return }
    switch val {
    case 65...90:
      // "A"..."Z" -> "a"..."z". The range starts at 65, not 64: 64 is "@",
      // which has no case counterpart and must not pull in "`" (96).
      setBit(val + 32)
    case 97...122:
      // "a"..."z" -> "A"..."Z".
      setBit(val - 32)
    default:
      break
    }
  }

  /// Sets the bit for `val`. Values outside 0...127 leave the set unchanged.
  internal mutating func setBit(_ val: UInt8) {
    switch val {
    case 0..<64:
      a |= UInt64(1) << val
    case 64..<128:
      b |= UInt64(1) << (val - 64)
    default:
      break
    }
  }

  /// Reports whether `char` is matched by this set, honoring `isInverted`.
  ///
  /// Only single-scalar ASCII characters can be members. The CR-LF
  /// character is explicitly treated as a non-member, because
  /// `Character.asciiValue` would otherwise report it as "\n" (10).
  internal func matches(char: Character) -> Bool {
    let isMember: Bool
    if char != "\r\n", let val = char.asciiValue {
      if val < 64 {
        isMember = (a >> val) & 1 == 1
      } else {
        isMember = (b >> (val - 64)) & 1 == 1
      }
    } else {
      isMember = false
    }
    return isInverted ? !isMember : isMember
  }

  /// Joins another bitset from a Member of the same CustomCharacterClass.
  ///
  /// Both bitsets must share the same `isInverted` flag.
  internal func union(_ other: AsciiBitset) -> AsciiBitset {
    precondition(self.isInverted == other.isInverted)
    return AsciiBitset(
      a: self.a | other.a,
      b: self.b | other.b,
      isInverted: self.isInverted
    )
  }
}
}

@_spi(RegexBuilder)
Expand Down
4 changes: 4 additions & 0 deletions Sources/_StringProcessing/Utility/TypedInt.swift
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ enum _BoolRegister {}
typealias StringRegister = TypedInt<_StringRegister>
enum _StringRegister {}

/// Used for matching sets of ascii values via bitsets
typealias AsciiBitsetRegister = TypedInt<_AsciiBitsetRegister>
enum _AsciiBitsetRegister {}

/// Used for consume functions, e.g. character classes
typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister>
enum _ConsumeFunctionRegister {}
Expand Down
Loading