Skip to content

Commit 711c6e3

Browse files
rctcwyvrn and milseman authored
Use a bitset for ascii-only character classes + Correctly handle character classes in unicode scalars mode (#511)
* [benchmark] Add no-capture version of grapheme breaking exercise * [benchmark] Add cross-engine benchmark helpers * [benchmark] Hangul Syllable finding benchmark * Add debug mode * Fix typo in css regex * Add HTML benchmark * Add email regex benchmarks * Add save/compare functionality to the benchmarker * Clean up compare and add cli flags * Make fixes * oops, remove some leftover code * Fix linux build issue + add cli option for specifying compare file * First ver of bitset character classes * Did a dumb and didn't use the new api I had added... * Fix bug in inverted character sets * Remove nested character class cases * Remove comment it did in fact, not need @escaping * Cleanup handling of isInverted * Cleanup * Remove isCaseInsensitive property It was already being folded into the value on initialization, no reason to keep it * Add tests for special cases * Use switch on ranges instead of if * Rename asciivalue to singleScalarAsciiValue * Properly handle unicode scalars mode in custom character classes * I most definitely did not forget to commit the tests * Cleanup * Add support for testing if compilation contains certain opcodes * Forgot the tests again, twice in one day... * Spelling mistakes * Make expectProgram take sets of opcodes * Add compiler options + validation testing against unoptimized regexes * Cleanup, clear cache of Regex.Program when setting new compile options Co-authored-by: Michael Ilseman <[email protected]>
1 parent e87149a commit 711c6e3

15 files changed

+451
-36
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,16 @@ extension Compiler {
88
/// This is used to determine whether to apply initial options.
99
var hasEmittedFirstMatchableAtom = false
1010

11-
init(options: MatchingOptions, captureList: CaptureList) {
11+
private let compileOptions: CompileOptions
12+
fileprivate var optimizationsEnabled: Bool { !compileOptions.contains(.disableOptimizations) }
13+
14+
init(
15+
options: MatchingOptions,
16+
compileOptions: CompileOptions,
17+
captureList: CaptureList
18+
) {
1219
self.options = options
20+
self.compileOptions = compileOptions
1321
self.builder.captureList = captureList
1422
}
1523
}
@@ -643,8 +651,16 @@ fileprivate extension Compiler.ByteCodeGen {
643651
mutating func emitCustomCharacterClass(
644652
_ ccc: DSLTree.CustomCharacterClass
645653
) throws {
646-
let consumer = try ccc.generateConsumer(options)
647-
builder.buildConsume(by: consumer)
654+
if let asciiBitset = ccc.asAsciiBitset(options),
655+
options.semanticLevel == .graphemeCluster,
656+
optimizationsEnabled {
657+
// future work: add a bit to .matchBitset to consume either a character
658+
// or a scalar so we can have this optimization in scalar mode
659+
builder.buildMatchAsciiBitset(asciiBitset)
660+
} else {
661+
let consumer = try ccc.generateConsumer(options)
662+
builder.buildConsume(by: consumer)
663+
}
648664
}
649665

650666
@discardableResult

Sources/_StringProcessing/Compiler.swift

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ class Compiler {
1616

1717
// TODO: Or are these stored on the tree?
1818
var options = MatchingOptions()
19+
private var compileOptions: CompileOptions = .default
1920

2021
init(ast: AST) {
2122
self.tree = ast.dslTree
@@ -25,23 +26,22 @@ class Compiler {
2526
self.tree = tree
2627
}
2728

29+
init(tree: DSLTree, compileOptions: CompileOptions) {
30+
self.tree = tree
31+
self.compileOptions = compileOptions
32+
}
33+
2834
__consuming func emit() throws -> MEProgram {
2935
// TODO: Handle global options
3036
var codegen = ByteCodeGen(
31-
options: options, captureList: tree.captureList
32-
)
37+
options: options,
38+
compileOptions:
39+
compileOptions,
40+
captureList: tree.captureList)
3341
return try codegen.emitRoot(tree.root)
3442
}
3543
}
3644

37-
func _compileRegex(
38-
_ regex: String, _ syntax: SyntaxOptions = .traditional
39-
) throws -> Executor {
40-
let ast = try parse(regex, .semantic, syntax)
41-
let program = try Compiler(ast: ast).emit()
42-
return Executor(program: program)
43-
}
44-
4545
// An error produced when compiling a regular expression.
4646
enum RegexCompilationError: Error, CustomStringConvertible {
4747
// TODO: Source location?
@@ -54,3 +54,35 @@ enum RegexCompilationError: Error, CustomStringConvertible {
5454
}
5555
}
5656
}
57+
58+
// Testing support
59+
@available(SwiftStdlib 5.7, *)
60+
func _compileRegex(
61+
_ regex: String,
62+
_ syntax: SyntaxOptions = .traditional,
63+
_ semanticLevel: RegexSemanticLevel? = nil
64+
) throws -> Executor {
65+
let ast = try parse(regex, .semantic, syntax)
66+
let dsl: DSLTree
67+
68+
switch semanticLevel?.base {
69+
case .graphemeCluster:
70+
let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)])
71+
dsl = DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root))
72+
case .unicodeScalar:
73+
let sequence = AST.MatchingOptionSequence(adding: [.init(.unicodeScalarSemantics, location: .fake)])
74+
dsl = DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root))
75+
case .none:
76+
dsl = ast.dslTree
77+
}
78+
let program = try Compiler(tree: dsl).emit()
79+
return Executor(program: program)
80+
}
81+
82+
extension Compiler {
83+
struct CompileOptions: OptionSet {
84+
let rawValue: Int
85+
static let disableOptimizations = CompileOptions(rawValue: 1)
86+
static let `default`: CompileOptions = []
87+
}
88+
}

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 98 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,26 @@ extension DSLTree.Node {
5151
}
5252
}
5353

54+
extension DSLTree._AST.Atom {
55+
var singleScalarASCIIValue: UInt8? {
56+
return ast.singleScalarASCIIValue
57+
}
58+
}
59+
5460
extension DSLTree.Atom {
61+
var singleScalarASCIIValue: UInt8? {
62+
switch self {
63+
case let .char(c) where c != "\r\n":
64+
return c.asciiValue
65+
case let .scalar(s) where s.isASCII:
66+
return UInt8(ascii: s)
67+
case let .unconverted(atom):
68+
return atom.singleScalarASCIIValue
69+
default:
70+
return nil
71+
}
72+
}
73+
5574
// TODO: If ByteCodeGen switches first, then this is unnecessary for
5675
// top-level nodes, but it's also invoked for `.atom` members of a custom CC
5776
func generateConsumer(
@@ -61,17 +80,32 @@ extension DSLTree.Atom {
6180

6281
switch self {
6382
case let .char(c):
64-
// TODO: Match level?
65-
return { input, bounds in
66-
let low = bounds.lowerBound
67-
if isCaseInsensitive && c.isCased {
68-
return input[low].lowercased() == c.lowercased()
69-
? input.index(after: low)
70-
: nil
71-
} else {
72-
return input[low] == c
73-
? input.index(after: low)
74-
: nil
83+
if opts.semanticLevel == .graphemeCluster {
84+
return { input, bounds in
85+
let low = bounds.lowerBound
86+
if isCaseInsensitive && c.isCased {
87+
return input[low].lowercased() == c.lowercased()
88+
? input.index(after: low)
89+
: nil
90+
} else {
91+
return input[low] == c
92+
? input.index(after: low)
93+
: nil
94+
}
95+
}
96+
} else {
97+
let consumers = c.unicodeScalars.map { s in consumeScalar {
98+
isCaseInsensitive
99+
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
100+
: $0 == s
101+
}}
102+
return { input, bounds in
103+
for fn in consumers {
104+
if let idx = fn(input, bounds) {
105+
return idx
106+
}
107+
}
108+
return nil
75109
}
76110
}
77111
case let .scalar(s):
@@ -177,7 +211,18 @@ extension AST.Atom {
177211
default: return nil
178212
}
179213
}
180-
214+
215+
var singleScalarASCIIValue: UInt8? {
216+
switch kind {
217+
case let .char(c) where c != "\r\n":
218+
return c.asciiValue
219+
case let .scalar(s) where s.value.isASCII:
220+
return UInt8(ascii: s.value)
221+
default:
222+
return nil
223+
}
224+
}
225+
181226
func generateConsumer(
182227
_ opts: MatchingOptions
183228
) throws -> MEProgram.ConsumeFunction? {
@@ -235,6 +280,34 @@ extension AST.Atom {
235280
}
236281

237282
extension DSLTree.CustomCharacterClass.Member {
283+
func asAsciiBitset(
284+
_ opts: MatchingOptions,
285+
_ isInverted: Bool
286+
) -> DSLTree.CustomCharacterClass.AsciiBitset? {
287+
switch self {
288+
case let .atom(a):
289+
if let val = a.singleScalarASCIIValue {
290+
return DSLTree.CustomCharacterClass.AsciiBitset(
291+
val,
292+
isInverted,
293+
opts.isCaseInsensitive
294+
)
295+
}
296+
case let .range(low, high):
297+
if let lowVal = low.singleScalarASCIIValue, let highVal = high.singleScalarASCIIValue {
298+
return DSLTree.CustomCharacterClass.AsciiBitset(
299+
low: lowVal,
300+
high: highVal,
301+
isInverted: isInverted,
302+
isCaseInsensitive: opts.isCaseInsensitive
303+
)
304+
}
305+
default:
306+
return nil
307+
}
308+
return nil
309+
}
310+
238311
func generateConsumer(
239312
_ opts: MatchingOptions
240313
) throws -> MEProgram.ConsumeFunction {
@@ -342,6 +415,19 @@ extension DSLTree.CustomCharacterClass.Member {
342415
}
343416

344417
extension DSLTree.CustomCharacterClass {
418+
func asAsciiBitset(_ opts: MatchingOptions) -> AsciiBitset? {
419+
return members.reduce(
420+
.init(isInverted: isInverted),
421+
{result, member in
422+
if let next = member.asAsciiBitset(opts, isInverted) {
423+
return result?.union(next)
424+
} else {
425+
return nil
426+
}
427+
}
428+
)
429+
}
430+
345431
func generateConsumer(
346432
_ opts: MatchingOptions
347433
) throws -> MEProgram.ConsumeFunction {

Sources/_StringProcessing/Engine/InstPayload.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ extension Instruction.Payload {
5050
case bool(BoolRegister)
5151
case element(ElementRegister)
5252
case consumer(ConsumeFunctionRegister)
53+
case bitset(AsciiBitsetRegister)
5354
case assertion(AssertionFunctionRegister)
5455
case addr(InstructionAddress)
5556
case capture(CaptureRegister)
@@ -196,6 +197,13 @@ extension Instruction.Payload {
196197
interpret()
197198
}
198199

200+
init(bitset: AsciiBitsetRegister) {
201+
self.init(bitset)
202+
}
203+
var bitset: AsciiBitsetRegister {
204+
interpret()
205+
}
206+
199207
init(consumer: ConsumeFunctionRegister) {
200208
self.init(consumer)
201209
}

Sources/_StringProcessing/Engine/Instruction.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,10 @@ extension Instruction {
8484
/// Operand: Sequence register to compare against.
8585
case matchSequence
8686

87+
/// Match against a set of valid ascii values stored in a bitset
88+
/// Operand: Ascii bitset register containing the bitset
89+
case matchBitset
90+
8791
/// TODO: builtin assertions and anchors
8892
case builtinAssertion
8993

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ extension MEProgram {
1818
var elements = TypedSetVector<Input.Element, _ElementRegister>()
1919
var sequences = TypedSetVector<[Input.Element], _SequenceRegister>()
2020

21+
var asciiBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] = []
2122
var consumeFunctions: [ConsumeFunction] = []
2223
var assertionFunctions: [AssertionFunction] = []
2324
var transformFunctions: [TransformFunction] = []
@@ -147,6 +148,13 @@ extension MEProgram.Builder {
147148
.init(sequence: sequences.store(.init(s)))))
148149
}
149150

151+
mutating func buildMatchAsciiBitset(
152+
_ b: DSLTree.CustomCharacterClass.AsciiBitset
153+
) {
154+
instructions.append(.init(
155+
.matchBitset, .init(bitset: makeAsciiBitset(b))))
156+
}
157+
150158
mutating func buildConsume(
151159
by p: @escaping MEProgram.ConsumeFunction
152160
) {
@@ -273,6 +281,7 @@ extension MEProgram.Builder {
273281
regInfo.sequences = sequences.count
274282
regInfo.ints = nextIntRegister.rawValue
275283
regInfo.values = nextValueRegister.rawValue
284+
regInfo.bitsets = asciiBitsets.count
276285
regInfo.consumeFunctions = consumeFunctions.count
277286
regInfo.assertionFunctions = assertionFunctions.count
278287
regInfo.transformFunctions = transformFunctions.count
@@ -283,6 +292,7 @@ extension MEProgram.Builder {
283292
instructions: InstructionList(instructions),
284293
staticElements: elements.stored,
285294
staticSequences: sequences.stored,
295+
staticBitsets: asciiBitsets,
286296
staticConsumeFunctions: consumeFunctions,
287297
staticAssertionFunctions: assertionFunctions,
288298
staticTransformFunctions: transformFunctions,
@@ -414,6 +424,13 @@ extension MEProgram.Builder {
414424
// TODO: A register-mapping helper struct, which could release
415425
// registers without monotonicity required
416426

427+
mutating func makeAsciiBitset(
428+
_ b: DSLTree.CustomCharacterClass.AsciiBitset
429+
) -> AsciiBitsetRegister {
430+
defer { asciiBitsets.append(b) }
431+
return AsciiBitsetRegister(asciiBitsets.count)
432+
}
433+
417434
mutating func makeConsumeFunction(
418435
_ f: @escaping MEProgram.ConsumeFunction
419436
) -> ConsumeFunctionRegister {

Sources/_StringProcessing/Engine/MEProgram.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ struct MEProgram {
2626

2727
var staticElements: [Input.Element]
2828
var staticSequences: [[Input.Element]]
29+
var staticBitsets: [DSLTree.CustomCharacterClass.AsciiBitset]
2930
var staticConsumeFunctions: [ConsumeFunction]
3031
var staticAssertionFunctions: [AssertionFunction]
3132
var staticTransformFunctions: [TransformFunction]

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,20 @@ extension Processor {
226226
}
227227
return true
228228
}
229+
230+
// If we have a bitset we know that the CharacterClass only matches against
231+
// ascii characters, so check if the current input element is ascii then
232+
// check if it is set in the bitset
233+
mutating func matchBitset(
234+
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset
235+
) -> Bool {
236+
guard let cur = load(), bitset.matches(char: cur) else {
237+
signalFailure()
238+
return false
239+
}
240+
_uncheckedForcedConsumeOne()
241+
return true
242+
}
229243

230244
mutating func signalFailure() {
231245
guard let (pc, pos, stackEnd, capEnds, intRegisters) =
@@ -364,6 +378,13 @@ extension Processor {
364378
controller.step()
365379
}
366380

381+
case .matchBitset:
382+
let reg = payload.bitset
383+
let bitset = registers[reg]
384+
if matchBitset(bitset) {
385+
controller.step()
386+
}
387+
367388
case .consumeBy:
368389
let reg = payload.consumer
369390
guard currentPosition < searchBounds.upperBound,

0 commit comments

Comments
 (0)