14
14
15
15
import XCTest
16
16
17
/// Test-side classification of a compiled regex instruction.
///
/// Lists one case per opcode handled by `DecodedInstr.decode(_:)`, plus the
/// payload-dependent variants (case-insensitive, unchecked, scalar bitset)
/// that `decode(_:)` tells apart, so tests can assert on the exact variant.
enum DecodedInstr {
  // Control flow and save-point bookkeeping.
  case invalid, moveImmediate, branch, condBranchZeroElseDecrement
  case save, saveAddress, splitSaving
  case clear, clearThrough
  case accept, fail

  // Input consumption; `match`, `matchScalar` and `matchBitset` are expanded
  // into the variants selected by their payload.
  case advance
  case match, matchCaseInsensitive
  case matchScalar, matchScalarCaseInsensitiveUnchecked
  case matchScalarCaseInsensitive, matchScalarUnchecked
  case matchBitsetScalar, matchBitset

  // Closure-driven matching and backreferences.
  case consumeBy, assertBy, matchBy, backreference

  // Captures.
  case beginCapture, endCapture, transformCapture, captureValue

  // Built-in assertions and character classes.
  case builtinAssertion, builtinCharacterClass
}
49
+
50
extension DecodedInstr {
  /// Decodes `instruction` into the `DecodedInstr` case that describes it.
  ///
  /// Most opcodes map one-to-one onto the case of the same name. Three
  /// opcodes are expanded into variants by inspecting the payload:
  /// - `match` becomes `.match` or `.matchCaseInsensitive`.
  /// - `matchScalar` becomes one of the four combinations of
  ///   case-insensitivity and boundary checking.
  /// - `matchBitset` becomes `.matchBitset` or `.matchBitsetScalar`.
  ///
  /// Must stay in sync with Processor.cycle
  ///
  /// - Parameter instruction: The instruction to classify.
  /// - Returns: The decoded variant. Never returns `.invalid`: an `invalid`
  ///   opcode traps with `fatalError` instead.
  static func decode(_ instruction: Instruction) -> DecodedInstr {
    let (opcode, payload) = instruction.destructure

    // Exhaustive on purpose (no `default:`): an opcode without an arm here
    // becomes a compile error instead of being silently misclassified.
    switch opcode {
    case .invalid:
      fatalError("Invalid program")

    // Opcodes that map directly onto the case of the same name.
    case .moveImmediate: return .moveImmediate
    case .branch: return .branch
    case .condBranchZeroElseDecrement: return .condBranchZeroElseDecrement
    case .save: return .save
    case .saveAddress: return .saveAddress
    case .splitSaving: return .splitSaving
    case .clear: return .clear
    case .clearThrough: return .clearThrough
    case .accept: return .accept
    case .fail: return .fail
    case .advance: return .advance

    // Opcodes whose payload selects the variant.
    case .match:
      let (isCaseInsensitive, _) = payload.elementPayload
      return isCaseInsensitive ? .matchCaseInsensitive : .match

    case .matchScalar:
      let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload
      // The "Unchecked" variants are the ones without a boundary check.
      switch (caseInsensitive, boundaryCheck) {
      case (true, true): return .matchScalarCaseInsensitive
      case (true, false): return .matchScalarCaseInsensitiveUnchecked
      case (false, true): return .matchScalar
      case (false, false): return .matchScalarUnchecked
      }

    case .matchBitset:
      let (isScalar, _) = payload.bitsetPayload
      return isScalar ? .matchBitsetScalar : .matchBitset

    // Remaining one-to-one opcodes.
    case .consumeBy: return .consumeBy
    case .assertBy: return .assertBy
    case .matchBy: return .matchBy
    case .backreference: return .backreference
    case .beginCapture: return .beginCapture
    case .endCapture: return .endCapture
    case .transformCapture: return .transformCapture
    case .captureValue: return .captureValue
    case .builtinAssertion: return .builtinAssertion
    case .builtinCharacterClass: return .builtinCharacterClass
    }
  }
}
135
+
17
136
extension RegexTests {
18
137
19
138
private func testCompilationEquivalence(
@@ -147,20 +266,21 @@ extension RegexTests {
147
266
for regex: String ,
148
267
syntax: SyntaxOptions = . traditional,
149
268
semanticLevel: RegexSemanticLevel ? = nil ,
150
- contains targets: Set < Instruction . OpCode > = [ ] ,
151
- doesNotContain invalid: Set < Instruction . OpCode > = [ ] ,
269
+ contains targets: Set < DecodedInstr > = [ ] ,
270
+ doesNotContain invalid: Set < DecodedInstr > = [ ] ,
152
271
file: StaticString = #file,
153
272
line: UInt = #line
154
273
) {
155
274
do {
156
275
let prog = try _compileRegex ( regex, syntax, semanticLevel)
157
- var found : Set < Instruction . OpCode > = [ ]
276
+ var found : Set < DecodedInstr > = [ ]
158
277
for inst in prog. engine. instructions {
159
- found. insert ( inst. opcode)
278
+ let decoded = DecodedInstr . decode ( inst)
279
+ found. insert ( decoded)
160
280
161
- if invalid. contains ( inst . opcode ) {
281
+ if invalid. contains ( decoded ) {
162
282
XCTFail (
163
- " Compiled regex ' \( regex) ' contains incorrect opcode \( inst . opcode ) " ,
283
+ " Compiled regex ' \( regex) ' contains incorrect opcode \( decoded ) " ,
164
284
file: file,
165
285
line: line)
166
286
return
@@ -181,94 +301,94 @@ extension RegexTests {
181
301
}
182
302
}
183
303
184
- // func testBitsetCompile() {
185
- // expectProgram(
186
- // for: "[abc]",
187
- // contains: [.matchBitset],
188
- // doesNotContain: [.consumeBy, .matchBitsetScalar])
189
- // expectProgram(
190
- // for: "[abc]",
191
- // semanticLevel: .unicodeScalar,
192
- // contains: [.matchBitsetScalar],
193
- // doesNotContain: [.matchBitset, .consumeBy])
194
- // }
195
- //
196
- // func testScalarOptimizeCompilation() {
197
- // // all ascii quoted literal -> elide boundary checks
198
- // expectProgram(
199
- // for: "abcd",
200
- // contains: [.matchScalar, .matchScalarUnchecked],
201
- // doesNotContain: [.match, .matchSequence , .consumeBy])
202
- // // ascii character -> matchScalar with boundary check
203
- // expectProgram(
204
- // for: "a",
205
- // contains: [.matchScalar],
206
- // doesNotContain: [.match, .matchSequence , .consumeBy, .matchScalarUnchecked])
207
- // // quoted literal is not all ascii -> match scalar when possible, always do boundary checks
208
- // expectProgram(
209
- // for: "aaa\u{301}",
210
- // contains: [.match, .matchScalar],
211
- // doesNotContain: [.consumeBy, .matchScalarUnchecked])
212
- // // scalar mode -> always emit match scalar without boundary checks
213
- // expectProgram(
214
- // for: "abcd",
215
- // semanticLevel: .unicodeScalar,
216
- // contains: [.matchScalarUnchecked],
217
- // doesNotContain: [.match, .matchSequence , .consumeBy, .matchScalar])
218
- // expectProgram(
219
- // for: "a",
220
- // semanticLevel: .unicodeScalar,
221
- // contains: [.matchScalarUnchecked],
222
- // doesNotContain: [.match, .matchSequence , .consumeBy, .matchScalar])
223
- // expectProgram(
224
- // for: "aaa\u{301}",
225
- // semanticLevel: .unicodeScalar,
226
- // contains: [.matchScalarUnchecked],
227
- // doesNotContain: [.match, .matchSequence , .consumeBy, .matchScalar])
228
- // }
229
- //
230
- // func testCaseInsensitivityCompilation() {
231
- // // quoted literal is all ascii -> match scalar case insensitive and skip
232
- // // boundary checks
233
- // expectProgram(
234
- // for: "(?i)abcd",
235
- // contains: [.matchScalarCaseInsensitiveUnchecked, .matchScalarCaseInsensitive],
236
- // doesNotContain: [.match, .matchCaseInsensitive, .matchScalar, .matchScalarUnchecked])
237
- // // quoted literal is all non-cased ascii -> emit match scalar instructions
238
- // expectProgram(
239
- // for: "(?i)&&&&",
240
- // contains: [.matchScalar, .matchScalarUnchecked],
241
- // doesNotContain: [.match, .matchCaseInsensitive,
242
- // .matchScalarCaseInsensitive, .matchScalarCaseInsensitiveUnchecked])
243
- // // quoted literal is not all ascii -> match scalar case insensitive when
244
- // // possible, match character case insensitive when needed, always perform
245
- // // boundary check
246
- // expectProgram(
247
- // for: "(?i)abcd\u{301}",
248
- // contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive],
249
- // doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match, .matchScalar])
250
- // // same as before but contains ascii non cased characters -> emit matchScalar for them
251
- // expectProgram(
252
- // for: "(?i)abcd\u{301};.'!",
253
- // contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar],
254
- // doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match])
255
- // // contains non-ascii non-cased characters -> emit match
256
- // expectProgram(
257
- // for: "(?i)abcd\u{301};.'!💖",
258
- // contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar, .match],
259
- // doesNotContain: [.matchScalarCaseInsensitiveUnchecked])
260
- //
261
- // // scalar mode -> emit unchecked scalar match only, emit case insensitive
262
- // // only if the scalar is cased
263
- // expectProgram(
264
- // for: "(?i);.'!💖",
265
- // semanticLevel: .unicodeScalar,
266
- // contains: [.matchScalarUnchecked],
267
- // doesNotContain: [.matchScalarCaseInsensitiveUnchecked])
268
- // expectProgram(
269
- // for: "(?i)abcdé",
270
- // semanticLevel: .unicodeScalar,
271
- // contains: [.matchScalarCaseInsensitiveUnchecked],
272
- // doesNotContain: [.matchScalarUnchecked])
273
- // }
304
+ func testBitsetCompile( ) {
305
+ expectProgram (
306
+ for: " [abc] " ,
307
+ contains: [ . matchBitset] ,
308
+ doesNotContain: [ . consumeBy, . matchBitsetScalar] )
309
+ expectProgram (
310
+ for: " [abc] " ,
311
+ semanticLevel: . unicodeScalar,
312
+ contains: [ . matchBitsetScalar] ,
313
+ doesNotContain: [ . matchBitset, . consumeBy] )
314
+ }
315
+
316
+ func testScalarOptimizeCompilation( ) {
317
+ // all ascii quoted literal -> elide boundary checks
318
+ expectProgram (
319
+ for: " abcd " ,
320
+ contains: [ . matchScalar, . matchScalarUnchecked] ,
321
+ doesNotContain: [ . match, . consumeBy] )
322
+ // ascii character -> matchScalar with boundary check
323
+ expectProgram (
324
+ for: " a " ,
325
+ contains: [ . matchScalar] ,
326
+ doesNotContain: [ . match, . consumeBy, . matchScalarUnchecked] )
327
+ // quoted literal is not all ascii -> match scalar when possible, always do boundary checks
328
+ expectProgram (
329
+ for: " aaa \u{301} " ,
330
+ contains: [ . match, . matchScalar] ,
331
+ doesNotContain: [ . consumeBy, . matchScalarUnchecked] )
332
+ // scalar mode -> always emit match scalar without boundary checks
333
+ expectProgram (
334
+ for: " abcd " ,
335
+ semanticLevel: . unicodeScalar,
336
+ contains: [ . matchScalarUnchecked] ,
337
+ doesNotContain: [ . match, . consumeBy, . matchScalar] )
338
+ expectProgram (
339
+ for: " a " ,
340
+ semanticLevel: . unicodeScalar,
341
+ contains: [ . matchScalarUnchecked] ,
342
+ doesNotContain: [ . match, . consumeBy, . matchScalar] )
343
+ expectProgram (
344
+ for: " aaa \u{301} " ,
345
+ semanticLevel: . unicodeScalar,
346
+ contains: [ . matchScalarUnchecked] ,
347
+ doesNotContain: [ . match, . consumeBy, . matchScalar] )
348
+ }
349
+
350
+ func testCaseInsensitivityCompilation( ) {
351
+ // quoted literal is all ascii -> match scalar case insensitive and skip
352
+ // boundary checks
353
+ expectProgram (
354
+ for: " (?i)abcd " ,
355
+ contains: [ . matchScalarCaseInsensitiveUnchecked, . matchScalarCaseInsensitive] ,
356
+ doesNotContain: [ . match, . matchCaseInsensitive, . matchScalar, . matchScalarUnchecked] )
357
+ // quoted literal is all non-cased ascii -> emit match scalar instructions
358
+ expectProgram (
359
+ for: " (?i)&&&& " ,
360
+ contains: [ . matchScalar, . matchScalarUnchecked] ,
361
+ doesNotContain: [ . match, . matchCaseInsensitive,
362
+ . matchScalarCaseInsensitive, . matchScalarCaseInsensitiveUnchecked] )
363
+ // quoted literal is not all ascii -> match scalar case insensitive when
364
+ // possible, match character case insensitive when needed, always perform
365
+ // boundary check
366
+ expectProgram (
367
+ for: " (?i)abcd \u{301} " ,
368
+ contains: [ . matchCaseInsensitive, . matchScalarCaseInsensitive] ,
369
+ doesNotContain: [ . matchScalarCaseInsensitiveUnchecked, . match, . matchScalar] )
370
+ // same as before but contains ascii non cased characters -> emit matchScalar for them
371
+ expectProgram (
372
+ for: " (?i)abcd \u{301} ;.'! " ,
373
+ contains: [ . matchCaseInsensitive, . matchScalarCaseInsensitive, . matchScalar] ,
374
+ doesNotContain: [ . matchScalarCaseInsensitiveUnchecked, . match] )
375
+ // contains non-ascii non-cased characters -> emit match
376
+ expectProgram (
377
+ for: " (?i)abcd \u{301} ;.'!💖 " ,
378
+ contains: [ . matchCaseInsensitive, . matchScalarCaseInsensitive, . matchScalar, . match] ,
379
+ doesNotContain: [ . matchScalarCaseInsensitiveUnchecked] )
380
+
381
+ // scalar mode -> emit unchecked scalar match only, emit case insensitive
382
+ // only if the scalar is cased
383
+ expectProgram (
384
+ for: " (?i);.'!💖 " ,
385
+ semanticLevel: . unicodeScalar,
386
+ contains: [ . matchScalarUnchecked] ,
387
+ doesNotContain: [ . matchScalarCaseInsensitiveUnchecked] )
388
+ expectProgram (
389
+ for: " (?i)abcdé " ,
390
+ semanticLevel: . unicodeScalar,
391
+ contains: [ . matchScalarCaseInsensitiveUnchecked] ,
392
+ doesNotContain: [ . matchScalarUnchecked] )
393
+ }
274
394
}
0 commit comments