Skip to content

Commit 4cf39c3

Browse files
rctcwyvrn authored and hamishknight committed
Optimize matching to match on scalar values when possible (swiftlang#525)
- Adds new instructions for matching characters and scalars case-insensitively
- Compiles ASCII character matches into the faster scalar match instructions, even in grapheme semantic mode
- Optimizes out unnecessary runtime grapheme boundary checks for all-ASCII strings
- Also includes fixes to scalar matching in grapheme semantic mode (swiftlang#565)
1 parent 9efa856 commit 4cf39c3

14 files changed

+777
-287
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 68 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -65,10 +65,14 @@ fileprivate extension Compiler.ByteCodeGen {
6565
emitDot()
6666

6767
case let .char(c):
68-
try emitCharacter(c)
68+
emitCharacter(c)
6969

7070
case let .scalar(s):
71-
try emitScalar(s)
71+
if options.semanticLevel == .graphemeCluster {
72+
emitCharacter(Character(s))
73+
} else {
74+
emitMatchScalar(s)
75+
}
7276

7377
case let .assertion(kind):
7478
try emitAssertion(kind)
@@ -94,6 +98,34 @@ fileprivate extension Compiler.ByteCodeGen {
9498
}
9599
}
96100

101+
mutating func emitQuotedLiteral(_ s: String) {
102+
guard options.semanticLevel == .graphemeCluster else {
103+
for char in s {
104+
for scalar in char.unicodeScalars {
105+
emitMatchScalar(scalar)
106+
}
107+
}
108+
return
109+
}
110+
111+
// Fast path for eliding boundary checks for an all ascii quoted literal
112+
if optimizationsEnabled && s.allSatisfy(\.isASCII) {
113+
let lastIdx = s.unicodeScalars.indices.last!
114+
for idx in s.unicodeScalars.indices {
115+
let boundaryCheck = idx == lastIdx
116+
let scalar = s.unicodeScalars[idx]
117+
if options.isCaseInsensitive && scalar.properties.isCased {
118+
builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck)
119+
} else {
120+
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
121+
}
122+
}
123+
return
124+
}
125+
126+
for c in s { emitCharacter(c) }
127+
}
128+
97129
mutating func emitBackreference(
98130
_ ref: AST.Reference
99131
) throws {
@@ -257,41 +289,47 @@ fileprivate extension Compiler.ByteCodeGen {
257289
}
258290
}
259291

260-
mutating func emitScalar(_ s: UnicodeScalar) throws {
261-
// TODO: Native instruction buildMatchScalar(s)
262-
if options.isCaseInsensitive {
263-
// TODO: e.g. buildCaseInsensitiveMatchScalar(s)
264-
builder.buildConsume(by: consumeScalar {
265-
$0.properties.lowercaseMapping == s.properties.lowercaseMapping
266-
})
292+
mutating func emitMatchScalar(_ s: UnicodeScalar) {
293+
assert(options.semanticLevel == .unicodeScalar)
294+
if options.isCaseInsensitive && s.properties.isCased {
295+
builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false)
267296
} else {
268-
builder.buildConsume(by: consumeScalar {
269-
$0 == s
270-
})
297+
builder.buildMatchScalar(s, boundaryCheck: false)
271298
}
272299
}
273300

274-
mutating func emitCharacter(_ c: Character) throws {
275-
// Unicode scalar matches the specific scalars that comprise a character
301+
mutating func emitCharacter(_ c: Character) {
302+
// Unicode scalar mode matches the specific scalars that comprise a character
276303
if options.semanticLevel == .unicodeScalar {
277304
for scalar in c.unicodeScalars {
278-
try emitScalar(scalar)
305+
emitMatchScalar(scalar)
279306
}
280307
return
281308
}
282309

283310
if options.isCaseInsensitive && c.isCased {
284-
// TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
285-
builder.buildConsume { input, bounds in
286-
let inputChar = input[bounds.lowerBound].lowercased()
287-
let matchChar = c.lowercased()
288-
return inputChar == matchChar
289-
? input.index(after: bounds.lowerBound)
290-
: nil
311+
if optimizationsEnabled && c.isASCII {
312+
// c.isCased ensures that c is not CR-LF,
313+
// so we know that c is a single scalar
314+
assert(c.unicodeScalars.count == 1)
315+
builder.buildMatchScalarCaseInsensitive(
316+
c.unicodeScalars.last!,
317+
boundaryCheck: true)
318+
} else {
319+
builder.buildMatch(c, isCaseInsensitive: true)
291320
}
292-
} else {
293-
builder.buildMatch(c)
321+
return
294322
}
323+
324+
if optimizationsEnabled && c.isASCII {
325+
let lastIdx = c.unicodeScalars.indices.last!
326+
for idx in c.unicodeScalars.indices {
327+
builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx)
328+
}
329+
return
330+
}
331+
332+
builder.buildMatch(c, isCaseInsensitive: false)
295333
}
296334

297335
mutating func emitAny() {
@@ -741,11 +779,12 @@ fileprivate extension Compiler.ByteCodeGen {
741779
_ ccc: DSLTree.CustomCharacterClass
742780
) throws {
743781
if let asciiBitset = ccc.asAsciiBitset(options),
744-
options.semanticLevel == .graphemeCluster,
745782
optimizationsEnabled {
746-
// future work: add a bit to .matchBitset to consume either a character
747-
// or a scalar so we can have this optimization in scalar mode
748-
builder.buildMatchAsciiBitset(asciiBitset)
783+
if options.semanticLevel == .unicodeScalar {
784+
builder.buildScalarMatchAsciiBitset(asciiBitset)
785+
} else {
786+
builder.buildMatchAsciiBitset(asciiBitset)
787+
}
749788
} else {
750789
let consumer = try ccc.generateConsumer(options)
751790
builder.buildConsume(by: consumer)
@@ -822,45 +861,7 @@ fileprivate extension Compiler.ByteCodeGen {
822861
try emitAtom(a)
823862

824863
case let .quotedLiteral(s):
825-
if options.semanticLevel == .graphemeCluster {
826-
if options.isCaseInsensitive {
827-
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
828-
builder.buildConsume { input, bounds in
829-
var iterator = s.makeIterator()
830-
var currentIndex = bounds.lowerBound
831-
while let ch = iterator.next() {
832-
guard currentIndex < bounds.upperBound,
833-
ch.lowercased() == input[currentIndex].lowercased()
834-
else { return nil }
835-
input.formIndex(after: &currentIndex)
836-
}
837-
return currentIndex
838-
}
839-
} else {
840-
builder.buildMatchSequence(s)
841-
}
842-
} else {
843-
builder.buildConsume {
844-
[caseInsensitive = options.isCaseInsensitive] input, bounds in
845-
// TODO: Case folding
846-
var iterator = s.unicodeScalars.makeIterator()
847-
var currentIndex = bounds.lowerBound
848-
while let scalar = iterator.next() {
849-
guard currentIndex < bounds.upperBound else { return nil }
850-
if caseInsensitive {
851-
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
852-
return nil
853-
}
854-
} else {
855-
if scalar != input.unicodeScalars[currentIndex] {
856-
return nil
857-
}
858-
}
859-
input.unicodeScalars.formIndex(after: &currentIndex)
860-
}
861-
return currentIndex
862-
}
863-
}
864+
emitQuotedLiteral(s)
864865

865866
case let .convertedRegexLiteral(n, _):
866867
return try emitNode(n)

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 62 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,13 @@
1111

1212
@_implementationOnly import _RegexParser
1313

14+
extension Character {
15+
var _singleScalarAsciiValue: UInt8? {
16+
guard self != "\r\n" else { return nil }
17+
return asciiValue
18+
}
19+
}
20+
1421
extension DSLTree.Node {
1522
/// Attempt to generate a consumer from this AST node
1623
///
@@ -53,11 +60,50 @@ extension DSLTree._AST.Atom {
5360
}
5461
}
5562

63+
extension Character {
64+
func generateConsumer(
65+
_ opts: MatchingOptions
66+
) throws -> MEProgram.ConsumeFunction? {
67+
let isCaseInsensitive = opts.isCaseInsensitive
68+
switch opts.semanticLevel {
69+
case .graphemeCluster:
70+
return { input, bounds in
71+
let low = bounds.lowerBound
72+
if isCaseInsensitive && isCased {
73+
return input[low].lowercased() == lowercased()
74+
? input.index(after: low)
75+
: nil
76+
} else {
77+
return input[low] == self
78+
? input.index(after: low)
79+
: nil
80+
}
81+
}
82+
case .unicodeScalar:
83+
// TODO: This should only be reachable from character class emission, can
84+
// we guarantee that? Otherwise we'd want a different matching behavior.
85+
let consumers = unicodeScalars.map { s in consumeScalar {
86+
isCaseInsensitive
87+
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
88+
: $0 == s
89+
}}
90+
return { input, bounds in
91+
for fn in consumers {
92+
if let idx = fn(input, bounds) {
93+
return idx
94+
}
95+
}
96+
return nil
97+
}
98+
}
99+
}
100+
}
101+
56102
extension DSLTree.Atom {
57103
var singleScalarASCIIValue: UInt8? {
58104
switch self {
59-
case let .char(c) where c != "\r\n":
60-
return c.asciiValue
105+
case let .char(c):
106+
return c._singleScalarAsciiValue
61107
case let .scalar(s) where s.isASCII:
62108
return UInt8(ascii: s)
63109
case let .unconverted(atom):
@@ -72,44 +118,15 @@ extension DSLTree.Atom {
72118
func generateConsumer(
73119
_ opts: MatchingOptions
74120
) throws -> MEProgram.ConsumeFunction? {
75-
let isCaseInsensitive = opts.isCaseInsensitive
76-
77121
switch self {
78122
case let .char(c):
79-
if opts.semanticLevel == .graphemeCluster {
80-
return { input, bounds in
81-
let low = bounds.lowerBound
82-
if isCaseInsensitive && c.isCased {
83-
return input[low].lowercased() == c.lowercased()
84-
? input.index(after: low)
85-
: nil
86-
} else {
87-
return input[low] == c
88-
? input.index(after: low)
89-
: nil
90-
}
91-
}
92-
} else {
93-
let consumers = c.unicodeScalars.map { s in consumeScalar {
94-
isCaseInsensitive
95-
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
96-
: $0 == s
97-
}}
98-
return { input, bounds in
99-
for fn in consumers {
100-
if let idx = fn(input, bounds) {
101-
return idx
102-
}
103-
}
104-
return nil
105-
}
106-
}
123+
return try c.generateConsumer(opts)
124+
107125
case let .scalar(s):
108-
return consumeScalar {
109-
isCaseInsensitive
110-
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
111-
: $0 == s
112-
}
126+
// A scalar always matches the same as a single scalar character. This
127+
// means it must match a whole grapheme in grapheme semantic mode, but
128+
// can match a single scalar in scalar semantic mode.
129+
return try Character(s).generateConsumer(opts)
113130

114131
case .any:
115132
// FIXME: Should this be a total ordering?
@@ -230,16 +247,20 @@ extension AST.Atom {
230247
var singleScalar: UnicodeScalar? {
231248
switch kind {
232249
case .scalar(let s): return s.value
250+
case .escaped(let e):
251+
guard let s = e.scalarValue else { return nil }
252+
return s
233253
default: return nil
234254
}
235255
}
236256

237257
var singleScalarASCIIValue: UInt8? {
258+
if let s = singleScalar, s.isASCII {
259+
return UInt8(ascii: s)
260+
}
238261
switch kind {
239-
case let .char(c) where c != "\r\n":
240-
return c.asciiValue
241-
case let .scalar(s) where s.value.isASCII:
242-
return UInt8(ascii: s.value)
262+
case let .char(c):
263+
return c._singleScalarAsciiValue
243264
default:
244265
return nil
245266
}

Sources/_StringProcessing/Engine/InstPayload.swift

Lines changed: 30 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -147,6 +147,26 @@ extension Instruction.Payload {
147147
var string: StringRegister {
148148
interpret()
149149
}
150+
151+
init(scalar: Unicode.Scalar) {
152+
self.init(UInt64(scalar.value))
153+
}
154+
var scalar: Unicode.Scalar {
155+
return Unicode.Scalar(_value: UInt32(self.rawValue))
156+
}
157+
158+
init(scalar: Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) {
159+
let raw = UInt64(scalar.value)
160+
+ (caseInsensitive ? 1 << 55: 0)
161+
+ (boundaryCheck ? 1 << 54 : 0)
162+
self.init(raw)
163+
}
164+
var scalarPayload: (Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) {
165+
let caseInsensitive = (self.rawValue >> 55) & 1 == 1
166+
let boundaryCheck = (self.rawValue >> 54) & 1 == 1
167+
let scalar = Unicode.Scalar(_value: UInt32(self.rawValue & 0xFFFF_FFFF))
168+
return (scalar, caseInsensitive: caseInsensitive, boundaryCheck: boundaryCheck)
169+
}
150170

151171
init(sequence: SequenceRegister) {
152172
self.init(sequence)
@@ -190,18 +210,20 @@ extension Instruction.Payload {
190210
interpret()
191211
}
192212

193-
init(element: ElementRegister) {
194-
self.init(element)
213+
init(element: ElementRegister, isCaseInsensitive: Bool) {
214+
self.init(isCaseInsensitive ? 1 : 0, element)
195215
}
196-
var element: ElementRegister {
197-
interpret()
216+
var elementPayload: (isCaseInsensitive: Bool, ElementRegister) {
217+
let pair: (UInt64, ElementRegister) = interpretPair()
218+
return (isCaseInsensitive: pair.0 == 1, pair.1)
198219
}
199220

200-
init(bitset: AsciiBitsetRegister) {
201-
self.init(bitset)
221+
init(bitset: AsciiBitsetRegister, isScalar: Bool) {
222+
self.init(isScalar ? 1 : 0, bitset)
202223
}
203-
var bitset: AsciiBitsetRegister {
204-
interpret()
224+
var bitsetPayload: (isScalar: Bool, AsciiBitsetRegister) {
225+
let pair: (UInt64, AsciiBitsetRegister) = interpretPair()
226+
return (isScalar: pair.0 == 1, pair.1)
205227
}
206228

207229
init(consumer: ConsumeFunctionRegister) {

0 commit comments

Comments
 (0)