Skip to content

Commit eeb38e9

Browse files
committed
First ver
1 parent 8688296 commit eeb38e9

File tree

5 files changed

+95
-8
lines changed

5 files changed

+95
-8
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -208,11 +208,20 @@ fileprivate extension Compiler.ByteCodeGen {
208208
builder.buildConsume(by: consumeScalar {
209209
$0.properties.lowercaseMapping == s.properties.lowercaseMapping
210210
})
211-
} else {
212-
builder.buildConsume(by: consumeScalar {
213-
$0 == s
214-
})
211+
return
215212
}
213+
214+
if s.value < 0x300 {
215+
// lily todo: make sure this is correct + add compiler option check after it's merged in
216+
// we unconditionally match against the scalar using consumeScalar in the else case
217+
// so maybe this check is unnecessary??
218+
builder.buildMatchScalar(s, boundaryCheck: false)
219+
return
220+
}
221+
222+
builder.buildConsume(by: consumeScalar {
223+
$0 == s
224+
})
216225
}
217226

218227
mutating func emitCharacter(_ c: Character) throws {
@@ -233,9 +242,27 @@ fileprivate extension Compiler.ByteCodeGen {
233242
? input.index(after: bounds.lowerBound)
234243
: nil
235244
}
236-
} else {
237-
builder.buildMatch(c)
238245
}
246+
247+
// if c.unicodeScalars.count == 1,
248+
// let first = c.unicodeScalars.first,
249+
// first.value < 0x300 { // lily todo: check this more carefully
250+
// if we have a single scalar then this must not be an extended grapheme cluster
251+
// so it must be a character that can be exactly matched by its first scalar
252+
// cr-lf has two scalars right? yes it has two
253+
254+
// I think one of these two checks is redundant; I think we only need the second?
255+
// ask alex?
256+
257+
// we can only match against characters that have a single canonical equivalence
258+
// so I think that rules out any latin in here, so just use ascii for now
259+
// we also need to exclude our good non-single-scalar-ascii friend cr-lf
260+
if c.isASCII && c != "\r\n" {
261+
builder.buildMatchScalar(c.unicodeScalars.first!, boundaryCheck: true)
262+
return
263+
}
264+
265+
builder.buildMatch(c)
239266
}
240267

241268
mutating func emitAny() {
@@ -732,7 +759,22 @@ fileprivate extension Compiler.ByteCodeGen {
732759
return currentIndex
733760
}
734761
} else {
735-
builder.buildMatchSequence(s)
762+
// if we have any extended latin in our characters then we have to
763+
// respect canonical equivalence, so we cannot match against scalars exactly
764+
// so match against all single scalar ascii
765+
766+
// lily todo: which strings are nfc invariant and matchable by direct scalar comparison?
767+
// alternatively: loop over characters in s and emit either matchScalar or matchCharacter depending on if it is NFC invariant
768+
// getting rid of matchSeq entirely does also get rid of the weird ARC
769+
if s.allSatisfy({c in c.isASCII && c != "\r\n"}) {
770+
for scalar in s.unicodeScalars.dropLast(1) {
771+
builder.buildMatchScalar(scalar, boundaryCheck: false)
772+
}
773+
// check that we are on a boundary at the end and there isn't a combining character after this scalar
774+
builder.buildMatchScalar(s.unicodeScalars.last!, boundaryCheck: true)
775+
} else {
776+
builder.buildMatchSequence(s)
777+
}
736778
}
737779
} else {
738780
builder.buildConsume {

Sources/_StringProcessing/Engine/InstPayload.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,13 @@ extension Instruction.Payload {
146146
var string: StringRegister {
147147
interpret()
148148
}
149+
150+
init(scalar: Unicode.Scalar) {
151+
self.init(UInt64(scalar.value))
152+
}
153+
var scalar: Unicode.Scalar {
154+
return Unicode.Scalar(_value: UInt32(self.rawValue))
155+
}
149156

150157
init(sequence: SequenceRegister) {
151158
self.init(sequence)

Sources/_StringProcessing/Engine/Instruction.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ extension Instruction {
8383
///
8484
/// Operand: Sequence register to compare against.
8585
case matchSequence
86+
87+
case matchScalar
88+
case matchScalarUnchecked
8689

8790
/// TODO: builtin assertions and anchors
8891
case builtinAssertion

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,14 @@ extension MEProgram.Builder {
146146
.matchSequence,
147147
.init(sequence: sequences.store(.init(s)))))
148148
}
149+
150+
mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) {
151+
if boundaryCheck {
152+
instructions.append(.init(.matchScalar, .init(scalar: s)))
153+
} else {
154+
instructions.append(.init(.matchScalarUnchecked, .init(scalar: s)))
155+
}
156+
}
149157

150158
mutating func buildConsume(
151159
by p: @escaping MEProgram.ConsumeFunction

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,23 @@ extension Processor {
226226
}
227227
return true
228228
}
229+
230+
func loadScalar() -> Unicode.Scalar? {
231+
currentPosition < end ? input.unicodeScalars[currentPosition] : nil
232+
}
233+
234+
mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool {
235+
guard let curScalar = loadScalar(),
236+
curScalar == s,
237+
let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end),
238+
(!boundaryCheck || input.isOnGraphemeClusterBoundary(idx))
239+
else {
240+
signalFailure()
241+
return false
242+
}
243+
currentPosition = idx
244+
return true
245+
}
229246

230247
mutating func signalFailure() {
231248
guard let (pc, pos, stackEnd, capEnds, intRegisters) =
@@ -363,7 +380,17 @@ extension Processor {
363380
if matchSeq(seq) {
364381
controller.step()
365382
}
366-
383+
384+
case .matchScalar:
385+
let scalar = payload.scalar
386+
if matchScalar(scalar, boundaryCheck: true) {
387+
controller.step()
388+
}
389+
case .matchScalarUnchecked:
390+
let scalar = payload.scalar
391+
if matchScalar(scalar, boundaryCheck: false) {
392+
controller.step()
393+
}
367394
case .consumeBy:
368395
let reg = payload.consumer
369396
guard currentPosition < searchBounds.upperBound,

0 commit comments

Comments
 (0)