Skip to content

Commit 9d18a26

Browse files
natecook1000 authored and hamishknight committed
Improve Unicode/UTS18 and semantic level support (swiftlang#268)
* Add tests for UTS18 level support (incomplete)
* Implement canonical equivalence tests
* Fix canonical equivalence at different levels
* Test named chars x semantic level
* Enable loose matching on \N{...} scalar names
* Make Unicode property classes work with semantics
1 parent 535cd83 commit 9d18a26

File tree

8 files changed

+933
-197
lines changed

8 files changed

+933
-197
lines changed

Package.swift

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -67,7 +67,7 @@ let package = Package(
6767
name: "RegexTests",
6868
dependencies: ["_StringProcessing"],
6969
swiftSettings: [
70-
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"])
70+
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
7171
]),
7272
.testTarget(
7373
name: "RegexBuilderTests",

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 43 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -168,7 +168,15 @@ extension Compiler.ByteCodeGen {
168168
}
169169

170170
mutating func emitCharacter(_ c: Character) throws {
171-
// FIXME: Does semantic level matter?
171+
// Unicode scalar matches the specific scalars that comprise a character
172+
if options.semanticLevel == .unicodeScalar {
173+
print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars")
174+
for scalar in c.unicodeScalars {
175+
try emitScalar(scalar)
176+
}
177+
return
178+
}
179+
172180
if options.isCaseInsensitive && c.isCased {
173181
// TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
174182
builder.buildConsume { input, bounds in
@@ -625,22 +633,44 @@ extension Compiler.ByteCodeGen {
625633
try emitAtom(a)
626634

627635
case let .quotedLiteral(s):
628-
// TODO: Should this incorporate options?
629-
if options.isCaseInsensitive {
630-
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
631-
builder.buildConsume { input, bounds in
632-
var iterator = s.makeIterator()
636+
if options.semanticLevel == .graphemeCluster {
637+
if options.isCaseInsensitive {
638+
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
639+
builder.buildConsume { input, bounds in
640+
var iterator = s.makeIterator()
641+
var currentIndex = bounds.lowerBound
642+
while let ch = iterator.next() {
643+
guard currentIndex < bounds.upperBound,
644+
ch.lowercased() == input[currentIndex].lowercased()
645+
else { return nil }
646+
input.formIndex(after: &currentIndex)
647+
}
648+
return currentIndex
649+
}
650+
} else {
651+
builder.buildMatchSequence(s)
652+
}
653+
} else {
654+
builder.buildConsume {
655+
[caseInsensitive = options.isCaseInsensitive] input, bounds in
656+
// TODO: Case folding
657+
var iterator = s.unicodeScalars.makeIterator()
633658
var currentIndex = bounds.lowerBound
634-
while let ch = iterator.next() {
635-
guard currentIndex < bounds.upperBound,
636-
ch.lowercased() == input[currentIndex].lowercased()
637-
else { return nil }
638-
input.formIndex(after: &currentIndex)
659+
while let scalar = iterator.next() {
660+
guard currentIndex < bounds.upperBound else { return nil }
661+
if caseInsensitive {
662+
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
663+
return nil
664+
}
665+
} else {
666+
if scalar != input.unicodeScalars[currentIndex] {
667+
return nil
668+
}
669+
}
670+
input.unicodeScalars.formIndex(after: &currentIndex)
639671
}
640672
return currentIndex
641673
}
642-
} else {
643-
builder.buildMatchSequence(s)
644674
}
645675

646676
case let .regexLiteral(l):

0 commit comments

Comments
 (0)