Skip to content

Commit 4a18187

Browse files
committed
Coalesce character class members
In grapheme semantic mode, coalesce adjacent character and scalar members of a custom character class, over which we can perform grapheme breaking. This involves potentially re-writing ranges such that they contain a complete grapheme of adjacent scalars.
1 parent dd21a39 commit 4a18187

File tree

4 files changed

+276
-4
lines changed

4 files changed

+276
-4
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,9 +775,90 @@ fileprivate extension Compiler.ByteCodeGen {
775775
builder.label(exit)
776776
}
777777

778+
/// Returns a custom character class in which runs of adjacent literal members
/// of `ccc` have been coalesced, so that grapheme breaking is performed over
/// the combined text rather than over each member separately.
///
/// Members that can be coalesced are atoms with a literal character value,
/// quoted literals, and ranges whose bounds both have literal character
/// values. Any other member ends the current run and is kept as-is.
///
/// - Parameter ccc: The custom character class to process.
/// - Returns: `ccc` unchanged when not matching with grapheme cluster
///   semantics; otherwise a class with the coalesced members and the same
///   `isInverted` value.
func coalescingCustomCharacterClassMembers(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // This only needs to be done in grapheme semantic mode. In scalar semantic
  // mode, we don't want to coalesce any scalars into a grapheme. This
  // means that e.g. `[e\u{301}-\u{302}]` remains a range between U+301 and
  // U+302.
  guard options.semanticLevel == .graphemeCluster else { return ccc }

  struct Accumulator {
    /// A series of range operands. For example, in `[ab-cde-fg]`, this will
    /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
    /// ranges will be created.
    private var rangeOperands: [String] = [""]

    /// The current range operand, i.e. the last element of `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Try to accumulate a character class member, returning `true` if
    /// successful, `false` otherwise.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        current.append(c)
        return true
      case .quotedLiteral(let str):
        current += str
        return true
      case let .range(lhs, rhs):
        guard let lhs = lhs.literalCharacterValue,
              let rhs = rhs.literalCharacterValue
        else { return false }
        // The lower bound ends the current operand; the upper bound starts a
        // new one, so later members can still combine with it.
        current.append(lhs)
        rangeOperands.append(String(rhs))
        return true
      default:
        return false
      }
    }

    /// Produce the coalesced members for everything accumulated so far.
    ///
    /// With a single operand the result is one quoted literal. With N
    /// operands there are N - 1 ranges: each range runs from the last
    /// character of an operand to the first character of the next one, and
    /// the remaining characters of each operand are emitted exactly once as
    /// quoted literals. For example, `[ab-cde-fg]` yields
    /// `"a"`, `b-c`, `"d"`, `e-f`, `"g"`.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      if rangeOperands.count == 1 {
        // If we didn't have any additional range operands, this isn't a
        // range, we can just form a standard quoted literal.
        return [.quotedLiteral(current)]
      }
      var members = [DSLTree.CustomCharacterClass.Member]()

      // We have other range operands, splice them together.
      for (i, lhs) in rangeOperands.dropLast().enumerated() {
        let rhs = rangeOperands[i + 1]

        // The first operand is only the lower side of a range, so only its
        // last character is a bound. Every later operand visited here is both
        // the upper side of the previous range and the lower side of this
        // one, so both of its ends are bounds and must be trimmed. Trimming
        // both avoids emitting its interior characters (and its bounds) a
        // second time.
        let standalone = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
        if !standalone.isEmpty {
          members.append(.quotedLiteral(String(standalone)))
        }

        // Whenever there is more than one operand, every operand is
        // non-empty: `tryAccumulate` appends the lower bound to the current
        // operand and seeds the next operand with the upper bound.
        members.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }

      // Everything has been handled except the characters following the upper
      // bound in the final operand.
      let trailing = current.dropFirst()
      if !trailing.isEmpty {
        members.append(.quotedLiteral(String(trailing)))
      }
      return members
    }
  }

  let members = ccc.members
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
  return .init(members: members, isInverted: ccc.isInverted)
}
853+
778854
mutating func emitCustomCharacterClass(
779855
_ ccc: DSLTree.CustomCharacterClass
780856
) throws {
857+
// Before emitting a custom character class in grapheme semantic mode, we
858+
// need to coalesce together any adjacent characters and scalars, over which
859+
// we can perform grapheme breaking. This includes e.g. range bounds for
860+
// `[e\u{301}-\u{302}]`.
861+
let ccc = coalescingCustomCharacterClassMembers(ccc)
781862
if let asciiBitset = ccc.asAsciiBitset(options),
782863
optimizationsEnabled {
783864
if options.semanticLevel == .unicodeScalar {

Sources/_StringProcessing/Utility/Misc.swift

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111

1212
extension Array {
1313
/// Coalesce adjacent elements using a given accumulator. The accumulator is
14-
/// transformed into an element of the array by `finish`. The `accumulate`
14+
/// transformed into elements of the array by `finish`. The `accumulate`
1515
/// function should return `true` if the accumulator has coalesced the
1616
/// element, `false` otherwise.
1717
func coalescing<T>(
18-
with initialAccumulator: T, into finish: (T) -> Element,
18+
with initialAccumulator: T, into finish: (T) -> Self,
1919
accumulate: (inout T, Element) -> Bool
2020
) -> Self {
2121
var didAccumulate = false
@@ -32,16 +32,28 @@ extension Array {
3232
if didAccumulate {
3333
// We have a leftover accumulator, which needs to be finished before we
3434
// can append the next element.
35-
result.append(finish(accumulator))
35+
result += finish(accumulator)
3636
accumulator = initialAccumulator
3737
didAccumulate = false
3838
}
3939
result.append(elt)
4040
}
4141
// Handle a leftover accumulation.
4242
if didAccumulate {
43-
result.append(finish(accumulator))
43+
result += finish(accumulator)
4444
}
4545
return result
4646
}
47+
48+
/// Coalesce adjacent elements using a given accumulator, where each finished
/// accumulator becomes exactly one element of the result.
///
/// This is a convenience over the overload whose `finish` returns an array:
/// the single element produced by `finish` is wrapped in a one-element array
/// and the call is forwarded. The `accumulate` function should return `true`
/// if the accumulator has coalesced the element, `false` otherwise.
func coalescing<T>(
  with initialAccumulator: T, into finish: (T) -> Element,
  accumulate: (inout T, Element) -> Bool
) -> Self {
  return coalescing(
    with: initialAccumulator,
    into: { finished in [finish(finished)] },
    accumulate: accumulate
  )
}
4759
}

Tests/RegexTests/MatchTests.swift

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,183 @@ extension RegexTests {
765765
semanticLevel: .unicodeScalar
766766
)
767767

768+
// Scalar coalescing.
769+
firstMatchTests(
770+
#"[e\u{301}]"#,
771+
(eDecomposed, eDecomposed),
772+
(eComposed, eComposed),
773+
("e", nil),
774+
("\u{301}", nil)
775+
)
776+
firstMatchTests(
777+
#"[e\u{301}]"#,
778+
(eDecomposed, "e"),
779+
(eComposed, nil),
780+
("e", "e"),
781+
("\u{301}", "\u{301}"),
782+
semanticLevel: .unicodeScalar
783+
)
784+
firstMatchTests(
785+
#"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#,
786+
("👨", nil),
787+
("👩", nil),
788+
("👧", nil),
789+
("👦", nil),
790+
("\u{200D}", nil),
791+
("👨‍👩‍👧‍👦", "👨‍👩‍👧‍👦")
792+
)
793+
firstMatchTests(
794+
#"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#,
795+
("👨", "👨"),
796+
("👩", "👩"),
797+
("👧", "👧"),
798+
("👦", "👦"),
799+
("\u{200D}", "\u{200D}"),
800+
("👨‍👩‍👧‍👦", "👨"),
801+
semanticLevel: .unicodeScalar
802+
)
803+
firstMatchTests(
804+
#"[e\u{315}\u{301}\u{35C}]"#,
805+
("e", nil),
806+
("e\u{315}", nil),
807+
("e\u{301}", nil),
808+
("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"),
809+
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
810+
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
811+
)
812+
813+
firstMatchTests(
814+
#"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#,
815+
("a", "a"),
816+
("a\u{301}", "a\u{301}"),
817+
("\u{E1}", "\u{E1}"),
818+
("\u{E2}", nil),
819+
("z", "z"),
820+
("e", "e"),
821+
(eDecomposed, eDecomposed),
822+
(eComposed, eComposed),
823+
("\u{302}", "\u{302}"),
824+
("1", "1"),
825+
("2", nil),
826+
("3", "3"),
827+
("4", "4"),
828+
("5", "5"),
829+
("6", nil),
830+
("7", nil),
831+
("8", nil),
832+
("9", "9")
833+
)
834+
835+
// These can't compile in grapheme semantic mode, but make sure they work in
836+
// scalar semantic mode.
837+
firstMatchTests(
838+
#"[a\u{315}\u{301}-\u{302}]"#,
839+
("a", "a"),
840+
("\u{315}", "\u{315}"),
841+
("\u{301}", "\u{301}"),
842+
("\u{302}", "\u{302}"),
843+
("\u{303}", nil),
844+
semanticLevel: .unicodeScalar
845+
)
846+
firstMatchTests(
847+
#"[\u{73}\u{323}\u{307}-\u{1E00}]"#,
848+
("\u{73}", "\u{73}"),
849+
("\u{323}", "\u{323}"),
850+
("\u{307}", "\u{307}"),
851+
("\u{400}", "\u{400}"),
852+
("\u{500}", "\u{500}"),
853+
("\u{1E00}", "\u{1E00}"),
854+
("\u{1E01}", nil),
855+
("\u{1E69}", nil),
856+
semanticLevel: .unicodeScalar
857+
)
858+
firstMatchTests(
859+
#"[a\u{302}-✅]"#,
860+
("a", "a"),
861+
("\u{302}", "\u{302}"),
862+
("A\u{302}", "\u{302}"),
863+
("E\u{301}", nil),
864+
("a\u{301}", "a"),
865+
("\u{E1}", nil),
866+
("a\u{302}", "a"),
867+
("\u{E2}", nil),
868+
("\u{E3}", nil),
869+
("\u{EF}", nil),
870+
("e\u{301}", nil),
871+
("e\u{302}", "\u{302}"),
872+
("\u{2705}", "\u{2705}"),
873+
("", ""),
874+
("\u{376}", "\u{376}"),
875+
("\u{850}", "\u{850}"),
876+
("a\u{302}\u{315}", "a"),
877+
semanticLevel: .unicodeScalar
878+
)
879+
firstMatchTests(
880+
#"(?i)[a\u{302}-✅]"#,
881+
("a", "a"),
882+
("\u{302}", "\u{302}"),
883+
("A\u{302}", "A"),
884+
("E\u{301}", nil),
885+
("a\u{301}", "a"),
886+
("\u{E1}", nil),
887+
("a\u{302}", "a"),
888+
("\u{E2}", nil),
889+
("\u{E3}", nil),
890+
("\u{EF}", nil),
891+
("e\u{301}", nil),
892+
("e\u{302}", "\u{302}"),
893+
("\u{2705}", "\u{2705}"),
894+
("", ""),
895+
("\u{376}", "\u{376}"),
896+
("\u{850}", "\u{850}"),
897+
("a\u{302}\u{315}", "a"),
898+
semanticLevel: .unicodeScalar
899+
)
900+
firstMatchTests(
901+
#"[e\u{301}-\u{302}]"#,
902+
("a", nil),
903+
("e", "e"),
904+
("\u{302}", "\u{302}"),
905+
("A\u{302}", "\u{302}"),
906+
("E\u{301}", "\u{301}"),
907+
("\u{C8}", nil),
908+
("\u{C9}", nil),
909+
("\u{CA}", nil),
910+
("\u{CB}", nil),
911+
("a\u{301}", "\u{301}"),
912+
("a\u{302}", "\u{302}"),
913+
("e\u{301}", "e"),
914+
("e\u{302}", "e"),
915+
("\u{E1}", nil),
916+
("\u{E2}", nil),
917+
("\u{E9}", nil),
918+
("\u{EA}", nil),
919+
("\u{EF}", nil),
920+
semanticLevel: .unicodeScalar
921+
)
922+
firstMatchTests(
923+
#"(?i)[e\u{301}-\u{302}]"#,
924+
("a", nil),
925+
("e", "e"),
926+
("\u{302}", "\u{302}"),
927+
("A\u{302}", "\u{302}"),
928+
("E\u{301}", "E"),
929+
("\u{C8}", nil),
930+
("\u{C9}", nil),
931+
("\u{CA}", nil),
932+
("\u{CB}", nil),
933+
("a\u{301}", "\u{301}"),
934+
("a\u{302}", "\u{302}"),
935+
("e\u{301}", "e"),
936+
("e\u{302}", "e"),
937+
("\u{E1}", nil),
938+
("\u{E2}", nil),
939+
("\u{E9}", nil),
940+
("\u{EA}", nil),
941+
("\u{EF}", nil),
942+
semanticLevel: .unicodeScalar
943+
)
944+
768945
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
769946

770947
// These are metacharacters in certain contexts, but normal characters

Tests/RegexTests/ParseTests.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2929,6 +2929,8 @@ extension RegexTests {
29292929
diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b"))
29302930
diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}"))
29312931

2932+
diagnosticTest(#"[e\u{301}-e\u{302}]"#, .invalidCharacterRange(from: "\u{301}", to: "e"))
2933+
29322934
diagnosticTest("(?x)[(?#)]", .expected("]"))
29332935
diagnosticTest("(?x)[(?#abc)]", .expected("]"))
29342936

0 commit comments

Comments
 (0)