Skip to content

Commit e7a5617

Browse files
committed
Coalesce character class members
In grapheme semantic mode, coalesce adjacent character and scalar members of a custom character class, over which we can perform grapheme breaking. This involves potentially re-writing ranges such that they contain a complete grapheme of adjacent scalars.
1 parent d82b2fa commit e7a5617

File tree

4 files changed

+345
-4
lines changed

4 files changed

+345
-4
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,9 +775,119 @@ fileprivate extension Compiler.ByteCodeGen {
775775
builder.label(exit)
776776
}
777777

778+
/// Coalesce any adjacent scalar members in a custom character class together.
/// This is required in order to produce correct grapheme matching behavior.
///
/// Nested custom character classes and the operands of set operations are
/// coalesced recursively first. After that, runs of adjacent atoms, quoted
/// literals and ranges that expose a literal character value are merged into
/// strings, so that neighbouring scalars can combine into a single
/// `Character`, and are then re-emitted as quoted literals and ranges.
///
/// - Parameter members: The members of a custom character class, in source
///   order.
/// - Returns: The members with every coalescable run replaced by its merged
///   form. Members that cannot be coalesced are passed through unchanged
///   (apart from the recursive coalescing of nested classes).
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  struct Accumulator {
    /// A series of range operands. For example, in `[ab-cde-fg]`, this will
    /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
    /// ranges will be created.
    private var rangeOperands: [String] = [""]

    /// The current range operand, i.e. the last entry of `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Try to accumulate a character class member, returning `true` if
    /// successful, `false` otherwise.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        current.append(c)
        return true
      case .quotedLiteral(let str):
        current += str
        return true
      case let .range(lhs, rhs):
        guard let lhs = lhs.literalCharacterValue,
              let rhs = rhs.literalCharacterValue
        else { return false }
        // The lower bound extends the current operand (and may combine with
        // the characters already in it); the upper bound starts a new one.
        current.append(lhs)
        rangeOperands.append(String(rhs))
        return true
      case .custom, .trivia, .intersection, .subtraction,
           .symmetricDifference:
        // These members are never merged into a run of characters.
        return false
      }
    }

    /// Produce the members for everything accumulated so far.
    ///
    /// For `N` range operands this yields `N - 1` ranges, interleaved with
    /// quoted literals for the characters that are not range bounds. For
    /// example `["ab", "cde", "fg"]` becomes `"a"`, `b-c`, `"d"`, `e-f`,
    /// `"g"`.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      guard rangeOperands.count > 1 else {
        // If we didn't have any additional range operands, this isn't a
        // range, we can just form a standard quoted literal.
        return [.quotedLiteral(current)]
      }

      // We have other range operands, splice them together.
      var members = [DSLTree.CustomCharacterClass.Member]()
      for (i, lhs) in rangeOperands.dropLast().enumerated() {
        let rhs = rangeOperands[i + 1]

        // The first operand is only the LHS of a range, so only its last
        // character is a bound. Every later operand visited here is both the
        // RHS of the previous range and the LHS of this one, so both its
        // first and last characters are bounds and must be trimmed; this
        // avoids emitting the interior members of an operand twice.
        let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
        if !leading.isEmpty {
          members.append(.quotedLiteral(String(leading)))
        }

        // `tryAccumulate` always appends a character to an operand before
        // starting the next one, and every new operand starts with the upper
        // bound, so neither force-unwrap can fail here.
        members.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }

      // Everything has been handled except the non-bound members of the
      // final operand, which is only ever the RHS of a range.
      let trailing = rangeOperands[rangeOperands.count - 1].dropFirst()
      if !trailing.isEmpty {
        members.append(.quotedLiteral(String(trailing)))
      }
      return members
    }
  }

  return members
    .map { m -> DSLTree.CustomCharacterClass.Member in
      // First we need to recursively coalesce any child character classes.
      switch m {
      case .custom(let ccc):
        return .custom(coalescingCustomCharacterClass(ccc))
      case .intersection(let lhs, let rhs):
        return .intersection(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .subtraction(let lhs, let rhs):
        return .subtraction(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .symmetricDifference(let lhs, let rhs):
        return .symmetricDifference(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .atom, .range, .quotedLiteral, .trivia:
        return m
      }
    }
    // Then merge each run of adjacent coalescable members; a member that
    // `tryAccumulate` rejects ends the current run and is kept as-is.
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
}
869+
870+
/// Returns `ccc` with its members coalesced when matching with grapheme
/// cluster semantics; otherwise returns `ccc` unchanged.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing is only meaningful under grapheme semantics. Under scalar
  // semantics scalars must stay separate, so that for instance
  // `[e\u{301}-\u{302}]` keeps its range from U+301 to U+302.
  let usesGraphemeSemantics = options.semanticLevel == .graphemeCluster
  guard usesGraphemeSemantics else { return ccc }

  return DSLTree.CustomCharacterClass(
    members: coalescingCustomCharacterClassMembers(ccc.members),
    isInverted: ccc.isInverted)
}
882+
778883
mutating func emitCustomCharacterClass(
779884
_ ccc: DSLTree.CustomCharacterClass
780885
) throws {
886+
// Before emitting a custom character class in grapheme semantic mode, we
887+
// need to coalesce together any adjacent characters and scalars, over which
888+
// we can perform grapheme breaking. This includes e.g. range bounds for
889+
// `[e\u{301}-\u{302}]`.
890+
let ccc = coalescingCustomCharacterClass(ccc)
781891
if let asciiBitset = ccc.asAsciiBitset(options),
782892
optimizationsEnabled {
783893
if options.semanticLevel == .unicodeScalar {

Sources/_StringProcessing/Utility/Misc.swift

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111

1212
extension Array {
1313
/// Coalesce adjacent elements using a given accumulator. The accumulator is
14-
/// transformed into an element of the array by `finish`. The `accumulate`
14+
/// transformed into elements of the array by `finish`. The `accumulate`
1515
/// function should return `true` if the accumulator has coalesced the
1616
/// element, `false` otherwise.
1717
func coalescing<T>(
18-
with initialAccumulator: T, into finish: (T) -> Element,
18+
with initialAccumulator: T, into finish: (T) -> Self,
1919
accumulate: (inout T, Element) -> Bool
2020
) -> Self {
2121
var didAccumulate = false
@@ -32,16 +32,28 @@ extension Array {
3232
if didAccumulate {
3333
// We have a leftover accumulator, which needs to be finished before we
3434
// can append the next element.
35-
result.append(finish(accumulator))
35+
result += finish(accumulator)
3636
accumulator = initialAccumulator
3737
didAccumulate = false
3838
}
3939
result.append(elt)
4040
}
4141
// Handle a leftover accumulation.
4242
if didAccumulate {
43-
result.append(finish(accumulator))
43+
result += finish(accumulator)
4444
}
4545
return result
4646
}
47+
48+
/// Coalesce adjacent elements using a given accumulator, where each finished
/// accumulator becomes exactly one element of the array.
///
/// `accumulate` should return `true` if the accumulator has coalesced the
/// element, `false` otherwise. This is a convenience that forwards to the
/// overload whose `finish` returns an array.
func coalescing<T>(
  with initialAccumulator: T, into finish: (T) -> Element,
  accumulate: (inout T, Element) -> Bool
) -> Self {
  return coalescing(
    with: initialAccumulator,
    // Wrap the single finished element so it fits the array-returning form.
    into: { finished in [finish(finished)] },
    accumulate: accumulate)
}
4759
}

Tests/RegexTests/MatchTests.swift

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,223 @@ extension RegexTests {
765765
semanticLevel: .unicodeScalar
766766
)
767767

768+
// Scalar coalescing.
769+
firstMatchTests(
770+
#"[e\u{301}]"#,
771+
(eDecomposed, eDecomposed),
772+
(eComposed, eComposed),
773+
("e", nil),
774+
("\u{301}", nil)
775+
)
776+
firstMatchTests(
777+
#"[e\u{301}]"#,
778+
(eDecomposed, "e"),
779+
(eComposed, nil),
780+
("e", "e"),
781+
("\u{301}", "\u{301}"),
782+
semanticLevel: .unicodeScalar
783+
)
784+
firstMatchTests(
785+
#"[[[e\u{301}]]]"#,
786+
(eDecomposed, eDecomposed),
787+
(eComposed, eComposed),
788+
("e", nil),
789+
("\u{301}", nil)
790+
)
791+
firstMatchTests(
792+
#"[[[e\u{301}]]]"#,
793+
(eDecomposed, "e"),
794+
(eComposed, nil),
795+
("e", "e"),
796+
("\u{301}", "\u{301}"),
797+
semanticLevel: .unicodeScalar
798+
)
799+
firstMatchTests(
800+
#"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#,
801+
("👨", nil),
802+
("👩", nil),
803+
("👧", nil),
804+
("👦", nil),
805+
("\u{200D}", nil),
806+
("👨‍👩‍👧‍👦", "👨‍👩‍👧‍👦")
807+
)
808+
firstMatchTests(
809+
#"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#,
810+
("👨", "👨"),
811+
("👩", "👩"),
812+
("👧", "👧"),
813+
("👦", "👦"),
814+
("\u{200D}", "\u{200D}"),
815+
("👨‍👩‍👧‍👦", "👨"),
816+
semanticLevel: .unicodeScalar
817+
)
818+
firstMatchTests(
819+
#"[e\u{315}\u{301}\u{35C}]"#,
820+
("e", nil),
821+
("e\u{315}", nil),
822+
("e\u{301}", nil),
823+
("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"),
824+
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
825+
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
826+
)
827+
828+
firstMatchTests(
829+
#"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#,
830+
("a", "a"),
831+
("a\u{301}", "a\u{301}"),
832+
("\u{E1}", "\u{E1}"),
833+
("\u{E2}", nil),
834+
("z", "z"),
835+
("e", "e"),
836+
(eDecomposed, eDecomposed),
837+
(eComposed, eComposed),
838+
("\u{302}", "\u{302}"),
839+
("1", "1"),
840+
("2", nil),
841+
("3", "3"),
842+
("4", "4"),
843+
("5", "5"),
844+
("6", nil),
845+
("7", nil),
846+
("8", nil),
847+
("9", "9")
848+
)
849+
850+
// These can't compile in grapheme semantic mode, but make sure they work in
851+
// scalar semantic mode.
852+
firstMatchTests(
853+
#"[a\u{315}\u{301}-\u{302}]"#,
854+
("a", "a"),
855+
("\u{315}", "\u{315}"),
856+
("\u{301}", "\u{301}"),
857+
("\u{302}", "\u{302}"),
858+
("\u{303}", nil),
859+
semanticLevel: .unicodeScalar
860+
)
861+
firstMatchTests(
862+
#"[\u{73}\u{323}\u{307}-\u{1E00}]"#,
863+
("\u{73}", "\u{73}"),
864+
("\u{323}", "\u{323}"),
865+
("\u{307}", "\u{307}"),
866+
("\u{400}", "\u{400}"),
867+
("\u{500}", "\u{500}"),
868+
("\u{1E00}", "\u{1E00}"),
869+
("\u{1E01}", nil),
870+
("\u{1E69}", nil),
871+
semanticLevel: .unicodeScalar
872+
)
873+
firstMatchTests(
874+
#"[a\u{302}-✅]"#,
875+
("a", "a"),
876+
("\u{302}", "\u{302}"),
877+
("A\u{302}", "\u{302}"),
878+
("E\u{301}", nil),
879+
("a\u{301}", "a"),
880+
("\u{E1}", nil),
881+
("a\u{302}", "a"),
882+
("\u{E2}", nil),
883+
("\u{E3}", nil),
884+
("\u{EF}", nil),
885+
("e\u{301}", nil),
886+
("e\u{302}", "\u{302}"),
887+
("\u{2705}", "\u{2705}"),
888+
("", ""),
889+
("\u{376}", "\u{376}"),
890+
("\u{850}", "\u{850}"),
891+
("a\u{302}\u{315}", "a"),
892+
semanticLevel: .unicodeScalar
893+
)
894+
firstMatchTests(
895+
#"(?i)[a\u{302}-✅]"#,
896+
("a", "a"),
897+
("\u{302}", "\u{302}"),
898+
("A\u{302}", "A"),
899+
("E\u{301}", nil),
900+
("a\u{301}", "a"),
901+
("\u{E1}", nil),
902+
("a\u{302}", "a"),
903+
("\u{E2}", nil),
904+
("\u{E3}", nil),
905+
("\u{EF}", nil),
906+
("e\u{301}", nil),
907+
("e\u{302}", "\u{302}"),
908+
("\u{2705}", "\u{2705}"),
909+
("", ""),
910+
("\u{376}", "\u{376}"),
911+
("\u{850}", "\u{850}"),
912+
("a\u{302}\u{315}", "a"),
913+
semanticLevel: .unicodeScalar
914+
)
915+
firstMatchTests(
916+
#"[e\u{301}-\u{302}]"#,
917+
("a", nil),
918+
("e", "e"),
919+
("\u{302}", "\u{302}"),
920+
("A\u{302}", "\u{302}"),
921+
("E\u{301}", "\u{301}"),
922+
("\u{C8}", nil),
923+
("\u{C9}", nil),
924+
("\u{CA}", nil),
925+
("\u{CB}", nil),
926+
("a\u{301}", "\u{301}"),
927+
("a\u{302}", "\u{302}"),
928+
("e\u{301}", "e"),
929+
("e\u{302}", "e"),
930+
("\u{E1}", nil),
931+
("\u{E2}", nil),
932+
("\u{E9}", nil),
933+
("\u{EA}", nil),
934+
("\u{EF}", nil),
935+
semanticLevel: .unicodeScalar
936+
)
937+
firstMatchTests(
938+
#"(?i)[e\u{301}-\u{302}]"#,
939+
("a", nil),
940+
("e", "e"),
941+
("\u{302}", "\u{302}"),
942+
("A\u{302}", "\u{302}"),
943+
("E\u{301}", "E"),
944+
("\u{C8}", nil),
945+
("\u{C9}", nil),
946+
("\u{CA}", nil),
947+
("\u{CB}", nil),
948+
("a\u{301}", "\u{301}"),
949+
("a\u{302}", "\u{302}"),
950+
("e\u{301}", "e"),
951+
("e\u{302}", "e"),
952+
("\u{E1}", nil),
953+
("\u{E2}", nil),
954+
("\u{E9}", nil),
955+
("\u{EA}", nil),
956+
("\u{EF}", nil),
957+
semanticLevel: .unicodeScalar
958+
)
959+
960+
// Set operation scalar coalescing.
961+
firstMatchTests(
962+
#"[e\u{301}&&e\u{301}e\u{302}]"#,
963+
("e", nil),
964+
("\u{301}", nil),
965+
("\u{302}", nil),
966+
("e\u{301}", "e\u{301}"),
967+
("e\u{302}", nil))
968+
firstMatchTests(
969+
#"[e\u{301}~~[[e\u{301}]e\u{302}]]"#,
970+
("e", nil),
971+
("\u{301}", nil),
972+
("\u{302}", nil),
973+
("e\u{301}", nil),
974+
("e\u{302}", "e\u{302}"))
975+
firstMatchTests(
976+
#"[e\u{301}[e\u{303}]--[[e\u{301}]e\u{302}]]"#,
977+
("e", nil),
978+
("\u{301}", nil),
979+
("\u{302}", nil),
980+
("\u{303}", nil),
981+
("e\u{301}", nil),
982+
("e\u{302}", nil),
983+
("e\u{303}", "e\u{303}"))
984+
768985
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
769986

770987
// These are metacharacters in certain contexts, but normal characters

Tests/RegexTests/ParseTests.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2929,6 +2929,8 @@ extension RegexTests {
29292929
diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b"))
29302930
diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}"))
29312931

2932+
diagnosticTest(#"[e\u{301}-e\u{302}]"#, .invalidCharacterRange(from: "\u{301}", to: "e"))
2933+
29322934
diagnosticTest("(?x)[(?#)]", .expected("]"))
29332935
diagnosticTest("(?x)[(?#abc)]", .expected("]"))
29342936

0 commit comments

Comments
 (0)