Skip to content

Commit addf750

Browse files
committed
Coalesce character class members
In grapheme semantic mode, coalesce adjacent character and scalar members of a custom character class, over which we can perform grapheme breaking. This involves potentially re-writing ranges such that they contain a complete grapheme of adjacent scalars.
1 parent fdf04b9 commit addf750

File tree

4 files changed

+383
-4
lines changed

4 files changed

+383
-4
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,9 +775,127 @@ fileprivate extension Compiler.ByteCodeGen {
775775
builder.label(exit)
776776
}
777777

778+
/// Merge runs of adjacent literal members of a custom character class.
///
/// Atoms with a literal character value, quoted literals, and ranges whose
/// bounds both have literal character values are gathered into shared strings,
/// so that scalars written as separate members can be treated as part of the
/// same `Character`. Nested character classes are coalesced recursively first.
/// This is required in order to produce correct grapheme matching behavior.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The members with each run of coalescable members replaced by
///   the quoted literals and ranges rebuilt from that run.
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  struct Accumulator {
    typealias Member = DSLTree.CustomCharacterClass.Member

    /// The operands of the ranges seen so far, in order. For `[ab-cde-fg]`
    /// this holds `["ab", "cde", "fg"]`; the final members are derived from
    /// these strings in `finish()`.
    private var rangeOperands: [String] = [""]

    /// The operand currently being extended, i.e. the last entry of
    /// `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.endIndex - 1] }
      _modify { yield &rangeOperands[rangeOperands.endIndex - 1] }
    }

    /// Attempt to fold `member` into the accumulator.
    ///
    /// - Returns: `true` if the member was absorbed, `false` if it cannot be
    ///   coalesced (in which case the accumulator is left untouched).
    mutating func tryAccumulate(_ member: Member) -> Bool {
      switch member {
      case .atom(let atom):
        if let character = atom.literalCharacterValue {
          current.append(character)
          return true
        }
        return false

      case .quotedLiteral(let literal):
        current.append(contentsOf: literal)
        return true

      case .range(let lowerAtom, let upperAtom):
        if let lower = lowerAtom.literalCharacterValue,
           let upper = upperAtom.literalCharacterValue {
          // The lower bound finishes the current operand; the upper bound
          // opens the next one.
          current.append(lower)
          rangeOperands.append(String(upper))
          return true
        }
        return false

      case .custom, .intersection, .subtraction, .symmetricDifference, .trivia:
        return false
      }
    }

    /// Produce the members represented by everything accumulated so far.
    func finish() -> [Member] {
      // A single operand means no range was accumulated, so the whole thing
      // is just one quoted literal.
      guard rangeOperands.count != 1 else {
        return [.quotedLiteral(current)]
      }

      var result: [Member] = []

      // N operands describe N - 1 ranges: each range runs from the last
      // character of one operand to the first character of the next.
      for index in rangeOperands.indices.dropLast() {
        let lowerOperand = rangeOperands[index]
        let upperOperand = rangeOperands[index + 1]

        // Whatever is left of the operand once its range bounds are removed
        // is plain quoted content. The first operand only contributes an
        // upper-end bound; every later one is also the upper bound of the
        // previous range, so its first character is removed as well.
        let quoted: Substring
        if index == 0 {
          quoted = lowerOperand.dropLast()
        } else {
          quoted = lowerOperand.dropFirst().dropLast()
        }
        if !quoted.isEmpty {
          result.append(.quotedLiteral(String(quoted)))
        }

        // Both unwraps are safe: an operand that takes part in a range always
        // received at least the range bound character in `tryAccumulate`.
        result.append(
          .range(.char(lowerOperand.last!), .char(upperOperand.first!)))
      }

      // Only the quoted tail of the final operand remains; `current` is that
      // final operand.
      let tail = current.dropFirst()
      if !tail.isEmpty {
        result.append(.quotedLiteral(String(tail)))
      }
      return result
    }
  }

  // Recursively coalesce any child character classes before looking at this
  // level's members.
  let recursed = members.map { member -> DSLTree.CustomCharacterClass.Member in
    switch member {
    case .custom(let child):
      return .custom(coalescingCustomCharacterClass(child))
    case .intersection(let left, let right):
      return .intersection(
        coalescingCustomCharacterClass(left),
        coalescingCustomCharacterClass(right))
    case .subtraction(let left, let right):
      return .subtraction(
        coalescingCustomCharacterClass(left),
        coalescingCustomCharacterClass(right))
    case .symmetricDifference(let left, let right):
      return .symmetricDifference(
        coalescingCustomCharacterClass(left),
        coalescingCustomCharacterClass(right))
    case .atom, .range, .quotedLiteral, .trivia:
      return member
    }
  }

  return recursed.coalescing(
    with: Accumulator(),
    into: { accumulator in accumulator.finish() },
    accumulate: { accumulator, member in
      accumulator.tryAccumulate(member)
    })
}
877+
878+
/// Return `ccc` with its members coalesced when matching with grapheme
/// cluster semantics; otherwise return it unchanged.
///
/// - Parameter ccc: The custom character class to process.
/// - Returns: A character class with the same inversion flag and coalesced
///   members, or `ccc` itself outside grapheme semantic mode.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing is specific to grapheme semantic mode. In scalar semantic mode
  // scalars must not be merged into a grapheme, so that e.g
  // `[e\u{301}-\u{302}]` remains a range between U+301 and U+302.
  if options.semanticLevel != .graphemeCluster {
    return ccc
  }

  return DSLTree.CustomCharacterClass(
    members: coalescingCustomCharacterClassMembers(ccc.members),
    isInverted: ccc.isInverted)
}
890+
778891
mutating func emitCustomCharacterClass(
779892
_ ccc: DSLTree.CustomCharacterClass
780893
) throws {
894+
// Before emitting a custom character class in grapheme semantic mode, we
895+
// need to coalesce together any adjacent characters and scalars, over which
896+
// we can perform grapheme breaking. This includes e.g range bounds for
897+
// `[e\u{301}-\u{302}]`.
898+
let ccc = coalescingCustomCharacterClass(ccc)
781899
if let asciiBitset = ccc.asAsciiBitset(options),
782900
optimizationsEnabled {
783901
if options.semanticLevel == .unicodeScalar {

Sources/_StringProcessing/Utility/Misc.swift

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111

1212
extension Array {
1313
/// Coalesce adjacent elements using a given accumulator. The accumulator is
14-
/// transformed into an element of the array by `finish`. The `accumulate`
14+
/// transformed into elements of the array by `finish`. The `accumulate`
1515
/// function should return `true` if the accumulator has coalesced the
1616
/// element, `false` otherwise.
1717
func coalescing<T>(
18-
with initialAccumulator: T, into finish: (T) -> Element,
18+
with initialAccumulator: T, into finish: (T) -> Self,
1919
accumulate: (inout T, Element) -> Bool
2020
) -> Self {
2121
var didAccumulate = false
@@ -32,16 +32,28 @@ extension Array {
3232
if didAccumulate {
3333
// We have a leftover accumulator, which needs to be finished before we
3434
// can append the next element.
35-
result.append(finish(accumulator))
35+
result += finish(accumulator)
3636
accumulator = initialAccumulator
3737
didAccumulate = false
3838
}
3939
result.append(elt)
4040
}
4141
// Handle a leftover accumulation.
4242
if didAccumulate {
43-
result.append(finish(accumulator))
43+
result += finish(accumulator)
4444
}
4545
return result
4646
}
47+
48+
/// Coalesce adjacent elements using a given accumulator, where a finished
/// accumulator becomes exactly one element of the result.
///
/// This is a convenience over the array-returning form of `coalescing`: the
/// single element produced by `finish` is wrapped in a one-element array and
/// forwarded.
///
/// - Parameters:
///   - initialAccumulator: The accumulator value each run starts from.
///   - finish: Transforms an accumulator into an element of the array.
///   - accumulate: Should return `true` if the accumulator has coalesced the
///     element, `false` otherwise.
/// - Returns: The array with coalesced runs replaced by finished elements.
func coalescing<T>(
  with initialAccumulator: T, into finish: (T) -> Element,
  accumulate: (inout T, Element) -> Bool
) -> Self {
  // The explicit `[Element]` result type selects the array-returning
  // overload rather than this one.
  return coalescing(
    with: initialAccumulator,
    into: { accumulator -> [Element] in [finish(accumulator)] },
    accumulate: accumulate)
}
4759
}

0 commit comments

Comments
 (0)