@@ -775,9 +775,127 @@ fileprivate extension Compiler.ByteCodeGen {
775
775
builder. label ( exit)
776
776
}
777
777
778
/// Coalesce any adjacent scalar members in a custom character class together.
/// This is required in order to produce correct grapheme matching behavior.
///
/// Nested character classes (including the operands of set operations) are
/// coalesced recursively first. Then runs of adjacent members that expose a
/// literal character value (atoms, quoted literals and range bounds) are
/// merged so that their text is treated as contiguous strings.
///
/// - Parameter members: The members of a custom character class, in source
///   order.
/// - Returns: The members with adjacent literal content merged into quoted
///   literals and ranges.
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  struct Accumulator {
    /// A series of range operands. For example, in `[ab-cde-fg]`, this will
    /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
    /// ranges will be created.
    ///
    /// Seeded with a single *empty* operand so that `current` is always valid
    /// and so that no stray character leaks into the first operand.
    private var rangeOperands: [String] = [""]

    /// The current range operand, i.e. the last element of `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Try to accumulate a character class member, returning `true` if
    /// successful, `false` otherwise.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        current.append(c)
        return true
      case .quotedLiteral(let str):
        current += str
        return true
      case let .range(lhs, rhs):
        guard let lhs = lhs.literalCharacterValue,
              let rhs = rhs.literalCharacterValue
        else { return false }
        // The lower bound ends the current operand; the upper bound starts a
        // new one. Consequently every operand that participates in a range
        // is non-empty, which `finish()` relies on.
        current.append(lhs)
        rangeOperands.append(String(rhs))
        return true
      case .custom, .intersection, .subtraction, .symmetricDifference,
           .trivia:
        // Listed explicitly (rather than `default`) so that adding a new
        // member kind forces a decision here.
        return false
      }
    }

    /// Produce the members for everything accumulated so far.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      if rangeOperands.count == 1 {
        // If we didn't have any additional range operands, this isn't a
        // range, we can just form a standard quoted literal.
        return [.quotedLiteral(current)]
      }
      var members = [DSLTree.CustomCharacterClass.Member]()

      // We have other range operands, splice them together. For N operands
      // we have N - 1 ranges.
      for (i, lhs) in rangeOperands.dropLast().enumerated() {
        let rhs = rangeOperands[i + 1]

        // If this is the first operand we only need to drop the last
        // character for its quoted members, otherwise this is both an LHS
        // and RHS of a range, and as such needs both sides trimmed.
        let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
        if !leading.isEmpty {
          members.append(.quotedLiteral(String(leading)))
        }
        // Safe to force-unwrap: `tryAccumulate` appends the lower bound to
        // `lhs` and seeds `rhs` with the upper bound before any range exists.
        members.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }
      // We've handled everything except the quoted portion of the last
      // operand, add it now.
      let trailing = rangeOperands.last!.dropFirst()
      if !trailing.isEmpty {
        members.append(.quotedLiteral(String(trailing)))
      }
      return members
    }
  }
  return members
    .map { m -> DSLTree.CustomCharacterClass.Member in
      // First we need to recursively coalesce any child character classes.
      switch m {
      case .custom(let ccc):
        return .custom(coalescingCustomCharacterClass(ccc))
      case .intersection(let lhs, let rhs):
        return .intersection(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .subtraction(let lhs, let rhs):
        return .subtraction(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .symmetricDifference(let lhs, let rhs):
        return .symmetricDifference(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .atom, .range, .quotedLiteral, .trivia:
        return m
      }
    }
    // NOTE(review): `coalescing(with:into:_:)` is defined elsewhere;
    // presumably it feeds adjacent members to the accumulator while the
    // closure returns `true` and replaces each such run with `finish()`.
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
}
877
+
878
/// Returns a copy of `ccc` whose members have been coalesced, or `ccc`
/// itself when coalescing does not apply.
///
/// - Parameter ccc: The custom character class to process.
/// - Returns: `ccc` unchanged unless the semantic level is grapheme cluster;
///   otherwise a new class with coalesced members and the same inversion.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing is a grapheme-semantics concern only. Under scalar semantics
  // scalars must not be merged into graphemes, so that for instance
  // `[e\u{301}-\u{302}]` stays a range from U+301 to U+302.
  if options.semanticLevel != .graphemeCluster {
    return ccc
  }

  let coalescedMembers = coalescingCustomCharacterClassMembers(ccc.members)
  return DSLTree.CustomCharacterClass(
    members: coalescedMembers,
    isInverted: ccc.isInverted)
}
890
+
778
891
mutating func emitCustomCharacterClass(
779
892
_ ccc: DSLTree . CustomCharacterClass
780
893
) throws {
894
+ // Before emitting a custom character class in grapheme semantic mode, we
895
+ // need to coalesce together any adjacent characters and scalars, over which
896
+ // we can perform grapheme breaking. This includes e.g range bounds for
897
+ // `[e\u{301}-\u{302}]`.
898
+ let ccc = coalescingCustomCharacterClass ( ccc)
781
899
if let asciiBitset = ccc. asAsciiBitset ( options) ,
782
900
optimizationsEnabled {
783
901
if options. semanticLevel == . unicodeScalar {
0 commit comments