@@ -775,9 +775,131 @@ fileprivate extension Compiler.ByteCodeGen {
775
775
builder. label ( exit)
776
776
}
777
777
778
/// Coalesce any adjacent scalar members in a custom character class together.
/// This is required in order to produce correct grapheme matching behavior.
///
/// Nested classes (`.custom`, `.intersection`, `.subtraction`,
/// `.symmetricDifference`) are first rewritten through
/// `coalescingCustomCharacterClass`. Afterwards, runs of adjacent literal
/// atoms, quoted literals, and ranges with literal bounds are merged so that
/// neighbouring scalars end up in the same `String`.
///
/// - Parameter members: The members of a custom character class, in source
///   order.
/// - Returns: The rewritten members. Members that cannot be accumulated are
///   passed through in their original position.
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  struct Accumulator {
    /// A series of range operands. For example, in `[ab-cde-fg]`, this will
    /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
    /// ranges will be created.
    ///
    /// Starts as a single *empty* operand: nothing has been accumulated yet,
    /// which is what the `isEmpty` check for trivia relies on.
    private var rangeOperands: [String] = [""]

    /// The current range operand, i.e. the last element of `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Try to accumulate a character class member, returning `true` if
    /// successful, `false` otherwise.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        current.append(c)
        return true
      case .quotedLiteral(let str):
        current += str
        return true
      case let .range(lhs, rhs):
        guard let lhs = lhs.literalCharacterValue,
              let rhs = rhs.literalCharacterValue
        else { return false }
        // The lower bound finishes the current operand; the upper bound
        // starts a new one.
        current.append(lhs)
        rangeOperands.append(String(rhs))
        return true
      case .trivia:
        // Trivia can be completely ignored if we've already coalesced
        // something.
        return !current.isEmpty
      case .custom, .intersection, .subtraction, .symmetricDifference:
        // Nested and set-operation members are never folded into a literal.
        return false
      }
    }

    /// Convert the accumulated operands back into character class members.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      if rangeOperands.count == 1 {
        // If we didn't have any additional range operands, this isn't a
        // range, we can just form a standard quoted literal.
        return [.quotedLiteral(current)]
      }
      var members = [DSLTree.CustomCharacterClass.Member]()

      // We have other range operands, splice them together. For N operands
      // we have N - 1 ranges, one per adjacent pair of operands.
      let operandPairs = zip(rangeOperands, rangeOperands.dropFirst())
      for (i, (lhs, rhs)) in operandPairs.enumerated() {
        // If this is the first operand we only need to drop the last
        // character for its quoted members, otherwise this is both an LHS
        // and RHS of a range, and as such needs both sides trimmed.
        let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
        if !leading.isEmpty {
          members.append(.quotedLiteral(String(leading)))
        }
        // Every operand next to a range is non-empty: `tryAccumulate`
        // appends the lower bound to the operand before the range and seeds
        // the operand after it with the upper bound.
        members.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }
      // We've handled everything except the quoted portion of the last
      // operand, add it now.
      let trailing = current.dropFirst()
      if !trailing.isEmpty {
        members.append(.quotedLiteral(String(trailing)))
      }
      return members
    }
  }
  return members
    .map { m -> DSLTree.CustomCharacterClass.Member in
      // First we need to recursively coalesce any child character classes.
      switch m {
      case .custom(let ccc):
        return .custom(coalescingCustomCharacterClass(ccc))
      case .intersection(let lhs, let rhs):
        return .intersection(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .subtraction(let lhs, let rhs):
        return .subtraction(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .symmetricDifference(let lhs, let rhs):
        return .symmetricDifference(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .atom, .range, .quotedLiteral, .trivia:
        return m
      }
    }
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
}
881
+
882
/// Return `ccc` with its members coalesced when matching with grapheme
/// cluster semantics; otherwise return `ccc` unchanged.
///
/// - Parameter ccc: The custom character class to rewrite.
/// - Returns: A character class with the same `isInverted` flag whose members
///   have been passed through `coalescingCustomCharacterClassMembers`, or
///   `ccc` itself when the semantic level is not `.graphemeCluster`.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing is only wanted in grapheme semantic mode. In scalar semantic
  // mode scalars must not be merged into a grapheme, so that e.g
  // `[e\u{301}-\u{302}]` remains a range between U+301 and U+302.
  if options.semanticLevel != .graphemeCluster {
    return ccc
  }
  return DSLTree.CustomCharacterClass(
    members: coalescingCustomCharacterClassMembers(ccc.members),
    isInverted: ccc.isInverted)
}
894
+
778
895
mutating func emitCustomCharacterClass(
779
896
_ ccc: DSLTree . CustomCharacterClass
780
897
) throws {
898
+ // Before emitting a custom character class in grapheme semantic mode, we
899
+ // need to coalesce together any adjacent characters and scalars, over which
900
+ // we can perform grapheme breaking. This includes e.g range bounds for
901
+ // `[e\u{301}-\u{302}]`.
902
+ let ccc = coalescingCustomCharacterClass ( ccc)
781
903
if let asciiBitset = ccc. asAsciiBitset ( options) ,
782
904
optimizationsEnabled {
783
905
if options. semanticLevel == . unicodeScalar {
@@ -791,6 +913,45 @@ fileprivate extension Compiler.ByteCodeGen {
791
913
}
792
914
}
793
915
916
/// Emit each component of a concatenation, after flattening and coalescing
/// its children.
///
/// - Parameter children: The child nodes of the concatenation, in order.
/// - Throws: Rethrows any error from `emitConcatenationComponent`.
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
  // Before emitting a concatenation, we need to flatten out any nested
  // concatenations, and coalesce any adjacent characters and scalars, forming
  // quoted literals of their contents, over which we can perform grapheme
  // breaking.
  func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] {
    switch node {
    case .concatenation(let ch):
      return ch.flatMap(flatten)
    case .convertedRegexLiteral(let n, _):
      // Look through the wrapper so its contents can take part in coalescing.
      return flatten(n)
    default:
      return [node]
    }
  }
  let children = children
    .flatMap(flatten)
    // The accumulator must start out empty: the trivia case below relies on
    // `isEmpty` to tell whether anything has been coalesced yet, and the
    // accumulated string becomes the quoted literal verbatim.
    .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in
      switch node {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        str.append(c)
        return true
      case .quotedLiteral(let q):
        str += q
        return true
      case .trivia:
        // Trivia can be completely ignored if we've already coalesced
        // something.
        return !str.isEmpty
      default:
        return false
      }
    }
  for child in children {
    try emitConcatenationComponent(child)
  }
}
954
+
794
955
@discardableResult
795
956
mutating func emitNode( _ node: DSLTree . Node ) throws -> ValueRegister ? {
796
957
switch node {
@@ -799,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen {
799
960
try emitAlternation ( children)
800
961
801
962
case let . concatenation( children) :
802
- for child in children {
803
- try emitConcatenationComponent ( child)
804
- }
963
+ try emitConcatenation ( children)
805
964
806
965
case let . capture( name, refId, child, transform) :
807
966
options. beginScope ( )
0 commit comments