Skip to content

Commit bef00ab

Browse files
committed
use normative source for Grapheme class data
@mahkoh points out in #15628 that unicode.py does not use normative data for Grapheme classes. This PR fixes that issue. In addition, GC_RegionalIndicator is renamed to GC_Regional_Indicator in order to stay in line with the Unicode class name definitions. I have updated the references in u_str.rs and verified that there are no references elsewhere in the codebase. However, in principle, someone using the Unicode tables for their own purposes might see breakage from this.
1 parent aab8669 commit bef00ab

File tree

3 files changed

+410
-449
lines changed

3 files changed

+410
-449
lines changed

src/etc/unicode.py

+23-57
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212

1313
# This script uses the following Unicode tables:
1414
# - DerivedCoreProperties.txt
15+
# - DerivedNormalizationProps.txt
1516
# - EastAsianWidth.txt
17+
# - auxiliary/GraphemeBreakProperty.txt
1618
# - PropList.txt
19+
# - ReadMe.txt
1720
# - Scripts.txt
1821
# - UnicodeData.txt
1922
#
@@ -51,41 +54,20 @@
5154
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
5255
}
5356

54-
55-
# Grapheme cluster data
56-
# taken from UAX29, http://www.unicode.org/reports/tr29/
57-
# these code points are excluded from the Control category
58-
# NOTE: CR and LF are also technically excluded, but for
59-
# the sake of convenience we leave them in the Control group
60-
# and manually check them in the appropriate place. This is
61-
# still compliant with the implementation requirements.
62-
grapheme_control_exceptions = set([0x200c, 0x200d])
63-
64-
# the Regional_Indicator category
65-
grapheme_regional_indicator = [(0x1f1e6, 0x1f1ff)]
66-
67-
# "The following ... are specifically excluded" from the SpacingMark category
68-
# http://www.unicode.org/reports/tr29/#SpacingMark
69-
grapheme_spacingmark_exceptions = [(0x102b, 0x102c), (0x1038, 0x1038),
70-
(0x1062, 0x1064), (0x1067, 0x106d), (0x1083, 0x1083), (0x1087, 0x108c),
71-
(0x108f, 0x108f), (0x109a, 0x109c), (0x19b0, 0x19b4), (0x19b8, 0x19b9),
72-
(0x19bb, 0x19c0), (0x19c8, 0x19c9), (0x1a61, 0x1a61), (0x1a63, 0x1a64),
73-
(0xaa7b, 0xaa7b), (0xaa7d, 0xaa7d)]
74-
75-
# these are included in the SpacingMark category
76-
grapheme_spacingmark_extra = set([0xe33, 0xeb3])
57+
# these are the surrogate codepoints, which are not valid Rust characters
58+
surrogate_codepoints = (0xd800, 0xdfff)
7759

7860
def fetch(f):
79-
if not os.path.exists(f):
61+
if not os.path.exists(os.path.basename(f)):
8062
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
8163
% f)
8264

83-
if not os.path.exists(f):
65+
if not os.path.exists(os.path.basename(f)):
8466
sys.stderr.write("cannot load %s" % f)
8567
exit(1)
8668

8769
def is_surrogate(n):
88-
return 0xD800 <= n <= 0xDFFF
70+
return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
8971

9072
def load_unicode_data(f):
9173
fetch(f)
@@ -228,7 +210,7 @@ def load_properties(f, interestingprops):
228210
re1 = re.compile("^([0-9A-F]+) +; (\w+)")
229211
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")
230212

231-
for line in fileinput.input(f):
213+
for line in fileinput.input(os.path.basename(f)):
232214
prop = None
233215
d_lo = 0
234216
d_hi = 0
@@ -623,20 +605,14 @@ def optimize_width_table(wtable):
623605
(canon_decomp, compat_decomp, gencats, combines,
624606
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
625607
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
626-
other_derived = ["Default_Ignorable_Code_Point", "Grapheme_Extend"]
608+
other_derived = ["Default_Ignorable_Code_Point"]
627609
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
628610
scripts = load_properties("Scripts.txt", [])
629611
props = load_properties("PropList.txt",
630612
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
631613
norm_props = load_properties("DerivedNormalizationProps.txt",
632614
["Full_Composition_Exclusion"])
633615

634-
# grapheme cluster category from DerivedCoreProperties
635-
# the rest are defined below
636-
grapheme_cats = {}
637-
grapheme_cats["Extend"] = derived["Grapheme_Extend"]
638-
del(derived["Grapheme_Extend"])
639-
640616
# bsearch_range_table is used in all the property modules below
641617
emit_bsearch_range_table(rf)
642618

@@ -691,34 +667,24 @@ def optimize_width_table(wtable):
691667

692668
### grapheme cluster module
693669
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
694-
# Hangul syllable categories
695-
want_hangul = ["L", "V", "T", "LV", "LVT"]
696-
grapheme_cats.update(load_properties("HangulSyllableType.txt", want_hangul))
670+
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])
697671

698672
# Control
673+
# Note 1:
699674
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
700675
# Unicode Scalar Values only, and surrogates are thus invalid `char`s.
701-
grapheme_cats["Control"] = set()
702-
for cat in ["Zl", "Zp", "Cc", "Cf"]:
703-
grapheme_cats["Control"] |= set(ungroup_cat(gencats[cat]))
676+
# Thus, we have to remove Cs from the Control category
677+
# Note 2:
678+
# 0x0d and 0x0a (CR and LF) are not in the Control category for Graphemes.
679+
# However, the Graphemes iterator treats these as a special case, so they
680+
# should be included in grapheme_cats["Control"] for our implementation.
704681
grapheme_cats["Control"] = group_cat(list(
705-
grapheme_cats["Control"]
706-
- grapheme_control_exceptions
707-
| (set(ungroup_cat(gencats["Cn"]))
708-
& set(ungroup_cat(derived["Default_Ignorable_Code_Point"])))))
709-
710-
# Regional Indicator
711-
grapheme_cats["RegionalIndicator"] = grapheme_regional_indicator
712-
713-
# Prepend - "Currently there are no characters with this value"
714-
# (from UAX#29, Unicode 7.0)
715-
716-
# SpacingMark
717-
grapheme_cats["SpacingMark"] = group_cat(list(
718-
set(ungroup_cat(gencats["Mc"]))
719-
- set(ungroup_cat(grapheme_cats["Extend"]))
720-
| grapheme_spacingmark_extra
721-
- set(ungroup_cat(grapheme_spacingmark_exceptions))))
682+
(set(ungroup_cat(grapheme_cats["Control"]))
683+
| set(ungroup_cat(grapheme_cats["CR"]))
684+
| set(ungroup_cat(grapheme_cats["LF"])))
685+
- set(ungroup_cat([surrogate_codepoints]))))
686+
del(grapheme_cats["CR"])
687+
del(grapheme_cats["LF"])
722688

723689
grapheme_table = []
724690
for cat in grapheme_cats:

0 commit comments

Comments
 (0)