34
34
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
35
35
'''
36
36
37
- UNICODE_VERSION = (12 , 1 , 0 )
37
+ UNICODE_VERSION = (13 , 0 , 0 )
38
38
39
39
UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION
40
40
@@ -54,7 +54,7 @@ def load_properties(f, interestingprops = None):
54
54
re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+)" )
55
55
re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)" )
56
56
57
- for line in fileinput .input (os .path .basename (f )):
57
+ for line in fileinput .input (os .path .basename (f ), openhook = fileinput . hook_encoded ( "utf-8" ) ):
58
58
prop = None
59
59
d_lo = 0
60
60
d_hi = 0
@@ -81,6 +81,28 @@ def load_properties(f, interestingprops = None):
81
81
82
82
return props
83
83
84
def load_confusables(f):
    """Parse a confusables data file into a list of prototype mappings.

    Calls ``fetch(f)`` first, then reads the file named by the basename of
    ``f`` as UTF-8. Lines that do not match the expected
    ``source ;<TAB>target... ;<TAB>`` layout are skipped.

    Returns a list of ``(source, targets)`` tuples in file order, where
    ``source`` is an int code point and ``targets`` is a list of int code
    points.

    Raises ``Exception`` when the first column of a matching line contains
    more than one code point.
    """
    fetch(f)
    # Column 1 and column 2 are each one or more "HEX " tokens, and each
    # column is terminated by ";" followed by a TAB.
    entry_re = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")
    table = []

    lines = fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8"))
    for line in lines:
        match = entry_re.match(line)
        if match is None:
            continue
        sources = match.group(1).split()
        if len(sources) != 1:
            raise Exception('More than one code point in first column')
        source = int(sources[0].strip(), 16)
        targets = [int(token, 16) for token in match.group(2).split()]
        table.append((source, targets))

    return table
105
+
84
106
def format_table_content (f , content , indent ):
85
107
line = " " * indent
86
108
first = True
@@ -99,6 +121,18 @@ def format_table_content(f, content, indent):
99
121
def escape_char(c):
    """Format the integer code point ``c`` as a Rust ``char`` literal.

    The result is a single-quoted Unicode escape with lowercase hex digits
    and no padding.
    """
    # The backslash must be immediately followed by "u{": a space between
    # them does not form a valid Rust character escape.
    return "'\\u{%x}'" % c
101
123
124
def escape_char_list(l):
    """Render the code points in ``l`` as a bracketed, comma-separated list.

    Every element is formatted with ``escape_char`` and the pieces are
    separated by ", ". An empty ``l`` yields "[]".
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135
+
102
136
def emit_table (f , name , t_data , t_type = "&'static [(char, char)]" , is_pub = True ,
103
137
pfun = lambda x : "(%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ])), is_const = True ):
104
138
pub_string = "const"
@@ -173,10 +207,51 @@ def emit_identifier_module(f):
173
207
pfun = lambda x : "(%s,%s, IdentifierType::%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ]))
174
208
f .write ("}\n \n " )
175
209
210
def emit_confusable_detection_module(f):
    """Write the Rust ``confusable_detection`` module to the file object ``f``.

    The module contains ``char_confusable_prototype``, which looks a char up
    in the ``CONFUSABLES`` table through ``super::util::bsearch_value_table``,
    followed by the table itself, built from ``confusables.txt``.

    Raises ``Exception`` if the loaded table contains the same source code
    point twice, since the emitted lookup is a binary search keyed on it.
    """
    f.write("pub mod confusable_detection {")
    f.write("""

    #[inline]
    pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_value_table(c, CONFUSABLES)
        }
    }

""")

    f.write("    // Confusable table:\n")
    confusable_table = load_confusables("confusables.txt")
    # The generated lookup binary-searches on the source code point, so the
    # table has to be ordered by it.
    confusable_table.sort(key=lambda w: w[0])

    # After sorting, equal keys are adjacent, so comparing neighbours is
    # enough to detect duplicates.
    last_key = None
    for k, _ in confusable_table:
        if k == last_key:
            raise Exception("duplicate keys in confusables table: %s" % k)
        last_key = k

    emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, &'static [char])]", is_pub=False,
               pfun=lambda x: "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1])))
    f.write("}\n\n")
237
+
238
+
176
239
def emit_util_mod (f ):
177
240
f .write ("""
178
241
pub mod util {
179
242
use core::result::Result::{Ok, Err};
243
+
244
+ #[inline]
245
+ pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
246
+ match r.binary_search_by_key(&c, |&(k, _)| k) {
247
+ Ok(idx) => {
248
+ let (_, v) = r[idx];
249
+ Some(v)
250
+ }
251
+ Err(_) => None
252
+ }
253
+ }
254
+
180
255
#[inline]
181
256
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
182
257
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -224,3 +299,5 @@ def emit_util_mod(f):
224
299
emit_util_mod (rf )
225
300
### identifier module
226
301
emit_identifier_module (rf )
302
+ ### confusable_detection module
303
+ emit_confusable_detection_module (rf )
0 commit comments