Skip to content

Commit fa5b4c6

Browse files
committed
Update to Unicode 13.0 and implement confusable detection.
1 parent 916eec5 commit fa5b4c6

File tree

6 files changed

+3839
-1186
lines changed

6 files changed

+3839
-1186
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ exclude = [ "target/*", "Cargo.lock" ]
1717

1818
[dependencies]
1919
unicode-script = { version = "0.4.0", default-features = false }
20+
unicode-normalization = { version = "0.1.12", default-features = false }
2021
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
2122
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
2223
compiler_builtins = { version = "0.1", optional = true }

scripts/unicode.py

Lines changed: 79 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
3535
'''
3636

37-
UNICODE_VERSION = (12, 1, 0)
37+
UNICODE_VERSION = (13, 0, 0)
3838

3939
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
4040

@@ -54,7 +54,7 @@ def load_properties(f, interestingprops = None):
5454
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
5555
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
5656

57-
for line in fileinput.input(os.path.basename(f)):
57+
for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
5858
prop = None
5959
d_lo = 0
6060
d_hi = 0
@@ -81,6 +81,28 @@ def load_properties(f, interestingprops = None):
8181

8282
return props
8383

84+
def load_confusables(f):
    """Parse a confusables data file into a list of mappings.

    ``fetch(f)`` is called first (defined elsewhere in this script --
    presumably it makes sure the file is available locally; confirm).  The
    file is then read from the current directory under its base name and
    decoded as UTF-8.

    Returns a list of ``(source, prototype)`` tuples in file order, where
    ``source`` is a code point (int) and ``prototype`` is the list of code
    points (ints) from the second column.  Lines that do not match the
    mapping pattern are skipped.

    Raises ``Exception`` if a mapping line has more than one code point in
    its first column.
    """
    fetch(f)
    confusables = []
    # Two columns of space-terminated hex code points, each closed by ";\t",
    # followed by an optional word, e.g. "05AD ;\t0596 ;\tMA".
    re1 = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")

    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        m = re1.match(line)
        if not m:
            continue
        d_inputs = m.group(1).split()
        # The regex admits several code points in the first column, but the
        # table is keyed by a single source character.
        if len(d_inputs) != 1:
            raise Exception('More than one code point in first column')
        d_input = int(d_inputs[0], 16)
        d_outputs = [int(d_output, 16) for d_output in m.group(2).split()]
        confusables.append((d_input, d_outputs))

    return confusables
105+
84106
def format_table_content(f, content, indent):
85107
line = " "*indent
86108
first = True
@@ -99,6 +121,18 @@ def format_table_content(f, content, indent):
99121
def escape_char(c):
    """Format code point ``c`` (an int) as a Rust char literal such as ``'\\u{41}'``."""
    return "'\\u{%x}'" % c

def escape_char_list(l):
    """Format an iterable of code points as a Rust array literal of chars.

    Example: ``[0x41, 0x42]`` becomes ``['\\u{41}', '\\u{42}']``; an empty
    input becomes ``[]``.
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135+
102136
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
103137
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
104138
pub_string = "const"
@@ -173,10 +207,51 @@ def emit_identifier_module(f):
173207
pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
174208
f.write("}\n\n")
175209

210+
def emit_confusable_detection_module(f):
    """Write the Rust ``confusable_detection`` module to the file object ``f``.

    The emitted module contains a ``char_confusable_prototype`` lookup function
    and a private ``CONFUSABLES`` table built from ``confusables.txt`` (loaded
    through ``load_confusables``).

    Raises ``Exception`` if two table entries share the same source code point.
    """
    # NOTE(review): the indentation inside the emitted Rust text below only
    # affects the layout of the generated file, not its meaning.
    f.write("pub mod confusable_detection {")
    f.write("""

    #[inline]
    pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_value_table(c, CONFUSABLES)
        }
    }

""")

    f.write("    // Confusable table:\n")
    confusable_table = load_confusables("confusables.txt")
    # The emitted lookup goes through bsearch_value_table, so the table is
    # written out ordered by source code point.
    confusable_table.sort(key=lambda w: w[0])

    # After sorting, equal keys are adjacent, so comparing each key with its
    # predecessor is enough to detect duplicates.
    last_key = None
    for (k, v) in confusable_table:
        if k == last_key:
            raise Exception("duplicate keys in confusables table: %s" % k)
        last_key = k

    # Each entry is rendered as "(source_char, &[prototype_chars...])".
    emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, &'static [char])]", is_pub=False,
            pfun=lambda x: "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1])))
    # Close the `pub mod confusable_detection {` opened above.
    f.write("}\n\n")
237+
238+
176239
def emit_util_mod(f):
177240
f.write("""
178241
pub mod util {
179242
use core::result::Result::{Ok, Err};
243+
244+
#[inline]
245+
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
246+
match r.binary_search_by_key(&c, |&(k, _)| k) {
247+
Ok(idx) => {
248+
let (_, v) = r[idx];
249+
Some(v)
250+
}
251+
Err(_) => None
252+
}
253+
}
254+
180255
#[inline]
181256
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
182257
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -224,3 +299,5 @@ def emit_util_mod(f):
224299
emit_util_mod(rf)
225300
### identifier module
226301
emit_identifier_module(rf)
302+
### confusable_detection module
303+
emit_confusable_detection_module(rf)

src/confusable_detection.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
//! [Confusable detection](https://www.unicode.org/reports/tr39/#Confusable_Detection)
2+
3+
use core::iter;
4+
5+
/// An iterator that yields either a single item or every item of an inner
/// iterator.
///
/// This lets a function return one concrete iterator type whether it has a
/// single value in hand (`Once`) or a whole sequence to hand out (`More`).
enum OnceOrMore<T, I> {
    /// Yields exactly one item.
    Once(iter::Once<T>),
    /// Delegates to the wrapped iterator.
    More(I),
}

impl<T, I> Iterator for OnceOrMore<T, I>
where
    I: Iterator<Item = T>,
{
    type Item = T;

    fn next(&mut self) -> Option<T> {
        match self {
            OnceOrMore::Once(v) => v.next(),
            OnceOrMore::More(i) => i.next(),
        }
    }

    // Forward the active variant's bounds; the default `(0, None)` would hide
    // the known length from consumers that use it to pre-allocate.
    fn size_hint(&self) -> (usize, Option<usize>) {
        match self {
            OnceOrMore::Once(v) => v.size_hint(),
            OnceOrMore::More(i) => i.size_hint(),
        }
    }
}
24+
25+
type StaticSliceIterCloned = core::iter::Cloned<core::slice::Iter<'static, char>>;
26+
27+
fn char_prototype(c: char) -> OnceOrMore<char, StaticSliceIterCloned> {
28+
use crate::tables::confusable_detection::char_confusable_prototype;
29+
match char_confusable_prototype(c) {
30+
None => OnceOrMore::Once(iter::once(c)),
31+
Some(l) => OnceOrMore::More(l.iter().cloned()),
32+
}
33+
}
34+
35+
/// Calculate skeleton for string, as defined by UTS 39
36+
pub fn skeleton(s: &str) -> impl Iterator<Item = char> + '_ {
37+
use unicode_normalization::UnicodeNormalization;
38+
s.chars().nfd().flat_map(char_prototype).nfd()
39+
}

src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
#![cfg_attr(feature = "bench", feature(test))]
5050
#![no_std]
5151

52+
extern crate alloc;
53+
5254
#[cfg(test)]
5355
#[macro_use]
5456
extern crate std;
@@ -58,10 +60,12 @@ extern crate test;
5860

5961
pub use tables::UNICODE_VERSION;
6062

63+
pub mod confusable_detection;
6164
pub mod general_security_profile;
6265
pub mod mixed_script;
6366
pub mod restriction_level;
6467

68+
pub use confusable_detection::skeleton;
6569
pub use general_security_profile::GeneralSecurityProfile;
6670
pub use mixed_script::MixedScript;
6771
pub use restriction_level::{RestrictionLevel, RestrictionLevelDetection};

0 commit comments

Comments
 (0)