Skip to content

move Unicode width() functions to unicode_width crate #24402

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 1 addition & 141 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
#
# Copyright 2011-2013 The Rust Project Developers. See the COPYRIGHT
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
Expand All @@ -13,7 +13,6 @@
# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - DerivedNormalizationProps.txt
# - EastAsianWidth.txt
# - auxiliary/GraphemeBreakProperty.txt
# - PropList.txt
# - ReadMe.txt
Expand Down Expand Up @@ -236,43 +235,6 @@ def load_properties(f, interestingprops):
props[prop].append((d_lo, d_hi))
return props

# Load the East Asian Width ranges whose width class is in want_widths,
# skipping every range whose general category is in except_cats.
def load_east_asian_width(want_widths, except_cats):
    """Parse EastAsianWidth.txt into a dict of width class -> list of ranges.

    want_widths -- width classes to keep (the caller passes e.g. "W", "F", "A").
    except_cats -- general categories to drop, taken from the trailing
                   "# Cat" comment on each data line.

    Returns a dict mapping each kept width class to a list of inclusive
    (lo, hi) code point pairs, in file order. A width class that never
    occurs in the file has no key in the result.
    """
    f = "EastAsianWidth.txt"
    fetch(f)
    widths = {}
    # A data line is either "XXXX;W  # Cat" or "XXXX..YYYY;W  # Cat"; the
    # optional group captures the range end when present. Raw string so the
    # regex escapes are not (mis)interpreted as string escapes.
    entry_re = re.compile(r"^([0-9A-F]+)(?:\.\.([0-9A-F]+))?;(\w+) +# (\w+)")

    for line in fileinput.input(f):
        m = entry_re.match(line)
        if not m:
            # comment, blank line, or anything else that is not a data line
            continue
        lo, hi, width, cat = m.groups()
        if cat in except_cats or width not in want_widths:
            continue
        d_lo = int(lo, 16)
        # a single code point is stored as a one-element range
        d_hi = int(hi, 16) if hi is not None else d_lo
        widths.setdefault(width, []).append((d_lo, d_hi))
    return widths

def escape_char(c):
    # Render the integer code point c as a Rust char literal that uses a
    # lowercase-hex unicode escape inside single quotes.
    hex_digits = "%x" % c
    return "'\\u{" + hex_digits + "}'"

Expand Down Expand Up @@ -395,48 +357,6 @@ def emit_grapheme_module(f, grapheme_table, grapheme_cats):
is_pub=False)
f.write("}\n")

# Write the generated Rust module `charwidth` to the file object f: its `use`
# lines, a private range-table lookup helper, the public `width` function, and
# the `charwidth_table` data built from width_table.
def emit_charwidth_module(f, width_table):
f.write("pub mod charwidth {\n")
f.write(" use core::option::Option;\n")
f.write(" use core::option::Option::{Some, None};\n")
f.write(" use core::slice::SliceExt;\n")
f.write(" use core::result::Result::{Ok, Err};\n")
# Generated helper: binary-searches the (lo, hi, width, width_cjk) ranges for
# c and returns the CJK or non-CJK width; a character found in no range gets 1.
f.write("""
fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (_, _, r_ncjk, r_cjk) = r[idx];
if is_cjk { r_cjk } else { r_ncjk }
}
Err(_) => 1
}
}
""")

# Generated entry point: NUL is Some(0); the other code points below 0x20 and
# those in 0x7F..0x9F are None; 0x20..0x7E is Some(1); everything from 0xA0 up
# is looked up in the table.
f.write("""
pub fn width(c: char, is_cjk: bool) -> Option<usize> {
match c as usize {
_c @ 0 => Some(0), // null is zero width
cu if cu < 0x20 => None, // control sequences have no width
cu if cu < 0x7F => Some(1), // ASCII
cu if cu < 0xA0 => None, // more control sequences
_ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as usize)
}
}

""")

# Emit the table itself. Each width_table row is (lo, hi, width, width_cjk);
# pfun renders lo/hi through escape_char and passes the two widths through.
f.write(" // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n")
f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
f.write("}\n\n")

def emit_norm_module(f, canon, compat, combine, norm_props):
canon_keys = canon.keys()
canon_keys.sort()
Expand Down Expand Up @@ -527,43 +447,6 @@ def comp_pfun(char):

""")

def remove_from_wtable(wtable, val):
    """Return a new width table with the single code point ``val`` removed.

    wtable -- list of (lo, hi, width, width_cjk) inclusive ranges, sorted by
              lo. It is read only; the caller's list is left untouched.
    val    -- code point to cut out of whichever range contains it.

    Ranges entirely below ``val`` are copied as-is. A range containing
    ``val`` is shrunk, split in two, or dropped (when it is exactly
    ``val``). Scanning stops at the first range that starts above ``val``;
    that range and everything after it are copied unchanged.
    """
    wtable_out = []
    for idx, entry in enumerate(wtable):
        if entry[1] < val:
            # range ends before val: keep it
            wtable_out.append(entry)
        elif entry[0] > val:
            # sorted input: nothing from here on can contain val
            wtable_out.extend(wtable[idx:])
            break
        else:
            (wt_lo, wt_hi, width, width_cjk) = entry
            # keep whatever lies strictly below and strictly above val
            if wt_lo < val:
                wtable_out.append((wt_lo, val - 1, width, width_cjk))
            if wt_hi > val:
                wtable_out.append((val + 1, wt_hi, width, width_cjk))
    return wtable_out



def optimize_width_table(wtable):
    """Collapse adjacent ranges that carry identical widths.

    wtable -- list of (lo, hi, width, width_cjk) inclusive ranges, sorted by
              lo. It is read only; the caller's list is left untouched.

    Two consecutive ranges are merged when the second starts right after the
    first ends AND both the non-CJK and the CJK width agree. Returns the
    merged list; an empty table yields an empty list.
    """
    if not wtable:
        return []
    wtable_out = []
    w_this = wtable[0]
    for w_next in wtable[1:]:
        # [2:4] compares width and width_cjk; comparing the non-CJK width
        # alone would let a merge overwrite the first range's CJK width.
        if w_this[1] == w_next[0] - 1 and w_this[2:4] == w_next[2:4]:
            w_this = (w_this[0], w_next[1], w_next[2], w_next[3])
        else:
            wtable_out.append(w_this)
            w_this = w_next
    wtable_out.append(w_this)
    return wtable_out

if __name__ == "__main__":
r = "tables.rs"
if os.path.exists(r):
Expand Down Expand Up @@ -605,29 +488,6 @@ def optimize_width_table(wtable):
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
emit_conversions_module(rf, lowerupper, upperlower)

### character width module
width_table = []
for zwcat in ["Me", "Mn", "Cf"]:
width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat]))
width_table.append((4448, 4607, 0, 0))

# get widths, except those that are explicitly marked zero-width above
ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"])
# these are doublewidth
for dwcat in ["W", "F"]:
width_table.extend(map(lambda (lo, hi): (lo, hi, 2, 2), ea_widths[dwcat]))
width_table.extend(map(lambda (lo, hi): (lo, hi, 1, 2), ea_widths["A"]))

width_table.sort(key=lambda w: w[0])

# soft hyphen is not zero width in preformatted text; it's used to indicate
# a hyphen inserted to facilitate a linebreak.
width_table = remove_from_wtable(width_table, 173)

# optimize the width table by collapsing adjacent entities when possible
width_table = optimize_width_table(width_table)
emit_charwidth_module(rf, width_table)

### grapheme cluster module
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])
Expand Down
19 changes: 0 additions & 19 deletions src/libcollections/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1736,25 +1736,6 @@ impl str {
UnicodeStr::words(&self[..])
}

/// Computes the number of display columns this string occupies.
///
/// Control characters contribute zero columns.
///
/// The `is_cjk` flag selects how characters in the East Asian "Ambiguous"
/// category are counted: 2 columns each when `is_cjk` is `true`, 1 column
/// each when it is `false`. Pass `true` in CJK locales and `false`
/// elsewhere. When the locale is unknown,
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// recommends treating these characters as 1 column wide
/// (i.e., `is_cjk = false`).
#[unstable(feature = "unicode",
           reason = "this functionality may only be provided by libunicode")]
pub fn width(&self, is_cjk: bool) -> usize {
    // Delegate to the trait implementation on the full string slice.
    let whole: &str = &self[..];
    UnicodeStr::width(whole, is_cjk)
}

/// Returns a `&str` with leading and trailing whitespace removed.
///
/// # Examples
Expand Down
9 changes: 0 additions & 9 deletions src/libcollectionstest/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,6 @@ fn test_len() {
assert_eq!("\u{2620}".chars().count(), 1);
assert_eq!("\u{1d11e}".chars().count(), 1);
assert_eq!("ประเทศไทย中华Việt Nam".chars().count(), 19);

assert_eq!("\u{FF48}\u{FF45}\u{FF4C}\u{FF4C}\u{FF4F}".width(false), 10);
assert_eq!("\u{FF48}\u{FF45}\u{FF4C}\u{FF4C}\u{FF4F}".width(true), 10);
assert_eq!("\0\0\0\0\0".width(false), 0);
assert_eq!("\0\0\0\0\0".width(true), 0);
assert_eq!("".width(false), 0);
assert_eq!("".width(true), 0);
assert_eq!("\u{2081}\u{2082}\u{2083}\u{2084}".width(false), 4);
assert_eq!("\u{2081}\u{2082}\u{2083}\u{2084}".width(true), 8);
}

#[test]
Expand Down
27 changes: 0 additions & 27 deletions src/libcoretest/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,30 +209,3 @@ fn test_len_utf16() {
assert!('\u{a66e}'.len_utf16() == 1);
assert!('\u{1f4a9}'.len_utf16() == 2);
}

#[test]
fn test_width() {
    // NUL is the one control character with a defined (zero) width.
    assert_eq!('\x00'.width(false), Some(0));
    assert_eq!('\x00'.width(true), Some(0));

    // Other control characters have no width at all.
    assert_eq!('\x0A'.width(false), None);
    assert_eq!('\x0A'.width(true), None);

    // Printable ASCII is always one column.
    assert_eq!('w'.width(false), Some(1));
    assert_eq!('w'.width(true), Some(1));

    // U+FF48 FULLWIDTH LATIN SMALL LETTER H is two columns in both modes.
    // Written as an escape so it cannot be folded to ASCII 'h' (width 1).
    assert_eq!('\u{FF48}'.width(false), Some(2));
    assert_eq!('\u{FF48}'.width(true), Some(2));

    // Soft hyphen is deliberately not zero width.
    assert_eq!('\u{AD}'.width(false), Some(1));
    assert_eq!('\u{AD}'.width(true), Some(1));

    // U+1160 is expected to be zero width.
    assert_eq!('\u{1160}'.width(false), Some(0));
    assert_eq!('\u{1160}'.width(true), Some(0));

    // U+00A1 is expected to be 1 column, or 2 when is_cjk is set.
    assert_eq!('\u{a1}'.width(false), Some(1));
    assert_eq!('\u{a1}'.width(true), Some(2));

    // U+0300 is expected to be zero width.
    assert_eq!('\u{300}'.width(false), Some(0));
    assert_eq!('\u{300}'.width(true), Some(0));
}
5 changes: 2 additions & 3 deletions src/librustc_driver/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
#![feature(staged_api)]
#![feature(exit_status)]
#![feature(set_stdio)]
#![feature(unicode)]

extern crate arena;
extern crate flate;
Expand Down Expand Up @@ -574,7 +573,7 @@ Available lint options:
let builtin_groups = sort_lint_groups(builtin_groups);

let max_name_len = plugin.iter().chain(builtin.iter())
.map(|&s| s.name.width(true))
.map(|&s| s.name.chars().count())
.max().unwrap_or(0);
let padded = |x: &str| {
let mut s = repeat(" ").take(max_name_len - x.chars().count())
Expand All @@ -601,7 +600,7 @@ Available lint options:


let max_name_len = plugin_groups.iter().chain(builtin_groups.iter())
.map(|&(s, _)| s.width(true))
.map(|&(s, _)| s.chars().count())
.max().unwrap_or(0);
let padded = |x: &str| {
let mut s = repeat(" ").take(max_name_len - x.chars().count())
Expand Down
14 changes: 6 additions & 8 deletions src/libsyntax/diagnostic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ fn highlight_lines(err: &mut EmitterWriter,
let mut s = String::new();
// Skip is the number of characters we need to skip because they are
// part of the 'filename:line ' part of the previous line.
let skip = fm.name.width(false) + digits + 3;
let skip = fm.name.chars().count() + digits + 3;
for _ in 0..skip {
s.push(' ');
}
Expand All @@ -615,7 +615,7 @@ fn highlight_lines(err: &mut EmitterWriter,
col += 8 - col%8;
s.push('\t');
},
c => for _ in 0..c.width(false).unwrap_or(0) {
_ => {
col += 1;
s.push(' ');
},
Expand All @@ -627,7 +627,7 @@ fn highlight_lines(err: &mut EmitterWriter,
let count = match lastc {
// Most terminals have a tab stop every eight columns by default
'\t' => 8 - col%8,
_ => lastc.width(false).unwrap_or(0),
_ => 1,
};
col += count;
s.extend(::std::iter::repeat('~').take(count));
Expand All @@ -638,7 +638,7 @@ fn highlight_lines(err: &mut EmitterWriter,
if pos >= hi.col.to_usize() { break; }
let count = match ch {
'\t' => 8 - col%8,
_ => ch.width(false).unwrap_or(0),
_ => 1,
};
col += count;
s.extend(::std::iter::repeat('~').take(count));
Expand Down Expand Up @@ -694,7 +694,7 @@ fn end_highlight_lines(w: &mut EmitterWriter,
}
let last_line_start = format!("{}:{} ", fm.name, lines[lines.len()-1].line_index + 1);
let hi = cm.lookup_char_pos(sp.hi);
let skip = last_line_start.width(false);
let skip = last_line_start.chars().count();
let mut s = String::new();
for _ in 0..skip {
s.push(' ');
Expand All @@ -710,9 +710,7 @@ fn end_highlight_lines(w: &mut EmitterWriter,
// position.
match ch {
'\t' => s.push('\t'),
c => for _ in 0..c.width(false).unwrap_or(0) {
s.push(' ');
},
_ => s.push(' ')
}
}
}
Expand Down
15 changes: 1 addition & 14 deletions src/libunicode/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
use core::char::CharExt as C;
use core::option::Option::{self, Some};
use core::iter::Iterator;
use tables::{derived_property, property, general_category, conversions, charwidth};
use tables::{derived_property, property, general_category, conversions};

// stable reexports
pub use core::char::{MAX, from_u32, from_digit, EscapeUnicode, EscapeDefault};
Expand Down Expand Up @@ -435,17 +435,4 @@ impl char {
pub fn to_uppercase(self) -> ToUppercase {
ToUppercase(Some(conversions::to_upper(self)))
}

/// Reports how many display columns this character occupies.
///
/// The result is `None` for every control character except `'\x00'`.
///
/// Characters in the East Asian "Ambiguous" category are counted according
/// to `is_cjk`: 2 columns when it is `true`, 1 column when it is `false`.
/// Pass `true` in CJK contexts and `false` otherwise. If the context cannot
/// be reliably determined,
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// recommends treating these characters as 1 column wide
/// (i.e., `is_cjk` = `false`).
#[unstable(feature = "unicode",
           reason = "needs expert opinion. is_cjk flag stands out as ugly")]
pub fn width(self, is_cjk: bool) -> Option<usize> {
    // All of the work happens in the generated `charwidth` table module.
    charwidth::width(self, is_cjk)
}
}
Loading