Skip to content

Commit 11e3de3

Browse files
blussdotdash authored and committed
Add fast path for ASCII in UTF-8 validation
This speeds up the ascii case (and long stretches of ascii in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input, we can improve throughput (megabytes verified / second) by a factor of 13 to 14! On xml and mostly english language input (en.wikipedia xml dump), throughput increases by a factor 7. On mostly non-ASCII input, performance increases slightly or is the same. The UTF-8 validation is rewritten to use indexed access; since all access is preceded by a (mandatory for validation) length check, they are statically elided by llvm and this formulation is in fact the best for performance. A previous version had losses due to slice to iterator conversions. A large credit to Björn Steinbrink who improved this patch immensely, writing this second version. Benchmark results on x86-64 (Sandy Bridge) compiled with -C opt-level=3. Old code is `regular`, this PR is called `fast`. Datasets: - `ascii` is just ascii (2.5 kB) - `cyr` is cyrillic script with ascii spaces (5 kB) - `dewik10` is 10MB of a de.wikipedia xml dump - `enwik8` is 100MB of an en.wikipedia xml dump - `jawik10` is 10MB of a ja.wikipedia xml dump ``` test from_utf8_ascii_fast ... bench: 140 ns/iter (+/- 4) = 18221 MB/s test from_utf8_ascii_regular ... bench: 1,932 ns/iter (+/- 19) = 1320 MB/s test from_utf8_cyr_fast ... bench: 10,025 ns/iter (+/- 245) = 511 MB/s test from_utf8_cyr_regular ... bench: 12,250 ns/iter (+/- 437) = 418 MB/s test from_utf8_dewik10_fast ... bench: 6,017,909 ns/iter (+/- 105,755) = 1740 MB/s test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s test from_utf8_enwik8_fast ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s test from_utf8_enwik8_regular ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s test from_utf8_jawik10_fast ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s test from_utf8_jawik10_regular ... 
bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s ``` Co-authored-by: Björn Steinbrink <[email protected]>
1 parent 42c3ef8 commit 11e3de3

File tree

2 files changed

+69
-27
lines changed

2 files changed

+69
-27
lines changed

src/libcollectionstest/str.rs

+12
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,18 @@ fn test_is_utf8() {
470470
assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
471471
}
472472

473+
#[test]
fn from_utf8_mostly_ascii() {
    // deny invalid bytes embedded in long stretches of ascii
    for i in 32..64 {
        // Try both an always-invalid lead byte (0xC0) and a valid
        // lead byte with a missing continuation (0xC2 followed by 0x00).
        for &bad in &[0xC0u8, 0xC2u8] {
            let mut data = [0u8; 128];
            data[i] = bad;
            assert!(from_utf8(&data).is_err());
        }
    }
}
484+
473485
#[test]
474486
fn test_is_utf16() {
475487
use rustc_unicode::str::is_utf16;

src/libcore/str/mod.rs

+57-27
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ use option::Option::{self, None, Some};
3232
use raw::{Repr, Slice};
3333
use result::Result::{self, Ok, Err};
3434
use slice::{self, SliceExt};
35+
use usize;
3536

3637
pub mod pattern;
3738

@@ -240,7 +241,7 @@ impl Utf8Error {
240241
/// ```
241242
#[stable(feature = "rust1", since = "1.0.0")]
242243
pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
243-
try!(run_utf8_validation_iterator(&mut v.iter()));
244+
try!(run_utf8_validation(v));
244245
Ok(unsafe { from_utf8_unchecked(v) })
245246
}
246247

@@ -1074,46 +1075,44 @@ unsafe fn cmp_slice(a: &str, b: &str, len: usize) -> i32 {
10741075
}
10751076

10761077
/*
1077-
Section: Misc
1078+
Section: UTF-8 validation
10781079
*/
10791080

1081+
// Mask with the high bit of every byte set. The u64 literal is
// deliberately truncated on 32-bit targets, which leaves exactly the
// per-byte high bits (0x80808080) set.
const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;

/// Returns `true` if any byte of `word` has its high bit set, i.e. the
/// word contains at least one non-ascii byte (>= 128).
#[inline]
fn contains_nonascii(word: usize) -> bool {
    // `&` binds tighter than `!=`, so this tests the masked word.
    word & NONASCII_MASK != 0
}
1089+
10801090
/// Walk through `v` checking that it's a valid UTF-8 sequence,
10811091
/// returning `Ok(())` in that case, or, if it is invalid, an `Err`
10821092
/// whose `valid_up_to` is the index of the first byte of the
10831093
/// invalid sequence.
10841094
#[inline(always)]
1085-
fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
1086-
-> Result<(), Utf8Error> {
1087-
let whole = iter.as_slice();
1088-
loop {
1089-
// save the current thing we're pointing at.
1090-
let old = iter.clone();
1091-
1092-
// restore the iterator we had at the start of this codepoint.
1095+
fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
1096+
let mut offset = 0;
1097+
let len = v.len();
1098+
while offset < len {
1099+
let old_offset = offset;
10931100
macro_rules! err { () => {{
1094-
*iter = old.clone();
10951101
return Err(Utf8Error {
1096-
valid_up_to: whole.len() - iter.as_slice().len()
1102+
valid_up_to: old_offset
10971103
})
10981104
}}}
10991105

1100-
macro_rules! next { () => {
1101-
match iter.next() {
1102-
Some(a) => *a,
1103-
// we needed data, but there was none: error!
1104-
None => err!(),
1106+
macro_rules! next { () => {{
1107+
offset += 1;
1108+
// we needed data, but there was none: error!
1109+
if offset >= len {
1110+
err!()
11051111
}
1106-
}}
1107-
1108-
let first = match iter.next() {
1109-
Some(&b) => b,
1110-
// we're at the end of the iterator and a codepoint
1111-
// boundary at the same time, so this string is valid.
1112-
None => return Ok(())
1113-
};
1112+
v[offset]
1113+
}}}
11141114

1115-
// ASCII characters are always valid, so only large
1116-
// bytes need more examination.
1115+
let first = v[offset];
11171116
if first >= 128 {
11181117
let w = UTF8_CHAR_WIDTH[first as usize];
11191118
let second = next!();
@@ -1156,8 +1155,39 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
11561155
}
11571156
_ => err!()
11581157
}
1158+
offset += 1;
1159+
} else {
1160+
// Ascii case, try to skip forward quickly.
1161+
let ptr = v.as_ptr();
1162+
let align = (ptr as usize + offset) & (usize::BYTES - 1);
1163+
if align == 0 {
1164+
// When the pointer is aligned, read 2 words of data per iteration
1165+
// until we find a word containing a non-ascii byte.
1166+
while offset <= len - 2 * usize::BYTES {
1167+
unsafe {
1168+
let u = *(ptr.offset(offset as isize) as *const usize);
1169+
let v = *(ptr.offset((offset + usize::BYTES) as isize) as *const usize);
1170+
1171+
// break if there is a nonascii byte
1172+
let zu = contains_nonascii(u);
1173+
let zv = contains_nonascii(v);
1174+
if zu || zv {
1175+
break;
1176+
}
1177+
}
1178+
offset += usize::BYTES * 2;
1179+
}
1180+
// step from the point where the wordwise loop stopped
1181+
while offset < len && v[offset] < 128 {
1182+
offset += 1;
1183+
}
1184+
} else {
1185+
offset += 1;
1186+
}
11591187
}
11601188
}
1189+
1190+
Ok(())
11611191
}
11621192

11631193
// https://tools.ietf.org/html/rfc3629

0 commit comments

Comments
 (0)