Skip to content

Commit edd318c

Browse files
committed
Add {floor,ceil}_char_boundary methods to str
1 parent c5e4148 commit edd318c

File tree

5 files changed

+176
-23
lines changed

5 files changed

+176
-23
lines changed

library/alloc/tests/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#![feature(binary_heap_as_slice)]
3030
#![feature(inplace_iteration)]
3131
#![feature(iter_advance_by)]
32+
#![feature(round_char_boundary)]
3233
#![feature(slice_group_by)]
3334
#![feature(slice_partition_dedup)]
3435
#![feature(string_remove_matches)]

library/alloc/tests/str.rs

+92
Original file line numberDiff line numberDiff line change
@@ -2272,3 +2272,95 @@ fn utf8_char_counts() {
22722272
}
22732273
}
22742274
}
2275+
2276+
#[test]
2277+
// Checks that `str::floor_char_boundary` rounds an index down to the start of the
// character that contains it, and clamps indices at or past the end to `len()`.
fn floor_char_boundary() {
2278+
// Asserts that every index yielded by `arg` floors to `ret` for the string `s`.
fn check_many(s: &str, arg: impl IntoIterator<Item = usize>, ret: usize) {
2279+
for idx in arg {
2280+
assert_eq!(
2281+
s.floor_char_boundary(idx),
2282+
ret,
2283+
"{:?}.floor_char_boundary({:?}) != {:?}",
2284+
s,
2285+
idx,
2286+
ret
2287+
);
2288+
}
2289+
}
2290+
2291+
// edge case: empty string, every index (even `usize::MAX`) floors to 0
2292+
check_many("", [0, 1, isize::MAX as usize, usize::MAX], 0);
2293+
2294+
// basic check: indices at or past the end clamp to the length
2295+
check_many("x", [0], 0);
2296+
check_many("x", [1, isize::MAX as usize, usize::MAX], 1);
2297+
2298+
// 1-byte chars: every in-range index is already a boundary
2299+
check_many("jp", [0], 0);
2300+
check_many("jp", [1], 1);
2301+
check_many("jp", 2..4, 2);
2302+
2303+
// 2-byte chars: boundaries at 0, 2 and 4
2304+
check_many("ĵƥ", 0..2, 0);
2305+
check_many("ĵƥ", 2..4, 2);
2306+
check_many("ĵƥ", 4..6, 4);
2307+
2308+
// 3-byte chars: boundaries at 0, 3 and 6
2309+
check_many("日本", 0..3, 0);
2310+
check_many("日本", 3..6, 3);
2311+
check_many("日本", 6..8, 6);
2312+
2313+
// 4-byte chars: the flag is two 4-byte chars, so boundaries are 0, 4 and 8
2314+
check_many("🇯🇵", 0..4, 0);
2315+
check_many("🇯🇵", 4..8, 4);
2316+
check_many("🇯🇵", 8..10, 8);
2317+
}
2318+
2319+
#[test]
2320+
// Checks that `str::ceil_char_boundary` rounds an in-range index up to the next
// character boundary (or leaves it unchanged when it already is one).
fn ceil_char_boundary() {
2321+
// Asserts that every index yielded by `arg` ceils to `ret` for the string `s`.
fn check_many(s: &str, arg: impl IntoIterator<Item = usize>, ret: usize) {
2322+
for idx in arg {
2323+
assert_eq!(
2324+
s.ceil_char_boundary(idx),
2325+
ret,
2326+
"{:?}.ceil_char_boundary({:?}) != {:?}",
2327+
s,
2328+
idx,
2329+
ret
2330+
);
2331+
}
2332+
}
2333+
2334+
// edge case: empty string, the only index that does not panic is 0
2335+
check_many("", [0], 0);
2336+
2337+
// basic check: `index == len()` is allowed and is returned as-is
2338+
check_many("x", [0], 0);
2339+
check_many("x", [1], 1);
2340+
2341+
// 1-byte chars: every in-range index is already a boundary
2342+
check_many("jp", [0], 0);
2343+
check_many("jp", [1], 1);
2344+
check_many("jp", [2], 2);
2345+
2346+
// 2-byte chars: boundaries at 0, 2 and 4
2347+
check_many("ĵƥ", 0..=0, 0);
2348+
check_many("ĵƥ", 1..=2, 2);
2349+
check_many("ĵƥ", 3..=4, 4);
2350+
2351+
// 3-byte chars: boundaries at 0, 3 and 6
2352+
check_many("日本", 0..=0, 0);
2353+
check_many("日本", 1..=3, 3);
2354+
check_many("日本", 4..=6, 6);
2355+
2356+
// 4-byte chars: the flag is two 4-byte chars, so boundaries are 0, 4 and 8
2357+
check_many("🇯🇵", 0..=0, 0);
2358+
check_many("🇯🇵", 1..=4, 4);
2359+
check_many("🇯🇵", 5..=8, 8);
2360+
}
2361+
2362+
#[test]
2363+
#[should_panic]
2364+
// Unlike `floor_char_boundary`, `ceil_char_boundary` does not clamp: an index
// strictly greater than the string length must panic.
fn ceil_char_boundary_above_len_panic() {
2365+
// "x" has length 1, so index 2 is out of range.
let _ = "x".ceil_char_boundary(2);
2366+
}

library/core/src/num/mod.rs

+5
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,11 @@ impl u8 {
809809
pub fn escape_ascii(&self) -> ascii::EscapeDefault {
810810
ascii::escape_default(*self)
811811
}
812+
813+
/// Returns `true` if this byte is not a UTF-8 continuation byte (`0b10xx_xxxx`),
/// i.e. if it may be the first byte of an encoded character.
pub(crate) fn is_utf8_char_boundary(self) -> bool {
814+
// This is bit magic equivalent to: b < 128 || b >= 192.
// Reinterpreted as `i8`, the continuation bytes 128..=191 map to -128..=-65,
// which are exactly the values below -0x40 (-64).
815+
(self as i8) >= -0x40
816+
}
812817
}
813818

814819
#[lang = "u16"]

library/core/src/str/mod.rs

+78-10
Original file line numberDiff line numberDiff line change
@@ -76,15 +76,14 @@ use iter::MatchIndicesInternal;
7676
use iter::SplitInternal;
7777
use iter::{MatchesInternal, SplitNInternal};
7878

79-
use validations::truncate_to_char_boundary;
80-
8179
#[inline(never)]
8280
#[cold]
8381
#[track_caller]
8482
fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
8583
const MAX_DISPLAY_LENGTH: usize = 256;
86-
let (truncated, s_trunc) = truncate_to_char_boundary(s, MAX_DISPLAY_LENGTH);
87-
let ellipsis = if truncated { "[...]" } else { "" };
84+
let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH);
85+
let s_trunc = &s[..trunc_len];
86+
let ellipsis = if trunc_len < s.len() { "[...]" } else { "" };
8887

8988
// 1. out of bounds
9089
if begin > s.len() || end > s.len() {
@@ -105,10 +104,7 @@ fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
105104
// 3. character boundary
106105
let index = if !s.is_char_boundary(begin) { begin } else { end };
107106
// find the character
108-
let mut char_start = index;
109-
while !s.is_char_boundary(char_start) {
110-
char_start -= 1;
111-
}
107+
let char_start = s.floor_char_boundary(index);
112108
// `char_start` must be less than len and a char boundary
113109
let ch = s[char_start..].chars().next().unwrap();
114110
let char_range = char_start..char_start + ch.len_utf8();
@@ -215,8 +211,80 @@ impl str {
215211
// code on higher opt-levels. See PR #84751 for more details.
216212
None => index == self.len(),
217213

218-
// This is bit magic equivalent to: b < 128 || b >= 192
219-
Some(&b) => (b as i8) >= -0x40,
214+
Some(&b) => b.is_utf8_char_boundary(),
215+
}
216+
}
217+
218+
/// Finds the closest `x` not exceeding `index` where `is_char_boundary(x)` is `true`.
219+
///
220+
/// This method can help you truncate a string so that it's still valid UTF-8, but doesn't
221+
/// exceed a given number of bytes. Note that this is done purely at the character level
222+
/// and can still visually split graphemes, even though the underlying characters aren't
223+
/// split. For example, the emoji 🧑‍🔬 (scientist) could be split so that the string only
224+
/// includes 🧑 (person) instead.
225+
///
226+
/// # Examples
227+
///
228+
/// ```
229+
/// #![feature(round_char_boundary)]
230+
/// let s = "❤️🧡💛💚💙💜";
231+
/// assert_eq!(s.len(), 26);
232+
/// assert!(!s.is_char_boundary(13));
233+
///
234+
/// let closest = s.floor_char_boundary(13);
235+
/// assert_eq!(closest, 10);
236+
/// assert_eq!(&s[..closest], "❤️🧡");
237+
/// ```
238+
#[unstable(feature = "round_char_boundary", issue = "93743")]
239+
#[inline]
240+
pub fn floor_char_boundary(&self, index: usize) -> usize {
241+
// Indices at or past the end clamp to the length, which is always a boundary.
if index >= self.len() {
242+
self.len()
243+
} else {
244+
// Only the byte at `index` and the (up to) three bytes before it need to be
// inspected; `saturating_sub` keeps the window inside the string near the start.
let lower_bound = index.saturating_sub(3);
245+
// Search the window from the back for the last byte that starts a character.
let new_index = self.as_bytes()[lower_bound..=index]
246+
.iter()
247+
.rposition(|b| b.is_utf8_char_boundary());
248+
249+
// SAFETY: `index < self.len()` here, so `index` lies inside some character. A
// UTF-8 character is at most four bytes long, so its first byte is one of the
// bytes in `lower_bound..=index` and `rposition` cannot return `None`.
250+
unsafe { lower_bound + new_index.unwrap_unchecked() }
251+
}
252+
}
253+
254+
/// Finds the closest `x` not below `index` where `is_char_boundary(x)` is `true`.
255+
///
256+
/// This method is the natural complement to [`floor_char_boundary`]. See that method
257+
/// for more details.
258+
///
259+
/// [`floor_char_boundary`]: str::floor_char_boundary
260+
///
261+
/// # Panics
262+
///
263+
/// Panics if `index > self.len()`.
264+
///
265+
/// # Examples
266+
///
267+
/// ```
268+
/// #![feature(round_char_boundary)]
269+
/// let s = "❤️🧡💛💚💙💜";
270+
/// assert_eq!(s.len(), 26);
271+
/// assert!(!s.is_char_boundary(13));
272+
///
273+
/// let closest = s.ceil_char_boundary(13);
274+
/// assert_eq!(closest, 14);
275+
/// assert_eq!(&s[..closest], "❤️🧡💛");
276+
/// ```
277+
#[unstable(feature = "round_char_boundary", issue = "93743")]
278+
#[inline]
279+
pub fn ceil_char_boundary(&self, index: usize) -> usize {
280+
// An index past the end is reported through the shared slicing panic path.
if index > self.len() {
281+
slice_error_fail(self, index, index)
282+
} else {
283+
// Inspect at most four bytes starting at `index`, without running past the end.
let upper_bound = Ord::min(index + 4, self.len());
284+
self.as_bytes()[index..upper_bound]
285+
.iter()
286+
// First byte at or after `index` that starts a character.
.position(|b| b.is_utf8_char_boundary())
287+
// `pos` is relative to the window, so shift it back by `index`. If no byte in
// the window starts a character, fall back to `upper_bound`; for valid UTF-8
// that only happens when the window was cut short by the end of the string
// (including the empty window at `index == len()`), where `upper_bound` is
// `self.len()`.
.map_or(upper_bound, |pos| pos + index)
220288
}
221289
}
222290

library/core/src/str/validations.rs

-13
Original file line numberDiff line numberDiff line change
@@ -273,16 +273,3 @@ pub const fn utf8_char_width(b: u8) -> usize {
273273

274274
/// Mask of the value bits of a continuation byte.
275275
const CONT_MASK: u8 = 0b0011_1111;
276-
277-
// truncate `&str` to length at most equal to `max`
278-
// return `true` if it were truncated, and the new str.
279-
pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
280-
if max >= s.len() {
281-
(false, s)
282-
} else {
283-
while !s.is_char_boundary(max) {
284-
max -= 1;
285-
}
286-
(true, &s[..max])
287-
}
288-
}

0 commit comments

Comments
 (0)