|
1 | 1 | use super::from_utf8_unchecked;
|
2 |
| -use super::validations::utf8_char_width; |
3 | 2 | use crate::fmt;
|
4 | 3 | use crate::fmt::{Formatter, Write};
|
5 | 4 | use crate::iter::FusedIterator;
|
@@ -197,93 +196,27 @@ impl<'a> Iterator for Utf8Chunks<'a> {
|
197 | 196 | return None;
|
198 | 197 | }
|
199 | 198 |
|
200 |
| - const TAG_CONT_U8: u8 = 128; |
201 |
| - fn safe_get(xs: &[u8], i: usize) -> u8 { |
202 |
| - *xs.get(i).unwrap_or(&0) |
203 |
| - } |
204 |
| - |
205 |
| - let mut i = 0; |
206 |
| - let mut valid_up_to = 0; |
207 |
| - while i < self.source.len() { |
208 |
| - // SAFETY: `i < self.source.len()` per previous line. |
209 |
| - // For some reason the following are both significantly slower: |
210 |
| - // while let Some(&byte) = self.source.get(i) { |
211 |
| - // while let Some(byte) = self.source.get(i).copied() { |
212 |
| - let byte = unsafe { *self.source.get_unchecked(i) }; |
213 |
| - i += 1; |
214 |
| - |
215 |
| - if byte < 128 { |
216 |
| - // This could be a `1 => ...` case in the match below, but for |
217 |
| - // the common case of all-ASCII inputs, we bypass loading the |
218 |
| - // sizeable UTF8_CHAR_WIDTH table into cache. |
219 |
| - } else { |
220 |
| - let w = utf8_char_width(byte); |
221 |
| - |
222 |
| - match w { |
223 |
| - 2 => { |
224 |
| - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
225 |
| - break; |
226 |
| - } |
227 |
| - i += 1; |
228 |
| - } |
229 |
| - 3 => { |
230 |
| - match (byte, safe_get(self.source, i)) { |
231 |
| - (0xE0, 0xA0..=0xBF) => (), |
232 |
| - (0xE1..=0xEC, 0x80..=0xBF) => (), |
233 |
| - (0xED, 0x80..=0x9F) => (), |
234 |
| - (0xEE..=0xEF, 0x80..=0xBF) => (), |
235 |
| - _ => break, |
236 |
| - } |
237 |
| - i += 1; |
238 |
| - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
239 |
| - break; |
240 |
| - } |
241 |
| - i += 1; |
242 |
| - } |
243 |
| - 4 => { |
244 |
| - match (byte, safe_get(self.source, i)) { |
245 |
| - (0xF0, 0x90..=0xBF) => (), |
246 |
| - (0xF1..=0xF3, 0x80..=0xBF) => (), |
247 |
| - (0xF4, 0x80..=0x8F) => (), |
248 |
| - _ => break, |
249 |
| - } |
250 |
| - i += 1; |
251 |
| - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
252 |
| - break; |
253 |
| - } |
254 |
| - i += 1; |
255 |
| - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
256 |
| - break; |
257 |
| - } |
258 |
| - i += 1; |
259 |
| - } |
260 |
| - _ => break, |
261 |
| - } |
| 199 | + match super::from_utf8(self.source) { |
| 200 | + Ok(valid) => { |
| 201 | + // Truncate the slice, no need to touch the pointer. |
| 202 | + self.source = &self.source[..0]; |
| 203 | + Some(Utf8Chunk { valid, invalid: &[] }) |
| 204 | + } |
| 205 | + Err(err) => { |
| 206 | + let valid_up_to = err.valid_up_to(); |
| 207 | + let error_len = err.error_len().unwrap_or(self.source.len() - valid_up_to); |
| 208 | + // SAFETY: `valid_up_to` is the valid UTF-8 string length, so it is in bounds. |
| 209 | + let (valid, remaining) = unsafe { self.source.split_at_unchecked(valid_up_to) }; |
| 210 | + // SAFETY: `error_len` is the erroneous byte sequence length, so it is in bounds. |
| 211 | + let (invalid, after_invalid) = unsafe { remaining.split_at_unchecked(error_len) }; |
| 212 | + self.source = after_invalid; |
| 213 | + Some(Utf8Chunk { |
| 214 | + // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. |
| 215 | + valid: unsafe { from_utf8_unchecked(valid) }, |
| 216 | + invalid, |
| 217 | + }) |
262 | 218 | }
|
263 |
| - |
264 |
| - valid_up_to = i; |
265 | 219 | }
|
266 |
| - |
267 |
| - // SAFETY: `i <= self.source.len()` because it is only ever incremented |
268 |
| - // via `i += 1` and in between every single one of those increments, `i` |
269 |
| - // is compared against `self.source.len()`. That happens either |
270 |
| - // literally by `i < self.source.len()` in the while-loop's condition, |
271 |
| - // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The |
272 |
| - // loop is terminated as soon as the latest `i += 1` has made `i` no |
273 |
| - // longer less than `self.source.len()`, which means it'll be at most |
274 |
| - // equal to `self.source.len()`. |
275 |
| - let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) }; |
276 |
| - self.source = remaining; |
277 |
| - |
278 |
| - // SAFETY: `valid_up_to <= i` because it is only ever assigned via |
279 |
| - // `valid_up_to = i` and `i` only increases. |
280 |
| - let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) }; |
281 |
| - |
282 |
| - Some(Utf8Chunk { |
283 |
| - // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. |
284 |
| - valid: unsafe { from_utf8_unchecked(valid) }, |
285 |
| - invalid, |
286 |
| - }) |
287 | 220 | }
|
288 | 221 | }
|
289 | 222 |
|
|
0 commit comments