@@ -88,7 +88,7 @@ impl EscapeError {
88
88
///
89
89
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
90
90
/// the callback will be called exactly once.
91
- pub fn unescape_non_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
91
+ pub fn unescape_non_mixed < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
92
92
where
93
93
F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
94
94
{
97
97
let mut chars = src. chars ( ) ;
98
98
let res = unescape_char_or_byte ( & mut chars, mode) ;
99
99
callback ( 0 ..( src. len ( ) - chars. as_str ( ) . len ( ) ) , res) ;
100
+ Rfc3349 :: Unused // rfc3349 never triggered by char or byte literals
100
101
}
101
102
Str => unescape_non_raw_common ( src, mode, callback) ,
102
103
RawStr => check_raw_common ( src, mode, callback) ,
@@ -107,7 +108,7 @@ where
107
108
result = Err ( EscapeError :: NulInCStr ) ;
108
109
}
109
110
callback ( r, result)
110
- } ) ;
111
+ } )
111
112
}
112
113
ByteStr { .. } | CStr => unreachable ! ( ) ,
113
114
}
@@ -148,7 +149,7 @@ impl From<u8> for MixedUnit {
148
149
/// a sequence of escaped characters or errors.
149
150
///
150
151
/// Values are returned by invoking `callback`.
151
- pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
152
+ pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
152
153
where
153
154
F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
154
155
{
@@ -160,7 +161,7 @@ where
160
161
result = Err ( EscapeError :: NulInCStr ) ;
161
162
}
162
163
callback ( r, result)
163
- } ) ;
164
+ } )
164
165
}
165
166
Char | Byte | Str | RawStr | RawByteStr { .. } | RawCStr => unreachable ! ( ) ,
166
167
}
@@ -178,6 +179,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
178
179
unescape_char_or_byte ( & mut src. chars ( ) , Byte ) . map ( byte_from_char)
179
180
}
180
181
182
/// Indicates whether rfc3349 (mixed-utf8-literals) was required for a literal
/// to be valid.
///
/// The type is `#[must_use]` so that callers of the unescaping functions do
/// not silently drop this information.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[must_use]
pub enum Rfc3349 {
    /// The literal is only valid with rfc3349 enabled.
    Used,
    /// The literal did not need rfc3349.
    Unused,
}
190
+
181
191
/// What kind of literal do we parse.
182
192
#[ derive( Debug , Clone , Copy , PartialEq ) ]
183
193
pub enum Mode {
@@ -214,24 +224,24 @@ impl Mode {
214
224
215
225
/// Are unicode (non-ASCII) chars allowed?
216
226
#[ inline]
217
- fn allow_unicode_chars ( self ) -> bool {
227
+ fn allow_unicode_chars ( self , rfc3349 : & mut Rfc3349 ) -> bool {
218
228
match self {
219
- Byte | ByteStr { rfc3349 : false } | RawByteStr { rfc3349 : false } => false ,
229
+ Byte => false ,
230
+ ByteStr { .. } | RawByteStr { .. } => { * rfc3349 = Rfc3349 :: Used ; true }
220
231
Char
221
232
| Str
222
233
| RawStr
223
- | ByteStr { rfc3349 : true }
224
- | RawByteStr { rfc3349 : true }
225
234
| CStr
226
235
| RawCStr => true ,
227
236
}
228
237
}
229
238
230
239
/// Are unicode escapes (`\u`) allowed?
231
- fn allow_unicode_escapes ( self ) -> bool {
240
+ fn allow_unicode_escapes ( self , rfc3349 : & mut Rfc3349 ) -> bool {
232
241
match self {
233
- Byte | ByteStr { rfc3349 : false } => false ,
234
- Char | Str | ByteStr { rfc3349 : true } | CStr => true ,
242
+ Byte => false ,
243
+ ByteStr { .. } => { * rfc3349 = Rfc3349 :: Used ; true }
244
+ Char | Str | CStr => true ,
235
245
RawByteStr { .. } | RawStr | RawCStr => unreachable ! ( ) ,
236
246
}
237
247
}
@@ -245,9 +255,12 @@ impl Mode {
245
255
}
246
256
}
247
257
258
+ // `rfc3349` is an out-parameter: it is set to `Rfc3349::Used` if rfc3349 must
259
+ // be enabled for the escape to be accepted.
248
260
fn scan_escape < T : From < char > + From < u8 > > (
249
261
chars : & mut Chars < ' _ > ,
250
262
mode : Mode ,
263
+ rfc3349 : & mut Rfc3349 ,
251
264
) -> Result < T , EscapeError > {
252
265
// Previous character was '\\', unescape what follows.
253
266
let res: char = match chars. next ( ) . ok_or ( EscapeError :: LoneSlash ) ? {
@@ -277,15 +290,17 @@ fn scan_escape<T: From<char> + From<u8>>(
277
290
Ok ( T :: from ( value as u8 ) )
278
291
} ;
279
292
}
280
- // njn: gate: is it a ByteStr?
281
- 'u' => return scan_unicode ( chars, mode. allow_unicode_escapes ( ) ) . map ( T :: from) ,
293
+ 'u' => {
294
+ // njn: convert all mode matches back to equality checks
295
+ return scan_unicode ( chars, mode, rfc3349) . map ( T :: from) ;
296
+ }
282
297
_ => return Err ( EscapeError :: InvalidEscape ) ,
283
298
} ;
284
299
Ok ( T :: from ( res) )
285
300
}
286
301
287
302
// njn: change arg to mode in precursor?
288
- fn scan_unicode ( chars : & mut Chars < ' _ > , allow_unicode_escapes : bool ) -> Result < char , EscapeError > {
303
+ fn scan_unicode ( chars : & mut Chars < ' _ > , mode : Mode , rfc3349 : & mut Rfc3349 ) -> Result < char , EscapeError > {
289
304
// We've parsed '\u', now we have to parse '{..}'.
290
305
291
306
if chars. next ( ) != Some ( '{' ) {
@@ -313,7 +328,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
313
328
314
329
// Incorrect syntax has higher priority for error reporting
315
330
// than unallowed value for a literal.
316
- if !allow_unicode_escapes {
331
+ if !mode . allow_unicode_escapes ( rfc3349 ) {
317
332
return Err ( EscapeError :: UnicodeEscapeInByte ) ;
318
333
}
319
334
@@ -339,19 +354,28 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
339
354
}
340
355
341
356
#[ inline]
342
- fn ascii_check ( c : char , allow_unicode_chars : bool ) -> Result < char , EscapeError > {
343
- if allow_unicode_chars || c. is_ascii ( ) { Ok ( c) } else { Err ( EscapeError :: NonAsciiCharInByte ) }
357
+ fn ascii_check ( c : char , mode : Mode , rfc3349 : & mut Rfc3349 ) -> Result < char , EscapeError > {
358
+ // Note: we must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
359
+ if c. is_ascii ( ) || mode. allow_unicode_chars ( rfc3349) {
360
+ Ok ( c)
361
+ } else {
362
+ Err ( EscapeError :: NonAsciiCharInByte )
363
+ }
344
364
}
345
365
346
366
fn unescape_char_or_byte ( chars : & mut Chars < ' _ > , mode : Mode ) -> Result < char , EscapeError > {
347
367
let c = chars. next ( ) . ok_or ( EscapeError :: ZeroChars ) ?;
368
+ let mut rfc3349 = Rfc3349 :: Unused ;
348
369
let res = match c {
349
- '\\' => scan_escape ( chars, mode) ,
370
+ '\\' => scan_escape ( chars, mode, & mut rfc3349 ) ,
350
371
'\n' | '\t' | '\'' => Err ( EscapeError :: EscapeOnlyChar ) ,
351
372
'\r' => Err ( EscapeError :: BareCarriageReturn ) ,
352
- // njn: this is the only ascii_check that will remain
353
- _ => ascii_check ( c, mode. allow_unicode_chars ( ) ) ,
373
+ _ => ascii_check ( c, mode, & mut rfc3349) ,
354
374
} ?;
375
+
376
+ // rfc3349 cannot be triggered for char or byte literals.
377
+ assert_eq ! ( rfc3349, Rfc3349 :: Unused ) ;
378
+
355
379
if chars. next ( ) . is_some ( ) {
356
380
return Err ( EscapeError :: MoreThanOneChar ) ;
357
381
}
@@ -360,12 +384,12 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
360
384
361
385
/// Takes a contents of a string literal (without quotes) and produces a
362
386
/// sequence of escaped characters or errors.
363
- fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F )
387
+ fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
364
388
where
365
389
F : FnMut ( Range < usize > , Result < T , EscapeError > ) ,
366
390
{
367
391
let mut chars = src. chars ( ) ;
368
- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
392
+ let mut rfc3349 = Rfc3349 :: Unused ;
369
393
370
394
// The `start` and `end` computation here is complicated because
371
395
// `skip_ascii_whitespace` makes us to skip over chars without counting
@@ -385,20 +409,17 @@ where
385
409
} ) ;
386
410
continue ;
387
411
}
388
- _ => scan_escape :: < T > ( & mut chars, mode) ,
412
+ _ => scan_escape :: < T > ( & mut chars, mode, & mut rfc3349 ) ,
389
413
}
390
414
}
391
415
'"' => Err ( EscapeError :: EscapeOnlyChar ) ,
392
416
'\r' => Err ( EscapeError :: BareCarriageReturn ) ,
393
-
394
- // njn: gate, similar to check_raw_common, check:
395
- // - is it a ByteStr AND does it contain a unicode char
396
-
397
- _ => ascii_check ( c, allow_unicode_chars) . map ( T :: from) ,
417
+ _ => ascii_check ( c, mode, & mut rfc3349) . map ( T :: from) ,
398
418
} ;
399
419
let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
400
420
callback ( start..end, res) ;
401
421
}
422
+ rfc3349
402
423
}
403
424
404
425
fn skip_ascii_whitespace < F > ( chars : & mut Chars < ' _ > , start : usize , callback : & mut F )
@@ -431,12 +452,12 @@ where
431
452
/// sequence of characters or errors.
432
453
/// NOTE: Raw strings do not perform any explicit character escaping, here we
433
454
/// only produce errors on bare CR.
434
- fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F )
455
+ fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
435
456
where
436
457
F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
437
458
{
438
459
let mut chars = src. chars ( ) ;
439
- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
460
+ let mut rfc3349 = Rfc3349 :: Unused ;
440
461
441
462
// The `start` and `end` computation here matches the one in
442
463
// `unescape_non_raw_common` for consistency, even though this function
@@ -445,20 +466,12 @@ where
445
466
let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
446
467
let res = match c {
447
468
'\r' => Err ( EscapeError :: BareCarriageReturnInRawString ) ,
448
-
449
- // njn: gate: need to somehow return an indication of whether
450
- // rfc3349 unicode char allowance was required for this literal,
451
- // i.e. check
452
- // - is it a RawByteStr AND does it contain a unicode char
453
- //
454
- // njn: but the ascii_check itself isn't necessary
455
- // - or make it return three values? ok, ok-with-3349, bad?
456
-
457
- _ => ascii_check ( c, allow_unicode_chars) ,
469
+ _ => ascii_check ( c, mode, & mut rfc3349) ,
458
470
} ;
459
471
let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
460
472
callback ( start..end, res) ;
461
473
}
474
+ rfc3349
462
475
}
463
476
464
477
#[ inline]
0 commit comments