@@ -58,6 +58,11 @@ impl TranslatorBuilder {
58
58
/// When disabled (the default), the translator is guaranteed to produce
59
59
/// an expression that will only ever match valid UTF-8 (otherwise, the
60
60
/// translator will return an error).
61
+ ///
62
+ /// Note that currently, even when invalid UTF-8 is banned, the translator
63
+ /// will permit a negated ASCII word boundary (i.e., `(?-u:\B)`) even
64
+ /// though it can actually match at invalid UTF-8 boundaries. This bug
65
+ /// will be fixed in the next semver-incompatible release.
61
66
pub fn allow_invalid_utf8 (
62
67
& mut self ,
63
68
yes : bool ,
@@ -290,7 +295,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
290
295
self . push ( HirFrame :: Expr ( try!( self . hir_dot ( span) ) ) ) ;
291
296
}
292
297
Ast :: Assertion ( ref x) => {
293
- self . push ( HirFrame :: Expr ( self . hir_assertion ( x) ) ) ;
298
+ self . push ( HirFrame :: Expr ( try! ( self . hir_assertion ( x) ) ) ) ;
294
299
}
295
300
Ast :: Class ( ast:: Class :: Perl ( ref x) ) => {
296
301
if self . flags ( ) . unicode ( ) {
@@ -679,10 +684,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
679
684
} )
680
685
}
681
686
682
- fn hir_assertion ( & self , asst : & ast:: Assertion ) -> Hir {
687
+ fn hir_assertion ( & self , asst : & ast:: Assertion ) -> Result < Hir > {
683
688
let unicode = self . flags ( ) . unicode ( ) ;
684
689
let multi_line = self . flags ( ) . multi_line ( ) ;
685
- match asst. kind {
690
+ Ok ( match asst. kind {
686
691
ast:: AssertionKind :: StartLine => {
687
692
Hir :: anchor ( if multi_line {
688
693
hir:: Anchor :: StartLine
@@ -714,10 +719,20 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
714
719
Hir :: word_boundary ( if unicode {
715
720
hir:: WordBoundary :: UnicodeNegate
716
721
} else {
722
+ // It is possible for negated ASCII word boundaries to
723
+ // match at invalid UTF-8 boundaries, even when searching
724
+ // valid UTF-8.
725
+ //
726
+ // TODO(ag): Enable this error when regex goes to 1.0.
727
+ // Otherwise, it is too steep of a breaking change.
728
+ // if !self.trans().allow_invalid_utf8 {
729
+ // return Err(self.error(
730
+ // asst.span, ErrorKind::InvalidUtf8));
731
+ // }
717
732
hir:: WordBoundary :: AsciiNegate
718
733
} )
719
734
}
720
- }
735
+ } )
721
736
}
722
737
723
738
fn hir_group ( & self , group : & ast:: Group , expr : Hir ) -> Hir {
@@ -1490,7 +1505,15 @@ mod tests {
1490
1505
assert_eq ! ( t( r"\b" ) , hir_word( hir:: WordBoundary :: Unicode ) ) ;
1491
1506
assert_eq ! ( t( r"\B" ) , hir_word( hir:: WordBoundary :: UnicodeNegate ) ) ;
1492
1507
assert_eq ! ( t( r"(?-u)\b" ) , hir_word( hir:: WordBoundary :: Ascii ) ) ;
1493
- assert_eq ! ( t( r"(?-u)\B" ) , hir_word( hir:: WordBoundary :: AsciiNegate ) ) ;
1508
+ assert_eq ! (
1509
+ t_bytes( r"(?-u)\B" ) ,
1510
+ hir_word( hir:: WordBoundary :: AsciiNegate ) ) ;
1511
+
1512
+ // TODO(ag): Enable this test when regex goes to 1.0.
1513
+ // assert_eq!(t_err(r"(?-u)\B"), TestError {
1514
+ // kind: hir::ErrorKind::InvalidUtf8,
1515
+ // span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
1516
+ // });
1494
1517
}
1495
1518
1496
1519
#[ test]
@@ -2355,13 +2378,13 @@ mod tests {
2355
2378
assert ! ( t_bytes( r"[^a][^a]" ) . is_always_utf8( ) ) ;
2356
2379
assert ! ( t_bytes( r"\b" ) . is_always_utf8( ) ) ;
2357
2380
assert ! ( t_bytes( r"\B" ) . is_always_utf8( ) ) ;
2381
+ assert ! ( t_bytes( r"(?-u)\b" ) . is_always_utf8( ) ) ;
2358
2382
2359
2383
// Negative examples.
2360
2384
assert ! ( !t_bytes( r"(?-u)\xFF" ) . is_always_utf8( ) ) ;
2361
2385
assert ! ( !t_bytes( r"(?-u)\xFF\xFF" ) . is_always_utf8( ) ) ;
2362
2386
assert ! ( !t_bytes( r"(?-u)[^a]" ) . is_always_utf8( ) ) ;
2363
2387
assert ! ( !t_bytes( r"(?-u)[^a][^a]" ) . is_always_utf8( ) ) ;
2364
- assert ! ( !t_bytes( r"(?-u)\b" ) . is_always_utf8( ) ) ;
2365
2388
assert ! ( !t_bytes( r"(?-u)\B" ) . is_always_utf8( ) ) ;
2366
2389
}
2367
2390
@@ -2490,7 +2513,7 @@ mod tests {
2490
2513
assert ! ( t( r"\A" ) . is_match_empty( ) ) ;
2491
2514
assert ! ( t( r"\z" ) . is_match_empty( ) ) ;
2492
2515
assert ! ( t( r"\B" ) . is_match_empty( ) ) ;
2493
- assert ! ( t ( r"(?-u)\B" ) . is_match_empty( ) ) ;
2516
+ assert ! ( t_bytes ( r"(?-u)\B" ) . is_match_empty( ) ) ;
2494
2517
2495
2518
// Negative examples.
2496
2519
assert ! ( !t( r"a+" ) . is_match_empty( ) ) ;
0 commit comments