Skip to content

Commit c230e59

Browse files
committed
syntax/hir: fix handling of ASCII word boundaries
Previously, we had some inconsistencies in how we were handling ASCII word boundaries. In particular, the translator was accepting a negated ASCII word boundary even if the caller didn't disable the UTF-8 invariant. This is wrong, since a negated ASCII word boundary can match between any two arbitrary bytes. However, fixing this is a breaking change, so for now we document the bug. We plan to fix it with regex 1.0. See #457. Additionally, we were incorrectly declaring that an ASCII word boundary matched invalid UTF-8 via the Hir::is_always_utf8 property. An ASCII word boundary must always match an ASCII byte on one side, which implies a valid UTF-8 position.
1 parent c7c7a43 commit c230e59

File tree

3 files changed

+36
-11
lines changed

3 files changed

+36
-11
lines changed

regex-syntax/src/hir/mod.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -302,10 +302,7 @@ impl Hir {
302302
// A negated word boundary matches the empty string, but a normal
303303
// word boundary does not!
304304
info.set_match_empty(word_boundary.is_negated());
305-
// ASCII word boundaries can match invalid UTF-8.
306-
if let WordBoundary::Ascii = word_boundary {
307-
info.set_always_utf8(false);
308-
}
305+
// Negated ASCII word boundaries can match invalid UTF-8.
309306
if let WordBoundary::AsciiNegate = word_boundary {
310307
info.set_always_utf8(false);
311308
}

regex-syntax/src/hir/translate.rs

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ impl TranslatorBuilder {
5858
/// When disabled (the default), the translator is guaranteed to produce
5959
/// an expression that will only ever match valid UTF-8 (otherwise, the
6060
/// translator will return an error).
61+
///
62+
/// Note that currently, even when invalid UTF-8 is banned, the translator
63+
/// will permit a negated ASCII word boundary (i.e., `(?-u:\B)`) even
64+
/// though it can actually match at invalid UTF-8 boundaries. This bug
65+
/// will be fixed on the next semver release.
6166
pub fn allow_invalid_utf8(
6267
&mut self,
6368
yes: bool,
@@ -290,7 +295,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
290295
self.push(HirFrame::Expr(try!(self.hir_dot(span))));
291296
}
292297
Ast::Assertion(ref x) => {
293-
self.push(HirFrame::Expr(self.hir_assertion(x)));
298+
self.push(HirFrame::Expr(try!(self.hir_assertion(x))));
294299
}
295300
Ast::Class(ast::Class::Perl(ref x)) => {
296301
if self.flags().unicode() {
@@ -679,10 +684,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
679684
})
680685
}
681686

682-
fn hir_assertion(&self, asst: &ast::Assertion) -> Hir {
687+
fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
683688
let unicode = self.flags().unicode();
684689
let multi_line = self.flags().multi_line();
685-
match asst.kind {
690+
Ok(match asst.kind {
686691
ast::AssertionKind::StartLine => {
687692
Hir::anchor(if multi_line {
688693
hir::Anchor::StartLine
@@ -714,10 +719,20 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
714719
Hir::word_boundary(if unicode {
715720
hir::WordBoundary::UnicodeNegate
716721
} else {
722+
// It is possible for negated ASCII word boundaries to
723+
// match at invalid UTF-8 boundaries, even when searching
724+
// valid UTF-8.
725+
//
726+
// TODO(ag): Enable this error when regex goes to 1.0.
727+
// Otherwise, it is too steep of a breaking change.
728+
// if !self.trans().allow_invalid_utf8 {
729+
// return Err(self.error(
730+
// asst.span, ErrorKind::InvalidUtf8));
731+
// }
717732
hir::WordBoundary::AsciiNegate
718733
})
719734
}
720-
}
735+
})
721736
}
722737

723738
fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
@@ -1490,7 +1505,15 @@ mod tests {
14901505
assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode));
14911506
assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate));
14921507
assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii));
1493-
assert_eq!(t(r"(?-u)\B"), hir_word(hir::WordBoundary::AsciiNegate));
1508+
assert_eq!(
1509+
t_bytes(r"(?-u)\B"),
1510+
hir_word(hir::WordBoundary::AsciiNegate));
1511+
1512+
// TODO(ag): Enable this tests when regex goes to 1.0.
1513+
// assert_eq!(t_err(r"(?-u)\B"), TestError {
1514+
// kind: hir::ErrorKind::InvalidUtf8,
1515+
// span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
1516+
// });
14941517
}
14951518

14961519
#[test]
@@ -2355,13 +2378,13 @@ mod tests {
23552378
assert!(t_bytes(r"[^a][^a]").is_always_utf8());
23562379
assert!(t_bytes(r"\b").is_always_utf8());
23572380
assert!(t_bytes(r"\B").is_always_utf8());
2381+
assert!(t_bytes(r"(?-u)\b").is_always_utf8());
23582382

23592383
// Negative examples.
23602384
assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8());
23612385
assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8());
23622386
assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8());
23632387
assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8());
2364-
assert!(!t_bytes(r"(?-u)\b").is_always_utf8());
23652388
assert!(!t_bytes(r"(?-u)\B").is_always_utf8());
23662389
}
23672390

@@ -2490,7 +2513,7 @@ mod tests {
24902513
assert!(t(r"\A").is_match_empty());
24912514
assert!(t(r"\z").is_match_empty());
24922515
assert!(t(r"\B").is_match_empty());
2493-
assert!(t(r"(?-u)\B").is_match_empty());
2516+
assert!(t_bytes(r"(?-u)\B").is_match_empty());
24942517

24952518
// Negative examples.
24962519
assert!(!t(r"a+").is_match_empty());

regex-syntax/src/parser.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@ impl ParserBuilder {
8787
/// When disabled (the default), the parser is guaranteed to produce
8888
/// an expression that will only ever match valid UTF-8 (otherwise, the
8989
/// parser will return an error).
90+
///
91+
/// Note that currently, even when invalid UTF-8 is banned, the parser
92+
/// will permit a negated ASCII word boundary (i.e., `(?-u:\B)`) even
93+
/// though it can actually match at invalid UTF-8 boundaries. This bug
94+
/// will be fixed on the next semver release.
9095
pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
9196
self.hir.allow_invalid_utf8(yes);
9297
self

0 commit comments

Comments
 (0)