Skip to content

Commit c640f31

Browse files
committed
avoid string validation in rustc_serialize, check a marker byte instead
Since the serialization format isn't self-describing, we need a way to detect when the encoder and decoder don't match up. But that doesn't have to be UTF-8 validation for strings, which costs a few percent of performance. Instead, we can use a marker byte at the end to be reasonably sure that we're dealing with a string and that it wasn't overwritten in some way.
1 parent 207c80f commit c640f31

File tree

1 file changed

+17
-4
lines changed

1 file changed

+17
-4
lines changed

compiler/rustc_serialize/src/opaque.rs

+17-4
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,13 @@ macro_rules! write_leb128 {
5555
}};
5656
}
5757

58+
/// A byte that [cannot occur in UTF-8 sequences][utf8]. Used to mark the end of a string.
59+
/// This way we can skip validation and still be relatively sure that deserialization
60+
/// did not desynchronize.
61+
///
62+
/// [utf8]: https://en.wikipedia.org/w/index.php?title=UTF-8&oldid=1058865525#Codepage_layout
63+
const STR_SENTINEL: u8 = 0xC1;
64+
5865
impl serialize::Encoder for Encoder {
5966
type Error = !;
6067

@@ -150,7 +157,8 @@ impl serialize::Encoder for Encoder {
150157
#[inline]
151158
fn emit_str(&mut self, v: &str) -> EncodeResult {
152159
self.emit_usize(v.len())?;
153-
self.emit_raw_bytes(v.as_bytes())
160+
self.emit_raw_bytes(v.as_bytes())?;
161+
self.emit_u8(STR_SENTINEL)
154162
}
155163

156164
#[inline]
@@ -502,7 +510,8 @@ impl serialize::Encoder for FileEncoder {
502510
#[inline]
503511
fn emit_str(&mut self, v: &str) -> FileEncodeResult {
504512
self.emit_usize(v.len())?;
505-
self.emit_raw_bytes(v.as_bytes())
513+
self.emit_raw_bytes(v.as_bytes())?;
514+
self.emit_u8(STR_SENTINEL)
506515
}
507516

508517
#[inline]
@@ -656,8 +665,12 @@ impl<'a> serialize::Decoder for Decoder<'a> {
656665
#[inline]
657666
fn read_str(&mut self) -> Result<Cow<'_, str>, Self::Error> {
658667
let len = self.read_usize()?;
659-
let s = std::str::from_utf8(&self.data[self.position..self.position + len]).unwrap();
660-
self.position += len;
668+
let sentinel = self.data[self.position + len];
669+
assert!(sentinel == STR_SENTINEL);
670+
let s = unsafe {
671+
std::str::from_utf8_unchecked(&self.data[self.position..self.position + len])
672+
};
673+
self.position += len + 1;
661674
Ok(Cow::Borrowed(s))
662675
}
663676

0 commit comments

Comments
 (0)