Handle formatter flags in WTF-8 OsStr Display

thaliaarchi · thaliaarchi · commit 96762a40d817 · 2025-02-06T14:21:24.000-08:00
The Display implementation for `OsStr` and `Path` on Windows (the WTF-8 version) only handles formatter flags when the entire string is valid UTF-8. As most paths are valid UTF-8, the common case is formatted like `str`; however, flags are ignored when the string contains an unpaired surrogate. Implement its Display with the same logic as that of `str`. Fixes #136617 for Windows.
diff --git a/library/core/src/fmt/mod.rs b/library/core/src/fmt/mod.rs
@@ -1513,8 +1513,11 @@ unsafe fn getcount(args: &[rt::Argument<'_>], cnt: &rt::Count) -> Option<usize>
 }
 
 /// Padding after the end of something. Returned by `Formatter::padding`.
+#[doc(hidden)]
 #[must_use = "don't forget to write the post padding"]
-pub(crate) struct PostPadding {
+#[unstable(feature = "fmt_internals", reason = "internal to standard library", issue = "none")]
+#[derive(Debug)]
+pub struct PostPadding {
     fill: char,
     padding: usize,
 }
@@ -1525,7 +1528,9 @@ impl PostPadding {
     }
 
     /// Writes this post padding.
-    pub(crate) fn write(self, f: &mut Formatter<'_>) -> Result {
+    #[doc(hidden)]
+    #[unstable(feature = "fmt_internals", reason = "internal to standard library", issue = "none")]
+    pub fn write(self, f: &mut Formatter<'_>) -> Result {
         for _ in 0..self.padding {
             f.buf.write_char(self.fill)?;
         }
@@ -1743,7 +1748,9 @@ impl<'a> Formatter<'a> {
     ///
     /// Callers are responsible for ensuring post-padding is written after the
     /// thing that is being padded.
-    pub(crate) fn padding(
+    #[doc(hidden)]
+    #[unstable(feature = "fmt_internals", reason = "internal to standard library", issue = "none")]
+    pub fn padding(
         &mut self,
         padding: usize,
         default: Alignment,
diff --git a/library/std/src/ffi/os_str/tests.rs b/library/std/src/ffi/os_str/tests.rs
@@ -105,6 +105,22 @@ fn test_os_string_join() {
     assert_eq!("a b c", strings_abc.join(OsStr::new(" ")));
 }
 
+#[test]
+fn display() {
+    let os_string = OsString::from("bcd");
+    assert_eq!(format!("a{:^10}e", os_string.display()), "a   bcd    e");
+}
+
+#[cfg(windows)]
+#[test]
+fn display_invalid_wtf8_windows() {
+    use crate::os::windows::ffi::OsStringExt;
+
+    let os_string = OsString::from_wide(&[b'b' as _, 0xD800, b'd' as _]);
+    assert_eq!(format!("a{:^10}e", os_string.display()), "a   b�d    e");
+    assert_eq!(format!("a{:^10}e", os_string.as_os_str().display()), "a   b�d    e");
+}
+
 #[test]
 fn test_os_string_default() {
     let os_string: OsString = Default::default();
diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs
@@ -587,23 +587,40 @@ impl fmt::Debug for Wtf8 {
 /// Formats the string with unpaired surrogates substituted with the replacement
 /// character, U+FFFD.
 impl fmt::Display for Wtf8 {
-    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let wtf8_bytes = &self.bytes;
-        let mut pos = 0;
-        loop {
-            match self.next_surrogate(pos) {
-                Some((surrogate_pos, _)) => {
-                    formatter.write_str(unsafe {
-                        str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
-                    })?;
-                    formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
-                    pos = surrogate_pos + 3;
-                }
-                None => {
-                    let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
-                    if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) }
-                }
-            }
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Corresponds to `Formatter::pad`, but for `Wtf8` instead of `str`.
+
+        // Make sure there's a fast path up front.
+        if f.options().get_width().is_none() && f.options().get_precision().is_none() {
+            return self.write_lossy(f);
+        }
+
+        // The `precision` field can be interpreted as a maximum width for the
+        // string being formatted.
+        let max_code_point_count = f.options().get_precision().unwrap_or(usize::MAX);
+        let mut iter = self.code_points();
+        let code_point_count = iter.by_ref().take(max_code_point_count).count();
+
+        // If our string is longer than the maximum width, truncate it and
+        // handle other flags in terms of the truncated string.
+        let byte_len = self.len() - iter.as_slice().len();
+        // SAFETY: The index is derived from the offset of `.code_points()`,
+        // which is guaranteed to be in-bounds and between character boundaries.
+        let s = unsafe { Wtf8::from_bytes_unchecked(self.bytes.get_unchecked(..byte_len)) };
+
+        // The `width` field is more of a minimum width parameter at this point.
+        if let Some(width) = f.options().get_width()
+            && code_point_count < width
+        {
+            // If we're under the minimum width, then fill up the minimum width
+            // with the specified string + some alignment.
+            let post_padding = f.padding(width - code_point_count, fmt::Alignment::Left)?;
+            s.write_lossy(f)?;
+            post_padding.write(f)
+        } else {
+            // If we're over the minimum width or there is no minimum width, we
+            // can just emit the string.
+            s.write_lossy(f)
         }
     }
 }
@@ -719,6 +736,19 @@ impl Wtf8 {
         }
     }
 
+    /// Writes the string as lossy UTF-8 like [`Wtf8::to_string_lossy`].
+    /// It ignores formatter flags.
+    fn write_lossy(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let wtf8_bytes = &self.bytes;
+        let mut pos = 0;
+        while let Some((surrogate_pos, _)) = self.next_surrogate(pos) {
+            f.write_str(unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos]) })?;
+            f.write_str(UTF8_REPLACEMENT_CHARACTER)?;
+            pos = surrogate_pos + 3;
+        }
+        f.write_str(unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) })
+    }
+
     /// Converts the WTF-8 string to potentially ill-formed UTF-16
     /// and return an iterator of 16-bit code units.
     ///
@@ -1003,6 +1033,16 @@ impl Iterator for Wtf8CodePoints<'_> {
     }
 }
 
+impl<'a> Wtf8CodePoints<'a> {
+    /// Views the underlying data as a subslice of the original data.
+    #[inline]
+    pub fn as_slice(&self) -> &Wtf8 {
+        // SAFETY: `Wtf8CodePoints` is only made from a `Wtf8`, which
+        // guarantees the iter is valid WTF-8.
+        unsafe { Wtf8::from_bytes_unchecked(self.bytes.as_slice()) }
+    }
+}
+
 /// Generates a wide character sequence for potentially ill-formed UTF-16.
 #[stable(feature = "rust1", since = "1.0.0")]
 #[derive(Clone)]
diff --git a/library/std/src/sys_common/wtf8/tests.rs b/library/std/src/sys_common/wtf8/tests.rs
@@ -748,3 +748,18 @@ fn unwobbly_wtf8_plus_utf8_is_utf8() {
     string.push_str("some utf-8");
     assert!(string.is_known_utf8);
 }
+
+#[test]
+fn display_wtf8() {
+    let string = Wtf8Buf::from_wide(&[b'b' as _, 0xD800, b'd' as _]);
+    assert!(!string.is_known_utf8);
+    assert_eq!(format!("a{:^10}e", string), "a   b�d    e");
+    assert_eq!(format!("a{:^10}e", string.as_slice()), "a   b�d    e");
+
+    let mut string = Wtf8Buf::from_str("bcd");
+    assert!(string.is_known_utf8);
+    assert_eq!(format!("a{:^10}e", string), "a   bcd    e");
+    assert_eq!(format!("a{:^10}e", string.as_slice()), "a   bcd    e");
+    string.is_known_utf8 = false;
+    assert_eq!(format!("a{:^10}e", string), "a   bcd    e");
+}
diff --git a/library/std/tests/path.rs b/library/std/tests/path.rs
@@ -1819,6 +1819,18 @@ fn test_clone_into() {
 fn display_format_flags() {
     assert_eq!(format!("a{:#<5}b", Path::new("").display()), "a#####b");
     assert_eq!(format!("a{:#<5}b", Path::new("a").display()), "aa####b");
+    assert_eq!(format!("a{:^10}e", Path::new("bcd").display()), "a   bcd    e");
+}
+
+#[cfg(windows)]
+#[test]
+fn display_invalid_wtf8_windows() {
+    use std::ffi::OsString;
+    use std::os::windows::ffi::OsStringExt;
+
+    let path_buf = PathBuf::from(OsString::from_wide(&[b'b' as _, 0xD800, b'd' as _]));
+    assert_eq!(format!("a{:^10}e", path_buf.display()), "a   b�d    e");
+    assert_eq!(format!("a{:^10}e", path_buf.as_path().display()), "a   b�d    e");
 }
 
 #[test]